How to read csv file with column containing unqouted newlines?-CodePudding

library(tidyverse)

I have this stupid csv file where someone forgot to quote the string in the last column which may contain newlines. (The 2nd row in this example doesn’t)

csv_file <- str_c(
  "a,b,c\n",
  "1,1,first\nrow\n",
  "1,1,second row\n",
  "1,1,third\nrow\n",
  collapse = ""
)

Trying to read the file with read_csv() gives rise to problems.

read_csv(csv_file)
#> Warning: One or more parsing issues, call `problems()` on your data frame for details,
#> e.g.:
#>   dat <- vroom(...)
#>   problems(dat)
#> Rows: 5 Columns: 3
#> ── Column specification ────────────────────────────────────────────────────────
#> Delimiter: ","
#> chr (2): a, c
#> dbl (1): b
#> 
#> ℹ Use `spec()` to retrieve the full column specification for this data.
#> ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#> # A tibble: 5 × 3
#>   a         b c         
#>   <chr> <dbl> <chr>     
#> 1 1         1 first     
#> 2 row      NA <NA>      
#> 3 1         1 second row
#> 4 1         1 third     
#> 5 row      NA <NA>

How can one parse the file to get the expected results?

tibble(
  a = c(1, 1, 1),
  b = c(1, 1, 1),
  c = c("first\nrow", "second row", "third\nrow")
)
#> # A tibble: 3 × 3
#>       a     b c           
#>   <dbl> <dbl> <chr>       
#> 1     1     1 "first\nrow"
#> 2     1     1 "second row"
#> 3     1     1 "third\nrow"

^{Created on 2023-01-06 with reprex v2.0.2}

CodePudding user response：

As long as the first column doesn't contain commas, one method would be to replace all newlines with a tab character, and then use gsub to replace any tab that isn't followed by another tab before the next comma. Once you have read in the data, you can easily gsub the tabs in the final column back into newlines:

read.csv(text = gsub('\t([[:alnum:]] ),', '\n\\1,', 
                     gsub('\n', '\t', csv_file))) %>%
  as_tibble() %>%
  mutate(c = gsub('\t', '\n', c))
#> # A tibble: 3 x 3
#>       a     b c             
#>   <int> <int> <chr>         
#> 1     1     1 "first\nrow"  
#> 2     1     1 "second row"  
#> 3     1     1 "third\nrow\n"

Once you have read the data in you can easily gsub the tabs in the final column into newlines

Or alternatively with stringr

csv_file |> 
  str_replace_all(
    regex("\n(\\w )(?=\n)", multiline = TRUE), "|\\1"
  ) |> 
  read_csv() |> 
  mutate(
    c = str_replace(c, "[|]", "\n")
  )

CodePudding user response：

Another option is to use readLines() to read in the data, build an index of fields where there are no commas and use that to concatenate the fields together.

txt <- readLines(textConnection(csv_file))
idx <- cumsum(grepl(",", txt))
readr::read_csv(I(tapply(txt, idx, paste, collapse = " ")), show_col_types = FALSE)

# A tibble: 3 × 3                                                                                                                                  
      a     b c         
  <dbl> <dbl> <chr>     
1     1     1 first row 
2     1     1 second row
3     1     1 third row

CodePudding user response：

This is what I came up with. It's not as elegant as the other proposed solutions, but I figured that the approach might come in handy for other similar type problems.

library(meltr)

I discovered that the meltr package might come in handy here since it allows us to parse the file as is, and returns a tabular format that can be used to fix the issues

melt_csv(csv_file)
#> # A tibble: 14 × 4
#>      row   col data_type value     
#>    <dbl> <dbl> <chr>     <chr>     
#>  1     1     1 character a         
#>  2     1     2 character b         
#>  3     1     3 character c         
#>  4     2     1 integer   1         
#>  5     2     2 integer   1         
#>  6     2     3 character first     
#>  7     3     1 character row       
#>  8     4     1 integer   1         
#>  9     4     2 integer   1         
#> 10     4     3 character second row
#> 11     5     1 integer   1         
#> 12     5     2 integer   1         
#> 13     5     3 character third     
#> 14     6     1 character row

First, I find the rows where there are issues

(step1 <- melt_csv(csv_file) |> 
  add_row(col = 1) |>  # add dummy 
  mutate(
    bad = (col == 1) & lead(col == 1)
  ) |> 
  filter(!is.na(bad))  # remove dummy
)
#> # A tibble: 14 × 5
#>      row   col data_type value      bad  
#>    <dbl> <dbl> <chr>     <chr>      <lgl>
#>  1     1     1 character a          FALSE
#>  2     1     2 character b          FALSE
#>  3     1     3 character c          FALSE
#>  4     2     1 integer   1          FALSE
#>  5     2     2 integer   1          FALSE
#>  6     2     3 character first      FALSE
#>  7     3     1 character row        TRUE 
#>  8     4     1 integer   1          FALSE
#>  9     4     2 integer   1          FALSE
#> 10     4     3 character second row FALSE
#> 11     5     1 integer   1          FALSE
#> 12     5     2 integer   1          FALSE
#> 13     5     3 character third      FALSE
#> 14     6     1 character row        TRUE

Then I adjust row and col, and combine the strings that were split on multiple rows.

(step2 <- step1 |> 
  mutate(
    row = if_else(bad, lag(row), row),
    col = if_else(bad, lag(col), col)
  ) |> 
  group_by(row, col) |> 
  summarise(
    value = str_c(value, collapse = "\n"), .groups = "drop"
  )
)
#> # A tibble: 12 × 3
#>      row   col value       
#>    <dbl> <dbl> <chr>       
#>  1     1     1 "a"         
#>  2     1     2 "b"         
#>  3     1     3 "c"         
#>  4     2     1 "1"         
#>  5     2     2 "1"         
#>  6     2     3 "first\nrow"
#>  7     4     1 "1"         
#>  8     4     2 "1"         
#>  9     4     3 "second row"
#> 10     5     1 "1"         
#> 11     5     2 "1"         
#> 12     5     3 "third\nrow"

The final step is used to fish out the column names from the first row, and finally pivot the data to the “wide” format.

(step3 <- step2 |> 
  mutate(
    col = value[col]
  ) |> 
  filter(row != 1) |> 
  pivot_wider(
    names_from = col, values_from = value
  ) |> 
  select(-row)
)
#> # A tibble: 3 × 3
#>   a     b     c           
#>   <chr> <chr> <chr>       
#> 1 1     1     "first\nrow"
#> 2 1     1     "second row"
#> 3 1     1     "third\nrow"