Loading CSV in Julia fails - CodePudding

I am trying to read a large CSV file into a DataFrame in Julia with CSV.read(file, DataFrame) and receive the following error:

ERROR: TaskFailedException

    nested task error: BoundsError: attempt to access 308-element Vector{BigFloat} at index [0]
    Stacktrace:
      [1] getindex
        @ .\array.jl:861 [inlined]
      [2] _scale(#unused#::Type{Float64}, v::BigInt, exp::Int64, neg::Bool)
        @ Parsers C:\Users\mazoi\.julia\packages\Parsers\KmPKe\src\floats.jl:524
      [3] scale(#unused#::Type{Float64}, v::BigInt, exp::Int64, neg::Bool)
        @ Parsers C:\Users\mazoi\.julia\packages\Parsers\KmPKe\src\floats.jl:408
      [4] parseexp
        @ C:\Users\mazoi\.julia\packages\Parsers\KmPKe\src\floats.jl:356 [inlined]
      [5] parsefrac
        @ C:\Users\mazoi\.julia\packages\Parsers\KmPKe\src\floats.jl:320 [inlined]
      [6] parsedigits
        @ C:\Users\mazoi\.julia\packages\Parsers\KmPKe\src\floats.jl:251 [inlined]
      [7] _parsedigits(#unused#::Type{Float64}, source::Vector{UInt8}, pos::Int64, len::Int64, b::UInt8, code::Int16, options::Parsers.Options, digits::BigInt, neg::Bool, startpos::Int64)
        @ Parsers C:\Users\mazoi\.julia\packages\Parsers\KmPKe\src\floats.jl:186
      [8] parsedigits
        @ C:\Users\mazoi\.julia\packages\Parsers\KmPKe\src\floats.jl:210 [inlined]
      [9] _parsedigits(#unused#::Type{Float64}, source::Vector{UInt8}, pos::Int64, len::Int64, b::UInt8, code::Int16, options::Parsers.Options, digits::UInt128, neg::Bool, startpos::Int64)
        @ Parsers C:\Users\mazoi\.julia\packages\Parsers\KmPKe\src\floats.jl:186
     [10] parsedigits
        @ C:\Users\mazoi\.julia\packages\Parsers\KmPKe\src\floats.jl:210 [inlined]
     [11] typeparser
        @ C:\Users\mazoi\.julia\packages\Parsers\KmPKe\src\floats.jl:179 [inlined]
     [12] xparse(::Type{Float64}, source::Vector{UInt8}, pos::Int64, len::Int64, options::Parsers.Options, ::Type{Float64})
        @ Parsers C:\Users\mazoi\.julia\packages\Parsers\KmPKe\src\Parsers.jl:316
     [13] xparse
        @ C:\Users\mazoi\.julia\packages\Parsers\KmPKe\src\Parsers.jl:266 [inlined]
     [14] detect
        @ C:\Users\mazoi\.julia\packages\CSV\jFiCn\src\utils.jl:470 [inlined]
     [15] detect
        @ C:\Users\mazoi\.julia\packages\CSV\jFiCn\src\utils.jl:459 [inlined]
     [16] findchunkrowstart(ranges::Vector{Int64}, i::Int64, buf::Vector{UInt8}, opts::Parsers.Options, typemap::Dict{Type, Type}, downcast::Bool, ncols::Int64, rows_to_check::Int64, columns::Vector{CSV.Column}, origcoltypes::Vector{Type}, columnlock::ReentrantLock, stringtype::Any, totalbytes::Base.Threads.Atomic{Int64}, totalrows::Base.Threads.Atomic{Int64}, succeeded::Base.Threads.Atomic{Bool})
        @ CSV C:\Users\mazoi\.julia\packages\CSV\jFiCn\src\detection.jl:383
     [17] macro expansion
        @ C:\Users\mazoi\.julia\packages\CSV\jFiCn\src\detection.jl:470 [inlined]
     [18] (::CSV.var"#16#17"{Vector{UInt8}, Parsers.Options, Vector{Int64}, Int64, Vector{CSV.Column}, DataType, Dict{Type, Type}, Bool, Int64, Vector{Type}, ReentrantLock, Base.Threads.Atomic{Bool}, Base.Threads.Atomic{Int64}, Base.Threads.Atomic{Int64}, Int64})()
        @ CSV .\threadingconstructs.jl:178
Stacktrace:
 [1] sync_end(c::Channel{Any})
   @ Base .\task.jl:381
 [2] macro expansion
   @ .\task.jl:400 [inlined]
 [3] findrowstarts!(buf::Vector{UInt8}, opts::Parsers.Options, ranges::Vector{Int64}, ncols::Int64, columns::Vector{CSV.Column}, stringtype::Any, typemap::Dict{Type, Type}, downcast::Bool, rows_to_check::Int64)
   @ CSV C:\Users\mazoi\.julia\packages\CSV\jFiCn\src\detection.jl:468
 [4] CSV.Context(source::CSV.Arg, header::CSV.Arg, normalizenames::CSV.Arg, datarow::CSV.Arg, skipto::CSV.Arg, footerskip::CSV.Arg, transpose::CSV.Arg, comment::CSV.Arg, ignoreemptyrows::CSV.Arg, ignoreemptylines::CSV.Arg, select::CSV.Arg, drop::CSV.Arg, limit::CSV.Arg, buffer_in_memory::CSV.Arg, threaded::CSV.Arg, ntasks::CSV.Arg, tasks::CSV.Arg, rows_to_check::CSV.Arg, lines_to_check::CSV.Arg, missingstrings::CSV.Arg, missingstring::CSV.Arg, delim::CSV.Arg, ignorerepeated::CSV.Arg, quoted::CSV.Arg, quotechar::CSV.Arg, openquotechar::CSV.Arg, closequotechar::CSV.Arg, escapechar::CSV.Arg, dateformat::CSV.Arg, dateformats::CSV.Arg, decimal::CSV.Arg, truestrings::CSV.Arg, falsestrings::CSV.Arg, stripwhitespace::CSV.Arg, type::CSV.Arg, types::CSV.Arg, typemap::CSV.Arg, pool::CSV.Arg, downcast::CSV.Arg, lazystrings::CSV.Arg, stringtype::CSV.Arg, strict::CSV.Arg, silencewarnings::CSV.Arg, maxwarnings::CSV.Arg, debug::CSV.Arg, parsingdebug::CSV.Arg, validate::CSV.Arg, streaming::CSV.Arg)
   @ CSV C:\Users\mazoi\.julia\packages\CSV\jFiCn\src\context.jl:608
 [5] #File#25
   @ C:\Users\mazoi\.julia\packages\CSV\jFiCn\src\file.jl:221 [inlined]
 [6] CSV.File(source::String)
   @ CSV C:\Users\mazoi\.julia\packages\CSV\jFiCn\src\file.jl:221
 [7] top-level scope
   @ REPL[16]:1

I think I can trace the error to some rows whose values do not match the type of the rest of their column. I can read the data in Python, where the problematic columns show \\N.

I tried to work around the issue by specifying escapechar='\\', but it didn't help. Is there a way to get around these problematic rows, or is there a more robust package than CSV.jl? Thanks!

Update: added the full stack trace of the error.

CodePudding user response:

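If your file has fields that start with several digits (e.g. SHA digest values), this may be due to a known bug in float parsing during type detection: the stack trace above fails in Parsers' _scale while detect is trying to parse a field as Float64. If so, a fix has been added to CSV.jl, but isn't available in a release yet.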

In any case, the error occurs while CSV.jl locates chunk boundaries for multithreaded processing of the file (findchunkrowstart in the stack trace), so you can avoid it by passing ntasks = 1 to the call, e.g. CSV.read(file, DataFrame; ntasks = 1).