Import csv with json inside in Pandas


I'm trying to import with Pandas a CSV file that looks like the following:

First line:

LOAD_ID,CHAIN_ID,BLOCK,TIMESTAMP,TX_HASH,CALL_ID,CALL_TYPE,FROM_ADDRESS,FROM_NAME,TO_ADDRESS,TO_NAME,FUNCTION_SIGNATURE,FUNCTION_NAME,VALUE,ARGUMENTS,RAW_ARGUMENTS,OUTPUTS,RAW_OUTPUTS,GAS_USED,ERROR,STATUS,ORDER_INDEX,DECODING_STATUS,STORAGE_ADDRESS

Other lines:

2022-05-09 14:57:50.000,mainnet,12023665,2021-03-12 12:27:23.000,0x2c6006b6a82b58574fc9d3b60455d343d976f03ae7dd2dd7dc064139315c2755,0_3,call,0x19a8ed4860007a66805782ed7e0bed4e44fc6717,Marketplace,0xf87e31492faf9a91b02ee0deaad50d51d56d5d4d,LAND,0xe985e9c5,isApprovedForAll,0.00000000,{"operator":"0x8e5660b4ab70168b5a6feea0e0315cb49c8cd539"\,"owner":"0x821d9a3f2ea71e19c862218f4b041fa5a2827490"},[{"name":"owner"\,"raw":"0x000000000000000000000000821d9a3f2ea71e19c862218f4b041fa5a2827490"\,"type":"address"}\,{"name":"operator"\,"raw":"0x0000000000000000000000008e5660b4ab70168b5a6feea0e0315cb49c8cd539"\,"type":"address"}],{"__out0":"True"},[{"name":""\,"raw":"0x0000000000000000000000000000000000000000000000000000000000000001"\,"type":"bool"}],6288,\\N,true,4855,true,0xf87e31492faf9a91b02ee0deaad50d51d56d5d4d

I've tried df = pd.read_csv(f, sep=",", engine='python'), but it keeps raising the error ',' expected after '"'.

I believe that the part with JSON-like data is causing the error but I don't know how to fix it.

UPDATE:

I added engine='python' to avoid the error: Error tokenizing data. C error: Expected 28 fields in line 5, saw 32.

The full trace of the error is:

File c:\Users\Utente\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\io\parsers\python_parser.py:760, in PythonParser._next_iter_line(self, row_num)
    759 assert self.data is not None
--> 760 line = next(self.data)
    761 # for mypy

Error: ',' expected after '"'

During handling of the above exception, another exception occurred:

ParserError                               Traceback (most recent call last)
c:\Users\Utente\Desktop\pm-decentraland\scripts\traces_internal_execution.ipynb Cell 1' in <cell line: 12>()
      3 import pandas as pd
      6 # path = r'C:\Users\Utente\Desktop\sf_csv'
      7 # all_files = glob.glob(os.path.join(path, "*.csv"))
      8 
      9 # df = pd.concat((pd.read_csv(f,  sep=',', quotechar='"', error_bad_lines=False)
     10 #                for f in all_files))
---> 12 df = pd.read_csv(r'C:\Users\Utente\Desktop\sf_csv\output_files_0_0_0.csv', sep=",", engine='python')
     14 print(df)

File c:\Users\Utente\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\util\_decorators.py:311, in deprecate_nonkeyword_arguments.<locals>.decorate.<locals>.wrapper(*args, **kwargs)
    305 if len(args) > num_allow_args:
    306     warnings.warn(
    307         msg.format(arguments=arguments),
    308         FutureWarning,
    309         stacklevel=stacklevel,
    310     )
--> 311 return func(*args, **kwargs)

File c:\Users\Utente\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\io\parsers\readers.py:680, in read_csv(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, error_bad_lines, warn_bad_lines, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options)
    665 kwds_defaults = _refine_defaults_read(
    666     dialect,
    667     delimiter,
   (...)
    676     defaults={"delimiter": ","},
    677 )
    678 kwds.update(kwds_defaults)
--> 680 return _read(filepath_or_buffer, kwds)

File c:\Users\Utente\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\io\parsers\readers.py:575, in _read(filepath_or_buffer, kwds)
    572 _validate_names(kwds.get("names", None))
    574 # Create the parser.
--> 575 parser = TextFileReader(filepath_or_buffer, **kwds)
    577 if chunksize or iterator:
    578     return parser

File c:\Users\Utente\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\io\parsers\readers.py:933, in TextFileReader.__init__(self, f, engine, **kwds)
    930     self.options["has_index_names"] = kwds["has_index_names"]
    932 self.handles: IOHandles | None = None
--> 933 self._engine = self._make_engine(f, self.engine)

File c:\Users\Utente\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\io\parsers\readers.py:1235, in TextFileReader._make_engine(self, f, engine)
   1232     raise ValueError(msg)
   1234 try:
-> 1235     return mapping[engine](f, **self.options)
   1236 except Exception:
   1237     if self.handles is not None:

File c:\Users\Utente\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\io\parsers\python_parser.py:138, in PythonParser.__init__(self, f, **kwds)
    134 # needs to be cleaned/refactored
    135 # multiple date column thing turning into a real spaghetti factory
    137 if not self._has_complex_date_col:
--> 138     (index_names, self.orig_names, self.columns) = self._get_index_name(
    139         self.columns
    140     )
    141     self._name_processed = True
    142     if self.index_names is None:

File c:\Users\Utente\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\io\parsers\python_parser.py:899, in PythonParser._get_index_name(self, columns)
    897 else:
    898     try:
--> 899         line = self._next_line()
    900     except StopIteration:
    901         line = None

File c:\Users\Utente\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\io\parsers\python_parser.py:696, in PythonParser._next_line(self)
    693     next(self.data)
    695 while True:
--> 696     orig_line = self._next_iter_line(row_num=self.pos + 1)
    697     self.pos += 1
    699     if orig_line is not None:

File c:\Users\Utente\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\io\parsers\python_parser.py:789, in PythonParser._next_iter_line(self, row_num)
    780         reason = (
    781             "Error could possibly be due to "
    782             "parsing errors in the skipped footer rows "
   (...)
    785             "all rows)."
    786         )
    787     msg += ". " + reason
--> 789     self._alert_malformed(msg, row_num)
    790 return None

File c:\Users\Utente\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\io\parsers\python_parser.py:739, in PythonParser._alert_malformed(self, msg, row_num)
    722 """
    723 Alert a user about a malformed row, depending on value of
    724 `self.on_bad_lines` enum.
   (...)
    736     even though we 0-index internally.
    737 """
    738 if self.on_bad_lines == self.BadLineHandleMethod.ERROR:
--> 739     raise ParserError(msg)
    740 elif self.on_bad_lines == self.BadLineHandleMethod.WARN:
    741     base = f"Skipping line {row_num}: "

ParserError: ',' expected after '"'

CodePudding user response:

Adding escapechar='\\' solves the problem and captures all 24 columns.

df = pd.read_csv(f, sep=",", engine="python", escapechar='\\')
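The idea can be sketched on a small inline sample (hypothetical data, mirroring the file's \,-escaped commas inside the JSON field): with escapechar='\\', the parser treats each \, as a literal comma inside the field instead of a delimiter.

```python
import io
import pandas as pd

# Two columns; column B holds a JSON object whose commas are
# escaped with a backslash, as in the original file.
data = 'A,B\n1,{"x":"1"\\,"y":"2"}\n'
df = pd.read_csv(io.StringIO(data), sep=",", engine="python", escapechar="\\")
print(df.shape)    # (1, 2)
print(df["B"][0])  # {"x":"1","y":"2"}
```

Note that the escape character itself is stripped during parsing, so the field comes back as plain JSON text.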

CodePudding user response:

If you see an error like this:

ParserError: Expected 28 fields in line 5, saw 32

It means that pandas first read the header row of the CSV and found 27 separators, so it built a table schema of 28 columns.

On the line that triggers the error, pandas found 31 separators (32 fields) and could not match them to the 28-column schema it had created.
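That mismatch is easy to reproduce with toy data (hypothetical values): the header implies a fixed column count, and any row with extra fields makes the tokenizer raise.

```python
import io
import pandas as pd

# Header row implies 3 columns, but the last data row has 4 fields,
# so read_csv raises a ParserError while tokenizing.
bad = "a,b,c\n1,2,3\n4,5,6,7\n"
try:
    pd.read_csv(io.StringIO(bad))
    failed = False
except pd.errors.ParserError as exc:
    failed = True
    print(exc)  # e.g. "Expected 3 fields in line 3, saw 4"
```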

The problem here is that the commas inside your JSON strings are being treated as delimiters. You should modify the input data (if you don't have direct access to the file, you can always read the contents into a list of strings with open first).

Here are a few modification options that you can try:

Option 1: Quote the JSON strings with single quotes

Use a single quote (or another character that doesn't otherwise appear in your data) as the quote character around your JSON strings.

2022-05-09 14:57:50.000,mainnet,12023665,2021-03-12 12:27:23.000,0x2c6006b6a82b58574fc9d3b60455d343d976f03ae7dd2dd7dc064139315c2755,0_3,call,0x19a8ed4860007a66805782ed7e0bed4e44fc6717,Marketplace,0xf87e31492faf9a91b02ee0deaad50d51d56d5d4d,LAND,0xe985e9c5,isApprovedForAll,0.00000000,'{"operator":"0x8e5660b4ab70168b5a6feea0e0315cb49c8cd539"\,"owner":"0x821d9a3f2ea71e19c862218f4b041fa5a2827490"}','[{"name":"owner"\,"raw":"0x000000000000000000000000821d9a3f2ea71e19c862218f4b041fa5a2827490"\,"type":"address"}\,{"name":"operator"\,"raw":"0x0000000000000000000000008e5660b4ab70168b5a6feea0e0315cb49c8cd539"\,"type":"address"}]',{"__out0":"True"},'[{"name":""\,"raw":"0x0000000000000000000000000000000000000000000000000000000000000001"\,"type":"bool"}]',6288,\\N,true,4855,true,0xf87e31492faf9a91b02ee0deaad50d51d56d5d4d

Then pass quotechar="'" when you read the data:

df = pd.read_csv('f.csv', quotechar="'", sep=",")

Option 2 (recommended): use literal_eval

Specify the JSON-like columns by their header names in the converters argument of pandas's CSV reader:

from ast import literal_eval
import pandas as pd
df = pd.read_csv('f.csv', converters={'ARGUMENTS': literal_eval, 'RAW_ARGUMENTS': literal_eval, 'RAW_OUTPUTS': literal_eval}, quotechar="'", sep=",")