I'm trying to import with Pandas a CSV file that looks like the following:
First line:
LOAD_ID,CHAIN_ID,BLOCK,TIMESTAMP,TX_HASH,CALL_ID,CALL_TYPE,FROM_ADDRESS,FROM_NAME,TO_ADDRESS,TO_NAME,FUNCTION_SIGNATURE,FUNCTION_NAME,VALUE,ARGUMENTS,RAW_ARGUMENTS,OUTPUTS,RAW_OUTPUTS,GAS_USED,ERROR,STATUS,ORDER_INDEX,DECODING_STATUS,STORAGE_ADDRESS
Other lines:
2022-05-09 14:57:50.000,mainnet,12023665,2021-03-12 12:27:23.000,0x2c6006b6a82b58574fc9d3b60455d343d976f03ae7dd2dd7dc064139315c2755,0_3,call,0x19a8ed4860007a66805782ed7e0bed4e44fc6717,Marketplace,0xf87e31492faf9a91b02ee0deaad50d51d56d5d4d,LAND,0xe985e9c5,isApprovedForAll,0.00000000,{"operator":"0x8e5660b4ab70168b5a6feea0e0315cb49c8cd539"\,"owner":"0x821d9a3f2ea71e19c862218f4b041fa5a2827490"},[{"name":"owner"\,"raw":"0x000000000000000000000000821d9a3f2ea71e19c862218f4b041fa5a2827490"\,"type":"address"}\,{"name":"operator"\,"raw":"0x0000000000000000000000008e5660b4ab70168b5a6feea0e0315cb49c8cd539"\,"type":"address"}],{"__out0":"True"},[{"name":""\,"raw":"0x0000000000000000000000000000000000000000000000000000000000000001"\,"type":"bool"}],6288,\\N,true,4855,true,0xf87e31492faf9a91b02ee0deaad50d51d56d5d4d
I've tried with df = pd.read_csv(f, sep=",", engine='python')
but it continues to give the error ',' expected after '"'
.
I believe that the part with JSON-like data is causing the error but I don't know how to fix it.
UPDATE:
I added engine=python
to avoid the error: Error tokenizing data. C error: Expected 28 fields in line 5, saw 32
.
The full trace of the error is:
File c:\Users\Utente\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\io\parsers\python_parser.py:760, in PythonParser._next_iter_line(self, row_num)
759 assert self.data is not None
--> 760 line = next(self.data)
761 # for mypy
Error: ',' expected after '"'
During handling of the above exception, another exception occurred:
ParserError Traceback (most recent call last)
c:\Users\Utente\Desktop\pm-decentraland\scripts\traces_internal_execution.ipynb Cell 1' in <cell line: 12>()
3 import pandas as pd
6 # path = r'C:\Users\Utente\Desktop\sf_csv'
7 # all_files = glob.glob(os.path.join(path, "*.csv"))
8
9 # df = pd.concat((pd.read_csv(f, sep=',', quotechar='"', error_bad_lines=False)
10 # for f in all_files))
---> 12 df = pd.read_csv(r'C:\Users\Utente\Desktop\sf_csv\output_files_0_0_0.csv', sep=",", engine='python')
14 print(df)
File c:\Users\Utente\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\util\_decorators.py:311, in deprecate_nonkeyword_arguments.<locals>.decorate.<locals>.wrapper(*args, **kwargs)
305 if len(args) > num_allow_args:
306 warnings.warn(
307 msg.format(arguments=arguments),
308 FutureWarning,
309 stacklevel=stacklevel,
310 )
--> 311 return func(*args, **kwargs)
File c:\Users\Utente\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\io\parsers\readers.py:680, in read_csv(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, error_bad_lines, warn_bad_lines, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options)
665 kwds_defaults = _refine_defaults_read(
666 dialect,
667 delimiter,
(...)
676 defaults={"delimiter": ","},
677 )
678 kwds.update(kwds_defaults)
--> 680 return _read(filepath_or_buffer, kwds)
File c:\Users\Utente\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\io\parsers\readers.py:575, in _read(filepath_or_buffer, kwds)
572 _validate_names(kwds.get("names", None))
574 # Create the parser.
--> 575 parser = TextFileReader(filepath_or_buffer, **kwds)
577 if chunksize or iterator:
578 return parser
File c:\Users\Utente\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\io\parsers\readers.py:933, in TextFileReader.__init__(self, f, engine, **kwds)
930 self.options["has_index_names"] = kwds["has_index_names"]
932 self.handles: IOHandles | None = None
--> 933 self._engine = self._make_engine(f, self.engine)
File c:\Users\Utente\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\io\parsers\readers.py:1235, in TextFileReader._make_engine(self, f, engine)
1232 raise ValueError(msg)
1234 try:
-> 1235 return mapping[engine](f, **self.options)
1236 except Exception:
1237 if self.handles is not None:
File c:\Users\Utente\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\io\parsers\python_parser.py:138, in PythonParser.__init__(self, f, **kwds)
134 # needs to be cleaned/refactored
135 # multiple date column thing turning into a real spaghetti factory
137 if not self._has_complex_date_col:
--> 138 (index_names, self.orig_names, self.columns) = self._get_index_name(
139 self.columns
140 )
141 self._name_processed = True
142 if self.index_names is None:
File c:\Users\Utente\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\io\parsers\python_parser.py:899, in PythonParser._get_index_name(self, columns)
897 else:
898 try:
--> 899 line = self._next_line()
900 except StopIteration:
901 line = None
File c:\Users\Utente\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\io\parsers\python_parser.py:696, in PythonParser._next_line(self)
693 next(self.data)
695 while True:
--> 696 orig_line = self._next_iter_line(row_num=self.pos + 1)
697 self.pos += 1
699 if orig_line is not None:
File c:\Users\Utente\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\io\parsers\python_parser.py:789, in PythonParser._next_iter_line(self, row_num)
780 reason = (
781 "Error could possibly be due to "
782 "parsing errors in the skipped footer rows "
(...)
785 "all rows)."
786 )
787 msg = ". " + reason
--> 789 self._alert_malformed(msg, row_num)
790 return None
File c:\Users\Utente\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\io\parsers\python_parser.py:739, in PythonParser._alert_malformed(self, msg, row_num)
722 """
723 Alert a user about a malformed row, depending on value of
724 `self.on_bad_lines` enum.
(...)
736 even though we 0-index internally.
737 """
738 if self.on_bad_lines == self.BadLineHandleMethod.ERROR:
--> 739 raise ParserError(msg)
740 elif self.on_bad_lines == self.BadLineHandleMethod.WARN:
741 base = f"Skipping line {row_num}: "
ParserError: ',' expected after '"'
CodePudding user response:
Adding escapechar='\\'
solves the problem and correctly captures all 24 columns.
df = pd.read_csv(f, sep=",", engine="python", escapechar='\\')
CodePudding user response:
if you see error like this :
ParserError: Expected 28 fields in line 5, saw 32
it means that, in its first step, pandas read the first row of the CSV and found 27 separators, so it built a table schema of 28 columns.
In the line that triggered the error, pandas found a different number of separators (32 fields instead of 28) and couldn't match that row to the schema it had already created.
The problem here is that the commas inside your json string are being treated as delimiters. You should modify the input data (if you don't have direct access to the file, you can always read the contents into a list of strings using open first).
Here are a few modification options that you can try:
Option 1: Quote json string with single quote
Use a single quote (or another character that doesn't otherwise appear in your data) as a quote character for your json string.
2022-05-09 14:57:50.000,mainnet,12023665,2021-03-12 12:27:23.000,0x2c6006b6a82b58574fc9d3b60455d343d976f03ae7dd2dd7dc064139315c2755,0_3,call,0x19a8ed4860007a66805782ed7e0bed4e44fc6717,Marketplace,0xf87e31492faf9a91b02ee0deaad50d51d56d5d4d,LAND,0xe985e9c5,isApprovedForAll,0.00000000,'{"operator":"0x8e5660b4ab70168b5a6feea0e0315cb49c8cd539"\,"owner":"0x821d9a3f2ea71e19c862218f4b041fa5a2827490"}','[{"name":"owner"\,"raw":"0x000000000000000000000000821d9a3f2ea71e19c862218f4b041fa5a2827490"\,"type":"address"}\,{"name":"operator"\,"raw":"0x0000000000000000000000008e5660b4ab70168b5a6feea0e0315cb49c8cd539"\,"type":"address"}]',{"__out0":"True"},'[{"name":""\,"raw":"0x0000000000000000000000000000000000000000000000000000000000000001"\,"type":"bool"}]',6288,\\N,true,4855,true,0xf87e31492faf9a91b02ee0deaad50d51d56d5d4d
Then use quotechar="'" when you read the data
df=pd.read_csv('f.csv', quotechar="'", sep=",")
Option 2 (recommended) : you can use literal_eval
You have to specify the JSON-like columns, identified by their headers, in the `converters` argument of pandas's CSV reader.
from ast import literal_eval
import pandas as pd
df = pd.read_csv('f.csv', converters={'ARGUMENTS': literal_eval , 'RAW_ARGUMENTS': literal_eval ,'RAW_OUTPUTS': literal_eval },quotechar="'", sep=",")