I am trying to read four .txt files delimited by |. As one of them is over 1 GB, I found the 'chunk' method to read them, but I am getting Error tokenizing data. C error: out of memory on the line

df_tradeCash_mhi = pd.concat(chunk_read(mhi_tradeCashFiles, "MHI"))

Does anyone know how I can solve this problem? Below is my code:
def findmefile(directory, containsInFilename):
    # Map each entity (e.g. "MHI") to its matching file name in the directory
    entity_filenames = {}
    for file in os.listdir(directory):
        if containsInFilename in file:
            if file[:5] == "Trade":
                entity_filenames["MHI"] = file
            else:
                entity_filenames[re.findall("(.*?)_", file)[0]] = file
    return entity_filenames
# Get the core Murex file names
mhi_tradeFiles = findmefile(CoreMurexFilesLoc, "Trade")
mhi_tradeCashFiles = findmefile(CoreMurexFilesLoc, "TradeCash_")
mheu_tradeFiles = findmefile(CoreMurexFilesLoc, "MHEU")
mheu_tradeCashFiles = findmefile(CoreMurexFilesLoc, "MHEU_TradeCash")
# Read the csv in chunks
mylist = []
size = 10**2

def chunk_read(fileName, entity):
    for chunk in pd.read_csv(
        CoreMurexFilesLoc + "\\" + fileName[entity],
        delimiter="|",
        low_memory=False,
        chunksize=size,
    ):
        mylist.append(chunk)
    return mylist
df_trade_mhi = pd.concat(chunk_read(mhi_tradeFiles, "MHI"))
df_trade_mheu = pd.concat(chunk_read(mheu_tradeFiles, "MHEU"))
df_tradeCash_mheu = pd.concat(chunk_read(mheu_tradeCashFiles, "MHEU"))
df_tradeCash_mhi = pd.concat(chunk_read(mhi_tradeCashFiles, "MHI"))
df_trades = pd.concat(
[df_trade_mheu, df_trade_mhi, df_tradeCash_mheu, df_tradeCash_mhi]
)
del df_trade_mhi
del df_tradeCash_mhi
del df_trade_mheu
del df_tradeCash_mheu
# Drop any blank fields and duplicates
nan_value = float("NaN")
df_trades.replace("", nan_value, inplace=True)
df_trades.dropna(subset=["MurexCounterpartyRef"], inplace=True)
df_trades.drop_duplicates(subset=["MurexCounterpartyRef"], inplace=True)
counterpartiesList = df_trades["MurexCounterpartyRef"].tolist()
print(colored('All Core Murex trade and tradeCash data loaded.', "green"))
Error:
Traceback (most recent call last):
File "h:\DESKTOP\test_check\check_securityPrices.py", line 52, in <module>
df_tradeCash_mhi = pd.concat(chunk_read(mhi_tradeCashFiles, "MHI"))
File "h:\DESKTOP\test_check\check_securityPrices.py", line 39, in chunk_read
for chunk in pd.read_csv(
File "C:\Users\MIRABR\AppData\Local\Programs\Python\Python39\lib\site-packages\pandas\io\parsers\readers.py", line 1024, in __next__
return self.get_chunk()
File "C:\Users\MIRABR\AppData\Local\Programs\Python\Python39\lib\site-packages\pandas\io\parsers\readers.py", line 1074, in get_chunk
return self.read(nrows=size)
File "C:\Users\MIRABR\AppData\Local\Programs\Python\Python39\lib\site-packages\pandas\io\parsers\readers.py", line 1047, in read
index, columns, col_dict = self._engine.read(nrows)
File "C:\Users\MIRABR\AppData\Local\Programs\Python\Python39\lib\site-packages\pandas\io\parsers\c_parser_wrapper.py", line 228, in read
data = self._reader.read(nrows)
File "pandas\_libs\parsers.pyx", line 783, in pandas._libs.parsers.TextReader.read
File "pandas\_libs\parsers.pyx", line 857, in pandas._libs.parsers.TextReader._read_rows
File "pandas\_libs\parsers.pyx", line 843, in pandas._libs.parsers.TextReader._tokenize_rows
File "pandas\_libs\parsers.pyx", line 1925, in pandas._libs.parsers.raise_parser_error
pandas.errors.ParserError: Error tokenizing data. C error: out of memory
CodePudding user response:
I think the problem is clear: you're running out of memory because you're trying to load all of the data into memory at once and then process it. Reading in chunks only helps if you process and discard each chunk as it arrives; chunk_read appends every chunk to the module-level mylist (which is never cleared between calls) and pd.concat then rebuilds the full DataFrames, so by the fourth call you are holding all four files in memory plus the concatenated copies.
You need to either:
- get a machine with more memory.
- re-architect the solution as a generator/coroutine pipeline that processes the data stepwise, chunk by chunk, instead of accumulating it.
The problem with the first approach is that it won't scale indefinitely and is expensive. The second way is the right way to do it, but needs more coding; a sketch is given below.
As a good reference on generator/coroutine pipeline approaches, check out any of the PyCon talks by David Beazley.
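For example, here is a minimal sketch of that idea, assuming the only thing you ultimately need from the four files is the list of unique MurexCounterpartyRef values (your counterpartiesList) and that every file has that column; it reuses CoreMurexFilesLoc and the file-name dicts from your code. Each chunk is reduced to a set of refs and then discarded, so the full DataFrames are never materialised:

import os
import pandas as pd

def read_chunks(directory, filename, chunksize=10**5):
    # Generator: yields one chunk at a time; nothing is accumulated here.
    path = os.path.join(directory, filename)
    for chunk in pd.read_csv(path, delimiter="|", chunksize=chunksize,
                             usecols=["MurexCounterpartyRef"]):
        yield chunk

def collect_refs(chunks):
    # Consume the stream chunk by chunk; only the set of refs stays in memory.
    refs = set()
    for chunk in chunks:
        col = chunk["MurexCounterpartyRef"].replace("", pd.NA).dropna()
        refs.update(col)
    return refs

counterparties = set()
for entity, files in [("MHI", mhi_tradeFiles), ("MHEU", mheu_tradeFiles),
                      ("MHEU", mheu_tradeCashFiles), ("MHI", mhi_tradeCashFiles)]:
    counterparties |= collect_refs(read_chunks(CoreMurexFilesLoc, files[entity]))
counterpartiesList = list(counterparties)

With this pattern the chunksize mainly trades parsing overhead against peak memory, so something larger than your 10**2 (e.g. 10**5 rows) is usually a better starting point.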