I want to convert this corpus hu.txt.xz 15GB which becomes around 60GB after unpacking to small versions of text files, each file with less than 1GB or 100000 lines
The expected output:
| siplit_1.txt
| siplit_2.txt
| siplit_3.txt
.....
| siplit_n.txt
I have this script on a local machine but doesn't work it just loads without process because bigdata as I think :
import fun
import sys
import os
import shutil
# //-----------------------
# Retrieve and return output file max lines from input
def how_many_lines_per_file():
try:
return int(input("Max lines per output file: "))
except ValueError:
print("Error: Please use a valid number.")
sys.exit(1)
# //-----------------------
# Retrieve input filename and return file pointer
def file_dir():
try:
filename = input("Input filename: ")
return open(filename, 'r')
except FileNotFoundError:
print("Error: File not found.")
sys.exit(1)
# //-----------------------
# Create output file
def create_output_file_dir(num, filename):
return open(f"./data/output_{filename}/split_{num}.txt", "a")
# //-----------------------
# Create output directory
def create_output_directory(filename):
output_path = f"./data/output_{filename}"
try:
if os.path.exists(output_path): # Remove directory if exists
shutil.rmtree(output_path)
os.mkdir(output_path)
except OSError:
print("Error: Failed to create output directory.")
sys.exit(1)
def ch_dir():
# Print the current working directory
print("Current working directory: {0}".format(os.getcwd()))
# Change the current working directory
os.chdir('./data')
# Print the current working directory
print("Current working directory: {0}".format(os.getcwd()))
# //-----------------------
def split_file():
try:
line_count = 0
split_count = 1
max_lines = how_many_lines_per_file()
# ch_dir()
input_file = fun.file_dir()
input_lines = input_file.readlines()
create_output_directory(input_file.name)
output_file = create_output_file_dir(split_count, input_file.name)
for line in input_lines:
output_file.write(line)
line_count = 1
# Create new output file if current output file's line count is greater than max line count
if line_count > max_lines:
split_count = 1
line_count = 0
output_file.close()
# Prevent creation of an empty file after splitting is finished
if not len(input_lines) == max_lines:
output_file = create_output_file_dir(split_count, input_file.name)
# Handle errors
except Exception as e:
print(f"An unknown error occurred: {e}")
# Success message
else:
print(f"Successfully split {input_file.name} into {split_count} output files!")
# //-----------------------
if __name__ == "__main__":
split_file()
Is there any python script or deep learning tool to split them for using the to next task
CodePudding user response:
By calling readlines()
on the input file handle, you are reading (or trying to) the whole file into memory at the same time. You can do this instead to process the file one line at a time, never having more than a single line in memory:
input_file = fun.file_dir()
...
for line in input_file:
...
Another issue to be aware of is that this line:
if not len(input_lines) == max_lines:
output_file = create_output_file_dir(split_count, input_file.name)
is likely not doing what you think it is. Neither input_lines
or max_lines
will ever change inside the loop, so this will either always create a new file or never will. Unless you happen to process a file with exactly max_lines
lines in it, this will always be true
. This is not a big deal, but I think as your code is now you're going to end up with an extra empty file. You need to change the logic anyway, so you'll have to rethink how to make this work.
UPDATE:
Here's how I would modify the logic to do the right thing regarding opening each of the output files:
input_file = fun.file_dir()
# output_file = create_output_file_dir(split_count, input_file.name)
output_file = None
...
for line in input_file:
# Open a new output file if we don't have one open
if not output_file:
output_file = create_output_file_dir(split_count, input_file.name)
output_file.write(line)
line_count = 1
# Close the current output file if the line count has reached its max
if line_count > max_lines:
split_count = 1
line_count = 0
output_file.close()
output_file = None
The key idea here is that you can't know if you need a new output file until you have tried to read the next line after closing the current output file. This logic only opens an output file when it has a line to write out and there is no open output file.
CodePudding user response:
You're trying to allocate a big file into memory which is not possible.
Instead of reading all the content at once just read line by line and process it.
I've fixed the bug seen by @CryptoFool
import fun
import sys
import os
import shutil
# //-----------------------
# Retrieve and return output file max lines from input
def how_many_lines_per_file():
try:
return int(input("Max lines per output file: "))
except ValueError:
print("Error: Please use a valid number.")
sys.exit(1)
# //-----------------------
# Retrieve input filename and return file pointer
def file_dir():
try:
filename = input("Input filename: ")
return open(filename, 'r')
except FileNotFoundError:
print("Error: File not found.")
sys.exit(1)
# //-----------------------
# Create output file
def create_output_file_dir(num, filename):
return open(f"./data/output_{filename}/split_{num}.txt", "a")
# //-----------------------
# Create output directory
def create_output_directory(filename):
output_path = f"./data/output_{filename}"
try:
if os.path.exists(output_path): # Remove directory if exists
shutil.rmtree(output_path)
os.mkdir(output_path)
except OSError:
print("Error: Failed to create output directory.")
sys.exit(1)
def ch_dir():
# Print the current working directory
print("Current working directory: {0}".format(os.getcwd()))
# Change the current working directory
os.chdir('./data')
# Print the current working directory
print("Current working directory: {0}".format(os.getcwd()))
# //-----------------------
def split_file():
try:
line_count = 0
split_count = 1
max_lines = how_many_lines_per_file()
# ch_dir()
input_file = fun.file_dir()
create_output_directory(input_file.name)
output_file = create_output_file_dir(split_count, input_file.name)
for line in input_file:
output_file.write(line)
line_count = 1
# Create new output file if current output file's line count is greater than max line count
if line_count > max_lines:
split_count = 1
line_count = 0
output_file.close()
# Prevent creation of an empty file after splitting is finished
if not line_count == max_lines:
output_file = create_output_file_dir(split_count, input_file.name)
# Handle errors
except Exception as e:
print(f"An unknown error occurred: {e}")
# Success message
else:
print(f"Successfully split {input_file.name} into {split_count} output files!")
# //-----------------------
if __name__ == "__main__":
split_file()