I am converting multiple text files to a csv file. My text file looks like this:
ITEM: TIMESTEP
55000
ITEM: NUMBER OF ATOMS
4365
ITEM: BOX BOUNDS ff ff ff
-0.3 0.3
-0.6 0.6
-0.6 0.6
ITEM: ATOMS id type x y z vx vy vz fx fy fz omegax omegay omegaz radius
4356 1 -0.0885288 -0.0101421 -0.48871 -0.000941682 0.778688 -0.0153902 -0.00720861 -0.0533703 0.0104717 0.35581 -0.0601358 -0.436049 0.01
4227 1 0.0157977 0.00542603 -0.488429 -0.00996111 0.784119 0.00813807 -0.000491847 0.0144889 -0.0120111 1.08208 -0.0671177 0.369492 0.01
3973 1 0.0179724 0.0256167 -0.48799 -0.00582994 0.772455 0.0394544 0.0109589 -0.0187232 -0.00111718 -0.0586513 -0.162943 1.12784 0.01
4300 1 0.0900919 0.0248592 -0.488025 -0.000455483 0.769978 0.0388239 -0.00364509 0.0409803 -0.00269227 3.94355 -0.0249566 -0.223111 0.01
4200 1 -0.0230223 0.0329911 -0.483108 -0.00238 0.778547 0.0500186 0.0421189 -0.021588 0.05607 0.112989 -0.0813771 -1.09981 0.015
4339 1 0.00143577 0.0368542 -0.488107 0.000587848 0.784672 0.0593572 0.00385562 -0.00475113 -0.00710483 -0.201196 0.158512 -5.63826 0.01
4106 1 0.0648392 0.0269728 -0.483248 -0.00365836 0.766081 0.0395827 0.0418642 0.1802 0.0547313 -0.0578358 0.124205 -0.96464 0.015
4104 1 -0.084453 0.0507114 -0.482726 -0.000596577 0.75636 0.0806599 0.000817826 0.0119286 -0.0150014 -0.0864852 -0.103877 0.198773 0.015
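Right now my CSV file contains only the values after line 9 of the text file (index 8 in the Python code). I also want to include the value from line 2 (the number under the `ITEM: TIMESTEP` header) in the CSV, as an extra TIMESTEP column alongside all the values after line 9.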
I tried to edit my code but could not get it to work. Could I get some help? Here is my code:
import numpy as np
import pandas as pd
import csv
import glob
import time
def main():
    """Concatenate the atom rows of every dump file in ``./all/`` into ``all.csv``.

    The column names are taken from line 9 of ``all/dump46000.data`` (the
    ``ITEM: ATOMS ...`` line, minus its first two tokens).  For every file
    matching ``./all/*dump*.data`` the header line is appended to ``all.csv``
    followed by every line after the first nine, with spaces replaced by
    commas.

    Note that the TIMESTEP value (line 2 of each dump file) is *not* written;
    this is the behaviour described in the question.
    """
    start = time.time()
    data_folder = "./all/"  # folder name
    files = glob.glob(data_folder + '*dump*.data')
    print("Total files:", len(files))

    # Get the header from one of the files: skip the 8 preamble lines, then
    # drop the leading "ITEM:" and "ATOMS" tokens of line 9.
    with open('all/dump46000.data', 'r') as f:
        for _ in range(8):
            next(f)  # skip first 8 lines
        header = ','.join(f.readline().split()[2:]) + '\n'
    print(header)

    for file in files:
        # 'a' (append) because the output is reopened for every input file.
        with open(file, 'r') as f, open('all.csv', 'a') as g:
            g.write(header)  # write the header (once per input file)
            for _ in range(9):
                next(f)  # skip first 9 lines (preamble + ATOMS header)
            for line in f:
                g.write(line.rstrip().replace(' ', ',') + '\n')
    print(time.time() - start)


if __name__ == "__main__":
    main()
My folder all
contains more than 600 files:
['./all/dump501000.data',
'./all/dump307000.data',
'./all/dump612000.data',
'./all/dump369000.data',
'./all/dump23000.data',
'./all/dump470000.data',
'./all/dump235000.data',
'./all/dump6000.data',
'./all/dump568000.data',
'./all/dump506000.data',
'./all/dump623000.data',
'./all/dump329000.data',
'./all/dump220000.data',
.....................
....................
I want this csv file from text file:
id type x y z vx vy vz fx fy fz omegax omegay omegaz radius TIMESTEP
But I am getting this csv
id type x y z vx vy vz fx fy fz omegax omegay omegaz radius
Thank you
CodePudding user response:
Based on what you want, here's what should work
import numpy as np
import pandas as pd
import csv
import glob
import time
def main():
    """Merge every dump file in ``./all/`` into ``all.csv`` with a TIMESTEP column.

    Each input file is expected to follow the layout shown in the question:
    line 1 ``ITEM: TIMESTEP``, line 2 the timestep value, lines 3-8 further
    preamble, line 9 ``ITEM: ATOMS <column names...>`` and the atom rows from
    line 10 onwards.

    The CSV header (the column names followed by ``TIMESTEP``) is written
    once, taken from the first file processed.  Every atom row is written
    comma-separated with the timestep of its source file appended as the last
    field.  ``all.csv`` is opened in append mode, so an existing file is
    extended rather than overwritten.
    """
    start = time.perf_counter()
    data_folder = "./all/"  # folder name
    files = glob.glob(data_folder + '*dump*.data')
    print("Total files:", len(files))

    header_written = False
    # Open the output once instead of reopening it for every input file.
    with open('all.csv', 'a') as g:
        for file in files:
            with open(file, 'r') as f:
                next(f)  # line 1: "ITEM: TIMESTEP"
                timestep = f.readline().strip()  # line 2: the value itself
                for _ in range(6):
                    next(f)  # lines 3-8: atom count and box bounds
                # Line 9: drop the leading "ITEM:" and "ATOMS" tokens.
                columns = f.readline().split()[2:]
                if not header_written:
                    header = ','.join(columns + ['TIMESTEP'])
                    print(header)
                    g.write(header + '\n')
                    header_written = True
                for line in f:
                    fields = line.split()
                    if not fields:
                        continue  # ignore blank lines, e.g. at end of file
                    g.write(','.join(fields + [timestep]) + '\n')
    print(time.perf_counter() - start)


if __name__ == "__main__":
    main()
Let me know if you need a different layout or anything else in the final CSV. Also, to time a piece of code, prefer `time.perf_counter()` over `time.time()`: it is a monotonic, high-resolution clock intended for measuring short durations.
CodePudding user response:
Here is something you can try in order to add TIMESTEP to your data in the CSV. I am not sure whether you need the header repeated for each file; my understanding is that it only needs to be written once, at the top. If you do want it written for each file, move the header write inside the `for` loop.
import numpy as np
import pandas as pd
import csv
import glob
import time
def main():
    """Merge every dump file in ``./all/`` into ``all.csv`` with a leading TIMESTEP column.

    Each input file is expected to follow the layout shown in the question:
    line 1 ``ITEM: TIMESTEP``, line 2 the timestep value, lines 3-8 further
    preamble, line 9 ``ITEM: ATOMS <column names...>`` and the atom rows from
    line 10 onwards.

    The header (``TIMESTEP`` followed by the atom column names) is read from
    the first matching file and written once.  Every atom row is then written
    comma-separated, prefixed with the timestep of its source file.
    ``all.csv`` is opened in append mode, so an existing file is extended
    rather than overwritten.  Nothing is written when no file matches.
    """
    start = time.time()
    data_folder = "./all/"  # folder name
    files = glob.glob(data_folder + '*dump*.data')
    print("Total files:", len(files))
    if not files:
        return  # no input: do not create an output file holding only a header

    # Get the header from one of the files (assumes all dumps share columns).
    with open(files[0], 'r') as f:
        header = f.readline().split()[1:]  # "ITEM: TIMESTEP" -> ["TIMESTEP"]
        for _ in range(7):
            next(f)  # skip the timestep value and preamble lines 3-8
        header.extend(f.readline().split()[2:])  # drop "ITEM:" and "ATOMS"
    print(header)

    # Open the output once; the header is written a single time at the top.
    with open('all.csv', 'a') as g:
        g.write(','.join(header) + '\n')
        for file in files:
            with open(file, 'r') as f:
                next(f)  # line 1: "ITEM: TIMESTEP"
                time_step = f.readline().strip()  # line 2: the value itself
                for _ in range(7):
                    next(f)  # lines 3-9: preamble and the ATOMS header
                for line in f:
                    fields = line.split()
                    if not fields:
                        continue  # ignore blank lines, e.g. at end of file
                    g.write(','.join([time_step] + fields) + '\n')
    print(time.time() - start)


if __name__ == "__main__":
    main()