I try to parse in python several csv file from several zip files, to inject in graphite timeseries DB.
I already posted a topic here how to parse one csv file manually, but now i'm trying to parse several csv from several zip files.
CSV files are composed as follow:
Object
Serial number
From : 2021/12/13 12:04
To : 2021/12/14 11:59
sampling rate : 1
"No.","time","00:00:00","00:00:01","00:00:02","00:00:03","00:00:04","00:00:05","00:00:06","00:00:07","00:00:08","00:00:09","00:00:0A"
"1","2021/09/12 02:16",235,610,345,997,446,130,129,94,555,274,4
"2","2021/09/12 02:17",364,210,371,341,294,87,179,106,425,262,3
"3","2021/09/12 02:18",297,343,860,216,275,81,73,113,566,274,3
"4","2021/09/12 02:19",305,243,448,262,387,64,63,119,633,249,3
"5","2021/09/12 02:20",276,151,164,263,315,86,92,175,591,291,1
"6","2021/09/12 02:21",264,343,287,542,312,83,72,122,630,273,4
"7","2021/09/12 02:22",373,157,266,446,246,90,173,90,442,273,2
"8","2021/09/12 02:23",265,112,241,307,329,64,71,82,515,260,3
"9","2021/09/12 02:24",285,247,240,372,176,92,67,83,609,620,1
"10","2021/09/12 02:25",289,964,277,476,356,84,74,104,560,294,1
"11","2021/09/12 02:26",279,747,227,573,569,82,77,99,589,229,5
"12","2021/09/12 02:27",338,370,315,439,653,85,165,346,367,281,2
"13","2021/09/12 02:28",269,135,372,262,307,73,86,93,512,283,4
"14","2021/09/12 02:29",281,207,688,322,233,75,69,85,663,276,2
This is my script:
import re, glob, sys, time, socket, platform, subprocess, pickle, struct, os, zipfile
import pandas as pd
from shutil import copyfile, copy
from pathlib import Path
from io import StringIO
CARBON_SERVER = '127.0.0.1'
CARBON_PICKLE_PORT = 2004
DELAY = 10
def get_filepaths(directory):
file_paths = [] # List which will store all of the full filepaths.
# Walk the tree.
for root, directories, files in os.walk(directory):
for filename in files:
# Join the two strings in order to form the full filepath.
filepath = os.path.join(root, filename)
file_paths.append(filepath) # Add it to the list.
return file_paths # Self-explanatory.
def execute_run(csv_file):
delay = DELAY
if len(sys.argv) > 1:
arg = sys.argv[1]
if arg.isdigit():
delay = int(arg)
else:
sys.stderr.write("Ignoring non-integer argument. Using default: %ss\n" % delay)
sock = socket.socket()
try:
sock.connect( (CARBON_SERVER, CARBON_PICKLE_PORT) )
print("socket ok, write")
except socket.error:
raise SystemExit("Couldn't connect to %(server)s on port %(port)d, is carbon-cache.py running?" % { 'server':CARBON_SERVER, 'port':CARBON_PICKLE_PORT })
try:
run(sock, delay, csv_file)
except KeyboardInterrupt:
sys.stderr.write("\nExiting on CTRL-c\n")
sys.exit(0)
def run(sock, delay, zipobj):
zf = zipfile.ZipFile(zipobj)
df = [pd.read_csv(zf.open(f), skiprows=[0,1,2,3,4,5]) for f in zf.namelist()]
print(df, '\n')
date_pattern='%Y/%m/%d %H:%M'
df['epoch'] = df.apply(lambda row: int(time.mktime(time.strptime(row.time,date_pattern))), axis=1) # create epoch as a column, i block here
print(3)
df
tuples=[] # data will be saved in a list
formated_str='hds.perf.type.serial.object.00.00.00.TOTAL_IOPS'
for each_column in list(df.columns)[2:-1]:
for e in zip(list(df['epoch']),list(df[each_column])):
each_column=each_column.replace("X", '')
print(f"perf.serial.sn.LDEV.{each_column}.TOTAL_IOPS",e)
tuples.append((f"perf.serial.sn.LDEV.{each_column}.TOTAL_IOPS",e))
package = pickle.dumps(tuples, 1)
size = struct.pack('!L', len(package))
sock.sendall(size)
sock.sendall(package)
time.sleep(delay)
def main():
Liste = [ 'MyZip1.zip', 'MyZip2.zip', 'MyZip3.zip' ]
unzip_et()
et_rep='/opt/import2grafana/out/'
full_file_paths = get_filepaths(et_rep)
for idx1, lst_metrics in enumerate(Liste):
for idx2, Lst_f in enumerate(full_file_paths):
if lst_metrics in Lst_f:
zip_file = Lst_f
try:
with zipfile.ZipFile(zip_file) as zipobj:
#print("1 zipobj: ", zipobj.namelist)
#print("1 zipobj: ", zipobj.filename)
execute_run(zipobj.filename)
except Exception as err :
print("Erreur parsing zipfile")
if __name__ == "__main__":
main()
i can list csv, but the line bellow bug
df['epoch'] = df.apply(lambda row: int(time.mktime(time.strptime(row.time,date_pattern))), axis=1) # create epoch as a column, i block here.
[1435 rows x 6 columns], No. time 00:10:00X 00:10:01X 00:10:02X 00:10:03X 00:10:04X 00:10:05X 00:10:06X 00:10:07X 00:10:08X ... 00:10:75X 00:10:76X 00:10:77X 00:10:78X 00:10:79X 00:10:7AX 00:10:7BX 00:10:7CX 00:10:7DX 00:10:7EX 00:10:7FX
0 1 2021/12/13 12:05 -4 -4 -4 -4 -4 -4 -4 -4 -4 ... -4 -4 -4 -4 -4 -4 -4 -4.0 -4.0 -4.0 -4.0
1 2 2021/12/13 12:06 -4 -4 -4 -4 -4 -4 -4 -4 -4 ... -4 -4 -4 -4 -4 -4 -4 -4.0 -4.0 -4.0 -4.0
2 3 2021/12/13 12:07 -4 -4 -4 -4 -4 -4 -4 -4 -4 ... -4 -4 -4 -4 -4 -4 -4 -4.0 -4.0 -4.0 -4.0
3 4 2021/12/13 12:08 -4 -4 -4 -4 -4 -4 -4 -4 -4 ... -4 -4 -4 -4 -4 -4 -4 -4.0 -4.0 -4.0 -4.0
4 5 2021/12/13 12:09 -4 -4 -4 -4 -4 -4 -4 -4 -4 ... -4 -4 -4 -4 -4 -4 -4 -4.0 -4.0 -4.0 -4.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2866 1431 2021/12/14 11:55 2 0 3 2 0 0 1 1 546 ... 1 1 1 1 2 0 0 NaN NaN NaN NaN
2867 1432 2021/12/14 11:56 2 0 1 3 0 0 1 1 565 ... 1 1 1 1 1 0 0 NaN NaN NaN NaN
2868 1433 2021/12/14 11:57 1 0 1 5 0 0 1 1 549 ... 1 1 1 1 1 0 0 NaN NaN NaN NaN
2869 1434 2021/12/14 11:58 1 0 1 7 0 0 1 1 537 ... 1 3 1 1 1 0 0 NaN NaN NaN NaN
2870 1435 2021/12/14 11:59 1 0 1 5 0 0 1 1 532 ... 1 1 1 1 1 0 0 NaN NaN NaN NaN
Many thanks for help
CodePudding user response:
df
is a list of dataframes - not a single dataframe, so naturally df['epoch']
will throw an error. Looping zf.namelist()
should do the trick:
def run(sock, delay, zipobj):
zf = zipfile.ZipFile(zipobj)
for f in zf.namelist():
df = pd.read_csv(zf.open(f), skiprows=[0,1,2,3,4,5])
print(df, '\n')
date_pattern='%Y/%m/%d %H:%M'
df['epoch'] = df.apply(lambda row: int(time.mktime(time.strptime(row.time,date_pattern))), axis=1) # create epoch as a column, i block here
#etc..............