I need a little help parsing console output to get the data into the desired format.
Actual data:
> PROF_DETAILS:
|prof_name | cnt| min| max| last| tot| avg| intvl| hist| pf_max| pf_avg| pf_tot|ignoredCnt| intHist|intMax|
|PROF_1 | 16816| 0| 158| 0| 197893| 11| 500| 16816: 0: 0| 0| 0| 0| 72592|85578:3830: 0| 4|
|PROF_2 |32436579| 0| 193| 0|936877226| 28| 500|32436571: 0: 0| 0| 0| 0| 0|32394220:42351: 0| 4|
|PROF_3 |32511390| 0| 210| 0|2023670249| 62| 500|32511390: 0: 0| 0| 0| 0| 146|29121201:3390335: 0| 4|
|PROF_4 | 3| 0| 7| 0| 20| 6| 500| 3: 0: 0| 0| 0| 0| 12| 15: 0: 0| 0|
|PROF_5 | 0| 0| 0| 0| 0| 0| 500| 0: 0: 0| 0| 0| 0| 0| 0: 0: 0| 0|
|PROF_6 | 0| 0| 0| 0| 0| 0| 500| 0: 0: 0| 0| 0| 0| 0| 0: 0: 0| 0|
Expected output:
> PROF_DETAILS = {'PROF_1':{'cnt':16816,'min':0,.......},
'PROF_2':{'cnt':32436579,'min':0,.......},}
CodePudding user response:
It's relatively straightforward with pandas. You can read your file with `read_csv`, then delete the first and last columns (the empty ones produced before the leading and after the trailing `|`), and finally export as a dict:
import pandas as pd
import json
# Parse the pipe-delimited table found at f_in and print it as JSON keyed by
# profile name.
#
# The separator is a regular expression that also swallows the padding around
# each "|". It is written as a raw string so "\s" and "\|" reach the regex
# engine untouched, and engine="python" is requested explicitly because the
# default C parser does not support regex separators.
df = pd.read_csv(f_in, sep=r"\s*\|\s*", engine="python")
# The leading and trailing "|" on every line yield an empty first and last
# column; drop both.
df = df.iloc[:, 1:-1]
df = df.set_index("prof_name")
# After transposing, each profile is a column, so to_dict() produces
# {profile_name: {column_name: value}}.
print(json.dumps(df.transpose().to_dict(), indent=4))
Output:
{
"PROF_1": {
"cnt": 16816,
"min": 0,
"max": 158,
"last": 0,
"tot": 197893,
"avg": 11,
"intvl": 500,
"hist": "16816: 0: 0",
"pf_max": 0,
"pf_avg": 0,
"pf_tot": 0,
"ignoredCnt": 72592,
"intHist": "85578:3830: 0",
"intMax": 4
},
"PROF_2": {
"cnt": 32436579,
"min": 0,
"max": 193,
"last": 0,
"tot": 936877226,
"avg": 28,
"intvl": 500,
"hist": "32436571: 0: 0",
"pf_max": 0,
"pf_avg": 0,
"pf_tot": 0,
"ignoredCnt": 0,
"intHist": "32394220:42351: 0",
"intMax": 4
},
"PROF_3": {
"cnt": 32511390,
"min": 0,
"max": 210,
"last": 0,
"tot": 2023670249,
"avg": 62,
"intvl": 500,
"hist": "32511390: 0: 0",
"pf_max": 0,
"pf_avg": 0,
"pf_tot": 0,
"ignoredCnt": 146,
"intHist": "29121201:3390335: 0",
"intMax": 4
},
"PROF_4": {
"cnt": 3,
"min": 0,
"max": 7,
"last": 0,
"tot": 20,
"avg": 6,
"intvl": 500,
"hist": "3: 0: 0",
"pf_max": 0,
"pf_avg": 0,
"pf_tot": 0,
"ignoredCnt": 12,
"intHist": "15: 0: 0",
"intMax": 0
},
"PROF_5": {
"cnt": 0,
"min": 0,
"max": 0,
"last": 0,
"tot": 0,
"avg": 0,
"intvl": 500,
"hist": "0: 0: 0",
"pf_max": 0,
"pf_avg": 0,
"pf_tot": 0,
"ignoredCnt": 0,
"intHist": "0: 0: 0",
"intMax": 0
},
"PROF_6": {
"cnt": 0,
"min": 0,
"max": 0,
"last": 0,
"tot": 0,
"avg": 0,
"intvl": 500,
"hist": "0: 0: 0",
"pf_max": 0,
"pf_avg": 0,
"pf_tot": 0,
"ignoredCnt": 0,
"intHist": "0: 0: 0",
"intMax": 0
}
}
CodePudding user response:
You can also do this without regex or pandas, if I'm understanding you correctly. Example below:
from pprint import pprint
# Sample console table: one header row followed by one row per profile.
# Every row carries the same number of "|"-separated cells as the header.
# The backslashes after the opening quotes and at the end of the last row
# suppress the leading/trailing newline so no empty line is produced.
data = """\
|prof_name | cnt| min| max| last| tot| avg| intvl| hist| pf_max| pf_avg| pf_tot|ignoredCnt| intHist|intMax|
|PROF_1 | 16816| 0| 158| 0| 197893| 11| 500| 16816: 0: 0| 0| 0| 0| 72592|85578:3830: 0| 4|
|PROF_2 |32436579| 0| 193| 0|936877226| 28| 500|32436571: 0: 0| 0| 0| 0| 0|32394220:42351: 0| 4|
|PROF_3 |32511390| 0| 210| 0|2023670249| 62| 500|32511390: 0: 0| 0| 0| 0| 146|29121201:3390335: 0| 4|
|PROF_4 | 3| 0| 7| 0| 20| 6| 500| 3: 0: 0| 0| 0| 0| 12| 15: 0: 0| 0|
|PROF_5 | 0| 0| 0| 0| 0| 0| 500| 0: 0: 0| 0| 0| 0| 0| 0: 0: 0| 0|
|PROF_6 | 0| 0| 0| 0| 0| 0| 500| 0: 0: 0| 0| 0| 0| 0| 0: 0: 0| 0|\
"""
def as_int_safe(s: str):
    """Return ``int(s)`` when *s* consists only of decimal digits, else *s*.

    ``str.isdecimal`` is used rather than ``str.isnumeric`` because the
    latter is also true for characters such as superscripts and vulgar
    fractions, which ``int()`` rejects with ``ValueError``.
    """
    return int(s) if s.isdecimal() else s
# Remove all padding, then split every non-blank line on "|".
# Dropping every space also compacts the histogram cells
# ("16816: 0: 0" becomes "16816:0:0").
rows = [
    line.strip('|').split('|')
    for line in data.replace(' ', '').splitlines()
    if line  # skip blank lines so a stray newline adds no bogus '' entry
]
# The first row is the header. Its first cell only labels the profile-name
# column, so it is not used as a key.
col_headers = rows.pop(0) if rows else []
rest_headers = col_headers[1:]
# Build one single-key dict per profile. zip() stops at the shorter sequence,
# so a row with fewer cells than the header simply yields fewer keys.
# NOTE: this rebinds `data` from the raw text to the parsed result.
data = [
    {prof_name: dict(zip(rest_headers, map(as_int_safe, rest_cols)))}
    for prof_name, *rest_cols in rows
]
pprint(data)
Output:
[{'PROF_1': {'avg': 11,
'cnt': 16816,
'hist': '16816:0:0',
'ignoredCnt': 72592,
'intHist': '85578:3830:0',
'intMax': 4,
'intvl': 500,
'last': 0,
'max': 158,
'min': 0,
'pf_avg': 0,
'pf_max': 0,
'pf_tot': 0,
'tot': 197893}},
{'PROF_2': {'avg': 28,
'cnt': 32436579,
...
}]