Match lines and get values as list-CodePudding

I need some help parsing console output to get the data into the desired format.

Input from logfile:

Actual data:

> PROF_DETAILS:
|prof_name                       |     cnt|     min|     max|    last|     tot|     avg|     intvl|                      hist|  pf_max|  pf_avg|  pf_tot|ignoredCnt|       intHist|intMax|
|PROF_1           |   16816|       0|     158|       0|  197893|      11|            500|   16816:       0:       0|       0|       0|       0|     72592|85578:3830:   0|    4|
|PROF_2  |32436579|       0|     193|       0|936877226|      28|            500|32436571:       0:       0|       0|       0|       0|         0|32394220:42351:   0|    4|
|PROF_3  |32511390|       0|     210|       0|2023670249|      62|            500|32511390:       0:       0|       0|       0|       0|       146|29121201:3390335:   0|    4|
|PROF_4  |       3|       0|       7|       0|      20|       6|            500|       3:       0:       0|       0|       0|       0|        12|  15:   0:   0|    0|
|PROF_5  |       0|       0|       0|       0|       0|       0|            500|       0:       0:       0|       0|       0|       0|         0|   0:   0:   0|    0|
|PROF_6  |       0|       0|       0|       0|       0|       0|            500|       0:       0:       0|       0|       0|       0|         0|   0:   0:   0|    0|

Expected output:

> PROF_DETAILS = {'PROF_1':{'cnt':16816,'min':0,.......},
                'PROF_2':{'cnt':32436579,'min':0,.......},}

CodePudding user response：

It's relatively straightforward with pandas. You can read your file with read_csv, delete the first and last columns (the empty ones before the leading | and after the trailing |), and finally export the result as a dict:

import pandas as pd
import json

# The separator is a regular expression: a pipe together with any padding
# spaces around it, so the cells arrive already stripped.  It is written as a
# raw string so the backslashes reach the regex engine untouched, and the
# python engine is requested explicitly because regex separators are not
# supported by the default C parser (pandas would otherwise fall back to the
# python engine with a ParserWarning).
df = pd.read_csv(f_in, sep=r"\s*\|\s*", engine="python")
# Every line starts and ends with "|", which yields an empty first and an
# empty last column; drop both.
del df[df.columns[0]] # deleting first column
del df[df.columns[-1]] # deleting last column
df.set_index("prof_name", inplace=True)
# Transposing puts one profile per column, so to_dict() nests the values
# as {prof_name: {column: value}}.
print(json.dumps(df.transpose().to_dict(), indent=4))

Output:

{
    "PROF_1": {
        "cnt": 16816,
        "min": 0,
        "max": 158,
        "last": 0,
        "tot": 197893,
        "avg": 11,
        "intvl": 500,
        "hist": "16816:       0:       0",
        "pf_max": 0,
        "pf_avg": 0,
        "pf_tot": 0,
        "ignoredCnt": 72592,
        "intHist": "85578:3830:   0",
        "intMax": 4
    },
    "PROF_2": {
        "cnt": 32436579,
        "min": 0,
        "max": 193,
        "last": 0,
        "tot": 936877226,
        "avg": 28,
        "intvl": 500,
        "hist": "32436571:       0:       0",
        "pf_max": 0,
        "pf_avg": 0,
        "pf_tot": 0,
        "ignoredCnt": 0,
        "intHist": "32394220:42351:   0",
        "intMax": 4
    },
    "PROF_3": {
        "cnt": 32511390,
        "min": 0,
        "max": 210,
        "last": 0,
        "tot": 2023670249,
        "avg": 62,
        "intvl": 500,
        "hist": "32511390:       0:       0",
        "pf_max": 0,
        "pf_avg": 0,
        "pf_tot": 0,
        "ignoredCnt": 146,
        "intHist": "29121201:3390335:   0",
        "intMax": 4
    },
    "PROF_4": {
        "cnt": 3,
        "min": 0,
        "max": 7,
        "last": 0,
        "tot": 20,
        "avg": 6,
        "intvl": 500,
        "hist": "3:       0:       0",
        "pf_max": 0,
        "pf_avg": 0,
        "pf_tot": 0,
        "ignoredCnt": 12,
        "intHist": "15:   0:   0",
        "intMax": 0
    },
    "PROF_5": {
        "cnt": 0,
        "min": 0,
        "max": 0,
        "last": 0,
        "tot": 0,
        "avg": 0,
        "intvl": 500,
        "hist": "0:       0:       0",
        "pf_max": 0,
        "pf_avg": 0,
        "pf_tot": 0,
        "ignoredCnt": 0,
        "intHist": "0:   0:   0",
        "intMax": 0
    },
    "PROF_6": {
        "cnt": 0,
        "min": 0,
        "max": 0,
        "last": 0,
        "tot": 0,
        "avg": 0,
        "intvl": 500,
        "hist": "0:       0:       0",
        "pf_max": 0,
        "pf_avg": 0,
        "pf_tot": 0,
        "ignoredCnt": 0,
        "intHist": "0:   0:   0",
        "intMax": 0
    }
}

CodePudding user response：

You can also do this without regex and pandas if I'm understanding you correctly. Example below:

from pprint import pprint

# Sample table copied from the log.  The backslash right after the opening
# quotes and the one ending the last row are line continuations, so the
# string has neither a leading nor a trailing newline.  Note that the
# PROF_6 row stops after the "intvl" column.
data = """\
|prof_name                       |     cnt|     min|     max|    last|     tot|     avg|     intvl|                      hist|  pf_max|  pf_avg|  pf_tot|ignoredCnt|       intHist|intMax|
|PROF_1           |   16816|       0|     158|       0|  197893|      11|            500|   16816:       0:       0|       0|       0|       0|     72592|85578:3830:   0|    4|
|PROF_2  |32436579|       0|     193|       0|936877226|      28|            500|32436571:       0:       0|       0|       0|       0|         0|32394220:42351:   0|    4|
|PROF_3  |32511390|       0|     210|       0|2023670249|      62|            500|32511390:       0:       0|       0|       0|       0|       146|29121201:3390335:   0|    4|
|PROF_4  |       3|       0|       7|       0|      20|       6|            500|       3:       0:       0|       0|       0|       0|        12|  15:   0:   0|    0|
|PROF_5  |       0|       0|       0|       0|       0|       0|            500|       0:       0:       0|       0|       0|       0|         0|   0:   0:   0|    0|
|PROF_6  |       0|       0|       0|       0|       0|       0|            500| \
"""

def as_int_safe(s: str):
    """Return *s* converted to ``int`` when possible, otherwise *s* unchanged.

    The conversion is attempted directly instead of being guarded by
    ``str.isnumeric()``: that predicate is true for characters such as
    "²" or "½" which ``int()`` rejects, and false for signed values such
    as "-5" which ``int()`` accepts.

    Args:
        s: A single table cell as text.

    Returns:
        The integer value of *s* if ``int(s)`` succeeds, else *s* itself.
    """
    try:
        return int(s)
    except ValueError:
        # Not an integer literal (e.g. "16816:0:0" or ""); keep the text.
        return s

# Remove the padding spaces, then split every non-blank line into its cells.
# Blank lines (for example one caused by a trailing newline) are skipped so
# they can neither be mistaken for the header nor produce a bogus {'': {}}
# entry; splitlines() also copes with "\r\n" line endings.
rows = [line.strip('|').split('|')
        for line in data.replace(' ', '').splitlines()
        if line]
# The first row holds the column names; an input without any rows simply
# yields an empty result.
col_headers = rows.pop(0) if rows else ['']
prof_name_label, *rest_headers = col_headers

# One single-key dict per profile: {prof_name: {column: value}}.  zip() stops
# at the shorter sequence, so a row with fewer cells than headers just gets
# fewer keys.
data = [{prof_name: dict(zip(rest_headers, map(as_int_safe, rest_cols)))}
        for prof_name, *rest_cols in rows]

pprint(data)

Output:

[{'PROF_1': {'avg': 11,
             'cnt': 16816,
             'hist': '16816:0:0',
             'ignoredCnt': 72592,
             'intHist': '85578:3830:0',
             'intMax': 4,
             'intvl': 500,
             'last': 0,
             'max': 158,
             'min': 0,
             'pf_avg': 0,
             'pf_max': 0,
             'pf_tot': 0,
             'tot': 197893}},
 {'PROF_2': {'avg': 28,
             'cnt': 32436579,
  ...
}]