want to list out the 1| 2| 3| in separate column and sum of all related number with another column-CodePudding

want to list out the 1| 2| 3| in separate column (a) it the first items seperated by ; and sum of all related number with another column (d)

data structure

1|1=89325|2=96682|3=81940   267947
2|1=17162|2=21282|3=23033;  61477
3|1=71761|2=73375|3=83581;  228717

coding

a = {'TAG': ';1|1=89325|2=96682|3=81940;2|1=17162|2=21282|3=23033;3|1=71761|2=73375|3=83581'}
parsed_data = re.findall(r'([\d.] )=([\d.] )', a['TAG'])
a = ','.join(str(dict(zip(['a', 'b', 'c'], i))) for i in parsed_data)
a = pd.DataFrame(eval(a))
a

code output

a   b
0   1   89325
1   2   96682
2   3   81940
3   1   17162
4   2   21282  
5   3   23033
6   1   71761
7   2   73375
8   3   83581

expected output

    a   b   c       d       
0   1   1   89325   
1   1   2   96682   
2   1   3   81940   267947
3   2   1   17162   
4   2   2   21282   
5   2   3   23033   61477
6   3   1   71761   
7   3   2   73375   
8   3   3   83581   228717

CodePudding user response：

Solution which doesn't use regex but it generates the expected output:

import pandas as pd
from itertools import chain

a = {'TAG': '1@@@;1|1=89325|2=96682|3=81940;2|1=17162|2=21282|3=23033;3|1=71761|2=73375|3=83581'}

list_of_lists = []

# Transform string into list of lists of tuples
for row in a['TAG'].split(';'):
    content = tuple(row.split('|'))
    row_num = int(content[0][0])
    if len(content) > 1:
        list_of_lists.append([tuple([row_num]   [int(elem) for elem in c.split('=')]) for c in content[1:]])

# Calculate the total for each list
totals = [sum([values[-1] for values in t]) for t in list_of_lists]

# Append the total to the last tuple in each list
for index, elem in enumerate(list_of_lists):
    elem[-1] = (*elem[-1], totals[index])
    
# Flatten the list_of_lists structure
data = list(chain(*list_of_lists))
# Create the dataframe
df = pd.DataFrame(data, columns =['a', 'b', 'c', 'd'])

UPDATE: Solution where map function is applied to input parsed using regular expression:

import re
from itertools import chain
import pandas as pd

a = {'TAG': ';1|1=89325|2=96682|3=81940;2|1=17162|2=21282|3=23033;3|1=71761|2=73375|3=83581'}

def list_of_tuples(tup):
    indexes = [(0,1,2), (0,3,4), (0,5,6)]
    lot = [tuple([int(tup[i]) for i in index]) for index in indexes]
    lot[-1] = (*lot[-1], sum([int(tup[i]) for i in (2,4,6)]))
    return lot

regex = r"\;(\d )\|(\d )\=(\d )\|(\d )\=(\d )\|(\d )\=(\d )"

data = list(chain(*map(list_of_tuples, re.findall(regex, a['TAG']))))

df = pd.DataFrame(data, columns =['a', 'b', 'c', 'd'])

UPDATE 2: A more efficient map function, although the code is more verbose.

def list_of_tuples(tup):
    values_total = 0
    tuples_list = []
    for index,elem in enumerate(tup):
        if index == 0:
            a = int(elem)
        else:
            if index%2 == 1:
                b = int(elem)
            else:
                c = int(elem)
                values_total  = c
                tuples_list.append((a, b, c))
    tuples_list[-1] = (*tuples_list[-1], values_total)
    return tuples_list