want to list out the 1|
2|
3|
in separate column (a)
it the first items seperated by ;
and sum of all related number with another column (d)
data structure
1|1=89325|2=96682|3=81940 267947
2|1=17162|2=21282|3=23033; 61477
3|1=71761|2=73375|3=83581; 228717
coding
a = {'TAG': ';1|1=89325|2=96682|3=81940;2|1=17162|2=21282|3=23033;3|1=71761|2=73375|3=83581'}
parsed_data = re.findall(r'([\d.] )=([\d.] )', a['TAG'])
a = ','.join(str(dict(zip(['a', 'b', 'c'], i))) for i in parsed_data)
a = pd.DataFrame(eval(a))
a
code output
a b
0 1 89325
1 2 96682
2 3 81940
3 1 17162
4 2 21282
5 3 23033
6 1 71761
7 2 73375
8 3 83581
expected output
a b c d
0 1 1 89325
1 1 2 96682
2 1 3 81940 267947
3 2 1 17162
4 2 2 21282
5 2 3 23033 61477
6 3 1 71761
7 3 2 73375
8 3 3 83581 228717
CodePudding user response:
Solution which doesn't use regex but it generates the expected output:
import pandas as pd
from itertools import chain
a = {'TAG': '1@@@;1|1=89325|2=96682|3=81940;2|1=17162|2=21282|3=23033;3|1=71761|2=73375|3=83581'}
list_of_lists = []
# Transform string into list of lists of tuples
for row in a['TAG'].split(';'):
content = tuple(row.split('|'))
row_num = int(content[0][0])
if len(content) > 1:
list_of_lists.append([tuple([row_num] [int(elem) for elem in c.split('=')]) for c in content[1:]])
# Calculate the total for each list
totals = [sum([values[-1] for values in t]) for t in list_of_lists]
# Append the total to the last tuple in each list
for index, elem in enumerate(list_of_lists):
elem[-1] = (*elem[-1], totals[index])
# Flatten the list_of_lists structure
data = list(chain(*list_of_lists))
# Create the dataframe
df = pd.DataFrame(data, columns =['a', 'b', 'c', 'd'])
UPDATE: Solution where map function is applied to input parsed using regular expression:
import re
from itertools import chain
import pandas as pd
a = {'TAG': ';1|1=89325|2=96682|3=81940;2|1=17162|2=21282|3=23033;3|1=71761|2=73375|3=83581'}
def list_of_tuples(tup):
indexes = [(0,1,2), (0,3,4), (0,5,6)]
lot = [tuple([int(tup[i]) for i in index]) for index in indexes]
lot[-1] = (*lot[-1], sum([int(tup[i]) for i in (2,4,6)]))
return lot
regex = r"\;(\d )\|(\d )\=(\d )\|(\d )\=(\d )\|(\d )\=(\d )"
data = list(chain(*map(list_of_tuples, re.findall(regex, a['TAG']))))
df = pd.DataFrame(data, columns =['a', 'b', 'c', 'd'])
UPDATE 2: A more efficient map function, although the code is more verbose.
def list_of_tuples(tup):
values_total = 0
tuples_list = []
for index,elem in enumerate(tup):
if index == 0:
a = int(elem)
else:
if index%2 == 1:
b = int(elem)
else:
c = int(elem)
values_total = c
tuples_list.append((a, b, c))
tuples_list[-1] = (*tuples_list[-1], values_total)
return tuples_list