transform array to dataframe-CodePudding

I have the following array:

['preprocessed\\AB_30624_badchannels.set', {'E88', 'E91', 'E248', 'E139', 'E245', 'E216', 'E111'}, 'preprocessed\\ACM_98630_badchannels.set', {'E88', 'E186', 'E91', 'E139', 'E102', 'E216', 'E111', 'E238'}, 'preprocessed\\AL_96705_badchannels.set', {'E88', 'E37', 'E91'}]

Could you tell me how to create a dataframe where each column name is the filename without the path and the extension (i.e. preprocessed\AB_30624_badchannels.set -> AB_30624_badchannels) and the rows are the channels contained in the corresponding set (e.g. 'E88', 'E91', 'E248', 'E139', 'E245', 'E216', 'E111' for AB_30624)?

The desired output looks like below:

AB_30624 | ACM_98630 | AL_96705|
E88      |E88        |E88
E91      |E37        |E37
E248     |E91        |E91
E139     |E73
E245     |E232
E216     |E256
E111     |E139
         |E235
         |E216
         |E46

Thank you for your help

CodePudding user response：

Since the requested structure of the df is not clear to me, you can find 2 options below.

Option 1 creates one row per file, holding the file name and its full channel list.
Option 2 creates one row per channel, pairing the file name with each channel in its set.

import pandas as pd

data = ['preprocessed\\AB_30624_badchannels.set', {'E88', 'E91', 'E248', 'E139', 'E245', 'E216', 'E111'}, 'preprocessed\\ACM_98630_badchannels.set', {'E88', 'E186', 'E91', 'E139', 'E102', 'E216', 'E111', 'E238'}, 'preprocessed\\AL_96705_badchannels.set', {'E88', 'E37', 'E91', 'E73', 'E232', 'E256', 'E139', 'E235', 'E216', 'E46'}, 'preprocessed\\AP_43781_badchannels.set', {'E25', 'E164', 'E253', 'E104', 'E230', 'E237', 'E18', 'E198', 'E120', 'E10', 'E233', 'E46', 'E54', 'E82', 'E31'}]

# option 1: one row per file; the 'channels' cell holds that file's whole
# channel list.
# NOTE: the "output (1)" listing shown later reflects the earlier behaviour,
# where `entries` was rebound to a dict and only the last file survived.
entries = []
file = None
channels = None
for x in data:
    if isinstance(x, str):
        # keep the last path component and drop the 4-character ".set" suffix
        file = x.split('\\')[-1][:-4]
        continue
    if file is None:
        # a channel set with no preceding file name cannot be labelled; skip it
        continue
    channels = list(x)
    # append (do not rebind `entries`) so every file is kept; an empty channel
    # set is still recorded because no truthiness check is involved
    entries.append({'file': file, 'channels': channels})
    file = None
    channels = None

df = pd.DataFrame(entries)
print(df.to_string())


# option 2: long format, one row per (file, channel) pair

entries = []
file = None
channels = None
for item in data:
    if not isinstance(item, str):
        # a channel set: emit one record per channel under the current file name
        channels = item
        entries.extend({'file': file, 'channel': name} for name in channels)
        file = None
        channels = None
        continue
    # a path string: keep the last component minus its 4-character extension
    file = item.split('\\')[-1][:-4]

df = pd.DataFrame(entries)
print(df.to_string())

output (1)

                    file channels
0   AP_43781_badchannels      E46
1   AP_43781_badchannels     E233
2   AP_43781_badchannels     E237
3   AP_43781_badchannels      E18
4   AP_43781_badchannels     E164
5   AP_43781_badchannels     E104
6   AP_43781_badchannels      E82
7   AP_43781_badchannels     E253
8   AP_43781_badchannels     E120
9   AP_43781_badchannels      E10
10  AP_43781_badchannels      E54
11  AP_43781_badchannels     E198
12  AP_43781_badchannels      E25
13  AP_43781_badchannels      E31
14  AP_43781_badchannels     E230

output (2)

   channel                   file
0      E88   AB_30624_badchannels
1     E216   AB_30624_badchannels
2     E248   AB_30624_badchannels
3     E111   AB_30624_badchannels
4     E139   AB_30624_badchannels
5     E245   AB_30624_badchannels
6      E91   AB_30624_badchannels
7      E88  ACM_98630_badchannels
8     E216  ACM_98630_badchannels
9     E111  ACM_98630_badchannels
10    E186  ACM_98630_badchannels
11    E139  ACM_98630_badchannels
12    E238  ACM_98630_badchannels
13    E102  ACM_98630_badchannels
14     E91  ACM_98630_badchannels
15     E88   AL_96705_badchannels
16    E216   AL_96705_badchannels
17    E232   AL_96705_badchannels
18    E235   AL_96705_badchannels
19     E46   AL_96705_badchannels
20     E73   AL_96705_badchannels
21    E139   AL_96705_badchannels
22    E256   AL_96705_badchannels
23     E37   AL_96705_badchannels
24     E91   AL_96705_badchannels
25     E46   AP_43781_badchannels
26    E233   AP_43781_badchannels
27    E237   AP_43781_badchannels
28     E18   AP_43781_badchannels
29    E164   AP_43781_badchannels
30    E104   AP_43781_badchannels
31     E82   AP_43781_badchannels
32    E253   AP_43781_badchannels
33    E120   AP_43781_badchannels
34     E10   AP_43781_badchannels
35     E54   AP_43781_badchannels
36    E198   AP_43781_badchannels
37     E25   AP_43781_badchannels
38     E31   AP_43781_badchannels
39    E230   AP_43781_badchannels

CodePudding user response：

My solution is a bit more brute-force:

import pandas as pd

array = ['preprocessed\\AB_30624_badchannels.set', {'E88', 'E91', 'E248', 'E139', 'E245', 'E216', 'E111'},
         'preprocessed\\ACM_98630_badchannels.set', {'E88', 'E186', 'E91', 'E139', 'E102', 'E216', 'E111', 'E238'},
         'preprocessed\\AL_96705_badchannels.set',
         {'E88', 'E37', 'E91', 'E73', 'E232', 'E256', 'E139', 'E235', 'E216', 'E46'},
         'preprocessed\\AP_43781_badchannels.set',
         {'E25', 'E164', 'E253', 'E104', 'E230', 'E237', 'E18', 'E198', 'E120', 'E10', 'E233', 'E46', 'E54', 'E82',
          'E31'}]

# Split the alternating list into labels (file names) and their channel lists.
row_names = []
rows = []
for i in array:
    if isinstance(i, str):
        colum_name = i.split("\\")
        # last path component, with the extension removed as the question asks
        row_names.append(colum_name[-1].rsplit(".", 1)[0])
    elif isinstance(i, set):
        rows.append(list(i))

# merge them to DataFrame
# Size the frame from the longest channel list instead of a fixed width, so
# sets of any length fit; default=0 keeps an empty input from raising.
max_length = max((len(i) for i in rows), default=0)
df = pd.DataFrame(index=row_names, columns=list(range(max_length)))

# Fill each file's row left to right; shorter rows keep NaN in the tail cells.
for i, row in enumerate(rows):
    for j, val in enumerate(row):
        df.iloc[i, j] = val

# Transpose so that each file becomes a column and channels run down the rows.
df = df.T

CodePudding user response：

You have several problems to deal with. The first is that your list alternates between a name string and set of values. A quick generator that converts this alternation into pairs fixes that. The names in the data can be converted to your desired column name with a regular expression. Your values have different lengths and because they are sets, their order is random. We can turn them into named Series but can't fix that order problem. Finally, concatenate the series and you've got your dataframe.

import re
import pandas as pd
import numpy as np

# extracts channel name from ex. "preprocessed\\AB_30624_badchannels.set"
# The dot is escaped so only a literal ".set" matches, and the capture group
# excludes backslashes so only the final path component is captured.
colname_re = re.compile(r"\\([^\\]*?)_badchannels\.set")

# test data set
data = ['preprocessed\\AB_30624_badchannels.set', {'E88', 'E91', 'E248', 'E139', 'E245', 'E216', 'E111'}, 'preprocessed\\ACM_98630_badchannels.set', {'E88', 'E186', 'E91', 'E139', 'E102', 'E216', 'E111', 'E238'}, 'preprocessed\\AL_96705_badchannels.set', {'E88', 'E37', 'E91'}]

def iter_pairs(seq):
    """Yield consecutive, non-overlapping pairs from seq.

    For example, (1, 2, 3, 4) yields (1, 2) and then (3, 4). A trailing
    element that has no partner is dropped.
    """
    source = iter(seq)
    # Zipping one iterator with itself draws two items per step and stops as
    # soon as a complete pair can no longer be formed.
    yield from zip(source, source)

# create list of named series for the dataframe
# (each series is named by the capture group of colname_re applied to the path)
interim_series = [pd.Series(list(values), name=colname_re.search(name).group(1))
        for name, values in iter_pairs(data)]

# build dataframe, convert NaN to ""
# pd.concat raises on an empty list, so fall back to an empty frame.
df = pd.concat(interim_series, axis=1) if interim_series else pd.DataFrame()
del interim_series
# fillna avoids the np.NaN alias, which NumPy 2.0 removed (only np.nan remains).
df = df.fillna("")
print(df)