I have the following array:
['preprocessed\\AB_30624_badchannels.set', {'E88', 'E91', 'E248', 'E139', 'E245', 'E216', 'E111'}, 'preprocessed\\ACM_98630_badchannels.set', {'E88', 'E186', 'E91', 'E139', 'E102', 'E216', 'E111', 'E238'}, 'preprocessed\\AL_96705_badchannels.set', {'E88', 'E37', 'E91'}]
Could you tell me how to create a dataframe where each column name is the filename without the path and the extension (i.e. preprocessed\AB_30624_badchannels.set -> AB_30624_badchannels) and the rows are the channels contained in the corresponding set (i.e. 'E88', 'E91', 'E248', 'E139', 'E245', 'E216', 'E111' for AB_30624)?
The desired output looks like below:
AB_30624 | ACM_98630 | AL_96705|
E88 |E88 |E88
E91 |E37 |E37
E248 |E91 |E91
E139 |E73
E245 |E232
E216 |E256
E111 |E139
|E235
|E216
|E46
Thank you for your help
CodePudding user response:
Since the requested structure of the df is not clear to me, you can find 2 options below.
Option 1 creates one row per file, holding the file name and its list of channels.
Option 2 creates one row per (file name, channel) pair, i.e. each channel of a file's set gets its own row.
import pandas as pd
data = ['preprocessed\\AB_30624_badchannels.set', {'E88', 'E91', 'E248', 'E139', 'E245', 'E216', 'E111'}, 'preprocessed\\ACM_98630_badchannels.set', {'E88', 'E186', 'E91', 'E139', 'E102', 'E216', 'E111', 'E238'}, 'preprocessed\\AL_96705_badchannels.set', {'E88', 'E37', 'E91', 'E73', 'E232', 'E256', 'E139', 'E235', 'E216', 'E46'}, 'preprocessed\\AP_43781_badchannels.set', {'E25', 'E164', 'E253', 'E104', 'E230', 'E237', 'E18', 'E198', 'E120', 'E10', 'E233', 'E46', 'E54', 'E82', 'E31'}]
# option 1: one row per file -> columns 'file' and 'channels' (a list)
entries = []
file = None
channels = None
for x in data:
    if isinstance(x, str):
        # keep the base name and drop the 4-character ".set" suffix
        file = x.split('\\')[-1][:-4]
    else:
        # sets have no defined order; sort so the output is reproducible
        channels = sorted(x)
    # compare against None so a file with an empty channel set still gets a row
    if file is not None and channels is not None:
        # append (not rebind) so every file is kept, not only the last one
        entries.append({'file': file, 'channels': channels})
        file = None
        channels = None
df = pd.DataFrame(entries)
print(df.to_string())
# option 2: long format, one row per (file, channel) pair
entries = []
file = None
channels = None
for item in data:
    if not isinstance(item, str):
        # a channel set: emit one row per channel under the pending file name
        channels = item
        entries.extend({'file': file, 'channel': ch} for ch in channels)
        # clear the state so the next set cannot reuse this file name
        file = None
        channels = None
        continue
    # a path string: keep the base name and drop the 4-character ".set" suffix
    file = item.split('\\')[-1][:-4]
df = pd.DataFrame(entries)
print(df.to_string())
output (1)
file channels
0 AP_43781_badchannels E46
1 AP_43781_badchannels E233
2 AP_43781_badchannels E237
3 AP_43781_badchannels E18
4 AP_43781_badchannels E164
5 AP_43781_badchannels E104
6 AP_43781_badchannels E82
7 AP_43781_badchannels E253
8 AP_43781_badchannels E120
9 AP_43781_badchannels E10
10 AP_43781_badchannels E54
11 AP_43781_badchannels E198
12 AP_43781_badchannels E25
13 AP_43781_badchannels E31
14 AP_43781_badchannels E230
output (2)
channel file
0 E88 AB_30624_badchannels
1 E216 AB_30624_badchannels
2 E248 AB_30624_badchannels
3 E111 AB_30624_badchannels
4 E139 AB_30624_badchannels
5 E245 AB_30624_badchannels
6 E91 AB_30624_badchannels
7 E88 ACM_98630_badchannels
8 E216 ACM_98630_badchannels
9 E111 ACM_98630_badchannels
10 E186 ACM_98630_badchannels
11 E139 ACM_98630_badchannels
12 E238 ACM_98630_badchannels
13 E102 ACM_98630_badchannels
14 E91 ACM_98630_badchannels
15 E88 AL_96705_badchannels
16 E216 AL_96705_badchannels
17 E232 AL_96705_badchannels
18 E235 AL_96705_badchannels
19 E46 AL_96705_badchannels
20 E73 AL_96705_badchannels
21 E139 AL_96705_badchannels
22 E256 AL_96705_badchannels
23 E37 AL_96705_badchannels
24 E91 AL_96705_badchannels
25 E46 AP_43781_badchannels
26 E233 AP_43781_badchannels
27 E237 AP_43781_badchannels
28 E18 AP_43781_badchannels
29 E164 AP_43781_badchannels
30 E104 AP_43781_badchannels
31 E82 AP_43781_badchannels
32 E253 AP_43781_badchannels
33 E120 AP_43781_badchannels
34 E10 AP_43781_badchannels
35 E54 AP_43781_badchannels
36 E198 AP_43781_badchannels
37 E25 AP_43781_badchannels
38 E31 AP_43781_badchannels
39 E230 AP_43781_badchannels
CodePudding user response:
my solution is a bit more brutal:
import pandas as pd
array = ['preprocessed\\AB_30624_badchannels.set', {'E88', 'E91', 'E248', 'E139', 'E245', 'E216', 'E111'},
         'preprocessed\\ACM_98630_badchannels.set', {'E88', 'E186', 'E91', 'E139', 'E102', 'E216', 'E111', 'E238'},
         'preprocessed\\AL_96705_badchannels.set',
         {'E88', 'E37', 'E91', 'E73', 'E232', 'E256', 'E139', 'E235', 'E216', 'E46'},
         'preprocessed\\AP_43781_badchannels.set',
         {'E25', 'E164', 'E253', 'E104', 'E230', 'E237', 'E18', 'E198', 'E120', 'E10', 'E233', 'E46', 'E54', 'E82',
          'E31'}]
row_names = []
rows = []
for i in array:
    if isinstance(i, str):
        # base name without the directory part and without the extension,
        # e.g. 'preprocessed\\AB_30624_badchannels.set' -> 'AB_30624_badchannels'
        row_names.append(i.split("\\")[-1].rsplit(".", 1)[0])
    elif isinstance(i, set):
        # sets are unordered; sort so the resulting frame is reproducible
        rows.append(sorted(i))
# merge them to DataFrame: one row per file, padded with missing values up to
# the longest channel list (no hard-coded width), then transposed so that
# every file becomes a column
df = pd.DataFrame(rows, index=row_names)
df = df.T
CodePudding user response:
You have several problems to deal with. The first is that your list alternates between a name string and set of values. A quick generator that converts this alternation into pairs fixes that. The names in the data can be converted to your desired column name with a regular expression. Your values have different lengths and because they are sets, their order is random. We can turn them into named Series but can't fix that order problem. Finally, concatenate the series and you've got your dataframe.
import re
import pandas as pd
import numpy as np
# extracts the column name (e.g. "AB_30624") from a path such as
# "preprocessed\\AB_30624_badchannels.set"; the capture cannot span a
# backslash, so only the last path component is used, and the dot before
# "set" is escaped so it matches a literal "."
colname_re = re.compile(r"\\([^\\]*?)_badchannels\.set")
# test data set
data = ['preprocessed\\AB_30624_badchannels.set', {'E88', 'E91', 'E248', 'E139', 'E245', 'E216', 'E111'}, 'preprocessed\\ACM_98630_badchannels.set', {'E88', 'E186', 'E91', 'E139', 'E102', 'E216', 'E111', 'E238'}, 'preprocessed\\AL_96705_badchannels.set', {'E88', 'E37', 'E91'}]
def iter_pairs(seq):
    """Yield consecutive, non-overlapping 2-tuples taken from seq.

    For example (1, 2, 3, 4) produces (1, 2) and then (3, 4). When seq holds
    an odd number of items, the unpaired last item is dropped.
    """
    source = iter(seq)
    # zipping one iterator with itself draws two successive items per step
    yield from zip(source, source)
# create list of named series for the dataframe: one Series per
# (path, channel set) pair, named after the part of the file name that
# colname_re captures
interim_series = [pd.Series(list(values), name=colname_re.search(name).group(1))
                  for name, values in iter_pairs(data)]
# build dataframe; series shorter than the longest one are padded with NaN
df = pd.concat(interim_series, axis=1)
del interim_series
# blank out the padding; fillna is used instead of replace(np.NaN, "")
# because the np.NaN alias was removed in NumPy 2.0
df = df.fillna("")
print(df)