I have the following text file that looks like the following:
foo_14:14896|foo_23:17988|foo_16:1611|foo_5:14729|foo_6:13008|foo_2:19548|foo_8:2565
foo_18:13236|foo_17:6127|foo_14:5996
foo_4:12561|foo_24:20010|foo_15:20227|foo_6:3489|foo_8:19129|foo_1:2589|foo_25:5204|foo_19:18113|foo_3:20030|foo_16:12739|foo_5:5494
foo_23:3606|foo_13:22588|foo_5:15104|foo_12:3307|foo_16:2039|foo_14:21537|foo_1:13178|foo_20:19090|foo_2:1140
foo_5:10620|foo_21:11142|foo_22:13379|foo_1:16132|foo_10:9330|foo_23:13913|foo_6:8314|foo_13:4967
foo_13:4891|foo_5:22715|foo_2:12098|foo_8:18634|foo_23:1341|foo_4:13995|foo_16:2923|foo_10:13663|foo_7:22939|foo_14:16711|foo_25:17824
foo_22:8180|foo_1:11974|foo_7:6660|foo_17:17839|foo_10:3605|foo_8:22439|foo_6:15494|foo_11:17924|foo_18:17351|foo_20:16627|foo_14:9223|foo_13:2538|foo_16:19077
foo_12:11381|foo_24:541|foo_12:13106|foo_7:6459|foo_11:6398|foo_14:23127|foo_9:15701
using the following code:
df = pd.read_csv("text.txt",names=['text'],header=None)
data = df['text'].str.split("|")
names=[ y.split(':') for x in data for y in x]
ds=pd.DataFrame(names)
ds = ds.pivot(columns=0).fillna('')
I get output that looks like the following:
0 foo_1 foo_10 foo_11 foo_12 foo_13 foo_14 foo_15 foo_16 foo_17 foo_18 foo_19 foo_2 foo_20 foo_21 foo_22 foo_23 foo_24 foo_25 foo_3 foo_4 foo_5 foo_6 foo_7 foo_8 foo_9
0 14896
1 17988
2 1611
3 14729
4 13008
.. ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
64 13106
65 6459
66 6398
67 23127
68 15701
However, I would like to see each of the data fields on the same line. such as:
foo_14, foo_23, foo_16, foo_5, foo_6, foo_2 and foo_8 all appearing in row 0
foo_18, foo_17 and foo_14 should all appear in row 1
etc etc...
Instead of one row per entry I'd like to bulk them up to read as such
0 foo_1 foo_10 foo_11 foo_12 foo_13 foo_14 foo_15 foo_16 foo_17 foo_18 foo_19 foo_2 foo_20 foo_21 foo_22 foo_23 foo_24 foo_25 foo_3 foo_4 foo_5 foo_6 foo_7 foo_8 foo_9
0 14896 1611 19548 17988 14729 13008 2565
etc
CodePudding user response:
Here is one way, if I understand you correctly:
from io import StringIO
import pandas as pd
txtfile = StringIO("""foo_14:14896|foo_23:17988|foo_16:1611|foo_5:14729|foo_6:13008|foo_2:19548|foo_8:2565
foo_18:13236|foo_17:6127|foo_14:5996
foo_4:12561|foo_24:20010|foo_15:20227|foo_6:3489|foo_8:19129|foo_1:2589|foo_25:5204|foo_19:18113|foo_3:20030|foo_16:12739|foo_5:5494
foo_23:3606|foo_13:22588|foo_5:15104|foo_12:3307|foo_16:2039|foo_14:21537|foo_1:13178|foo_20:19090|foo_2:1140
foo_5:10620|foo_21:11142|foo_22:13379|foo_1:16132|foo_10:9330|foo_23:13913|foo_6:8314|foo_13:4967
foo_13:4891|foo_5:22715|foo_2:12098|foo_8:18634|foo_23:1341|foo_4:13995|foo_16:2923|foo_10:13663|foo_7:22939|foo_14:16711|foo_25:17824
foo_22:8180|foo_1:11974|foo_7:6660|foo_17:17839|foo_10:3605|foo_8:22439|foo_6:15494|foo_11:17924|foo_18:17351|foo_20:16627|foo_14:9223|foo_13:2538|foo_16:19077
foo_12:11381|foo_24:541|foo_12:13106|foo_7:6459|foo_11:6398|foo_14:23127|foo_9:15701""")
df = pd.read_csv(txtfile, names=['text'])
#df_out = df['text'].str.split('|', expand=True).apply(lambda x: x.str.replace('(:\d )', '', regex=True))
Improvement by @Chris in comments below.
df_out = df['text'].str.split('|', expand=True).replace('(:\d )', '', regex=True)
print(df_out)
Output:
0 1 2 3 4 5 6 7 8 9 10 11 12
0 foo_14 foo_23 foo_16 foo_5 foo_6 foo_2 foo_8 None None None None None None
1 foo_18 foo_17 foo_14 None None None None None None None None None None
2 foo_4 foo_24 foo_15 foo_6 foo_8 foo_1 foo_25 foo_19 foo_3 foo_16 foo_5 None None
3 foo_23 foo_13 foo_5 foo_12 foo_16 foo_14 foo_1 foo_20 foo_2 None None None None
4 foo_5 foo_21 foo_22 foo_1 foo_10 foo_23 foo_6 foo_13 None None None None None
5 foo_13 foo_5 foo_2 foo_8 foo_23 foo_4 foo_16 foo_10 foo_7 foo_14 foo_25 None None
6 foo_22 foo_1 foo_7 foo_17 foo_10 foo_8 foo_6 foo_11 foo_18 foo_20 foo_14 foo_13 foo_16
7 foo_12 foo_24 foo_12 foo_7 foo_11 foo_14 foo_9 None None None None None None
Changed input data for row 7 to make foo_12 unique on this row:
from io import StringIO
import pandas as pd
txtfile = StringIO("""foo_14:14896|foo_23:17988|foo_16:1611|foo_5:14729|foo_6:13008|foo_2:19548|foo_8:2565
foo_18:13236|foo_17:6127|foo_14:5996
foo_4:12561|foo_24:20010|foo_15:20227|foo_6:3489|foo_8:19129|foo_1:2589|foo_25:5204|foo_19:18113|foo_3:20030|foo_16:12739|foo_5:5494
foo_23:3606|foo_13:22588|foo_5:15104|foo_12:3307|foo_16:2039|foo_14:21537|foo_1:13178|foo_20:19090|foo_2:1140
foo_5:10620|foo_21:11142|foo_22:13379|foo_1:16132|foo_10:9330|foo_23:13913|foo_6:8314|foo_13:4967
foo_13:4891|foo_5:22715|foo_2:12098|foo_8:18634|foo_23:1341|foo_4:13995|foo_16:2923|foo_10:13663|foo_7:22939|foo_14:16711|foo_25:17824
foo_22:8180|foo_1:11974|foo_7:6660|foo_17:17839|foo_10:3605|foo_8:22439|foo_6:15494|foo_11:17924|foo_18:17351|foo_20:16627|foo_14:9223|foo_13:2538|foo_16:19077
foo_12:11381|foo_24:541|foo_13:13106|foo_7:6459|foo_11:6398|foo_14:23127|foo_9:15701""")
df = pd.read_csv(txtfile, names=['text'])
dfs = df['text'].str.split('|', expand=True).stack().str.split(':', expand=True).reset_index()\
.set_index(['level_0',0])[1].unstack()
print(dfs)
Output:
0 foo_1 foo_10 foo_11 foo_12 foo_13 foo_14 foo_15 foo_16 foo_17 foo_18 ... foo_23 foo_24 foo_25 foo_3 foo_4 foo_5 foo_6 foo_7 foo_8 foo_9
level_0 ...
0 NaN NaN NaN NaN NaN 14896 NaN 1611 NaN NaN ... 17988 NaN NaN NaN NaN 14729 13008 NaN 2565 NaN
1 NaN NaN NaN NaN NaN 5996 NaN NaN 6127 13236 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2 2589 NaN NaN NaN NaN NaN 20227 12739 NaN NaN ... NaN 20010 5204 20030 12561 5494 3489 NaN 19129 NaN
3 13178 NaN NaN 3307 22588 21537 NaN 2039 NaN NaN ... 3606 NaN NaN NaN NaN 15104 NaN NaN NaN NaN
4 16132 9330 NaN NaN 4967 NaN NaN NaN NaN NaN ... 13913 NaN NaN NaN NaN 10620 8314 NaN NaN NaN
5 NaN 13663 NaN NaN 4891 16711 NaN 2923 NaN NaN ... 1341 NaN 17824 NaN 13995 22715 NaN 22939 18634 NaN
6 11974 3605 17924 NaN 2538 9223 NaN 19077 17839 17351 ... NaN NaN NaN NaN NaN NaN 15494 6660 22439 NaN
7 NaN NaN 6398 11381 13106 23127 NaN NaN NaN NaN ... NaN 541 NaN NaN NaN NaN NaN 6459 NaN 15701