Dealing with Merged Tables that are Stacked-CodePudding

I imported a csv via Pandas read_table which is essentially a stacked column where each student is named and then values found in the following rows.

Student-John
01/01/2021	334
01/02/2021	456
Student-Sally
01/01/2021	76
01/04/2021	789

I want to pivot these so that each student has their own column and dates are on the left hand side.

Date	Student-Jon	Student-Sally
01/01/2021	334	76
01/02/2021	456
01/04/2021		789

My approach is to bring in CSV via pandas dataframe.

import pandas as pd
df = pd.read_table('C:/Users/****data.csv', skiprows=1, header=None)
df[2]=""
df.columns = "Date", "Val"

x="Start"

#Started with this although the Student line doesn't work

for ind, row in df.iterrows():
    if df['Date'][ind] == "Student*":
        x = df['Date'][ind]
        df.drop(ind, inplace=True)
    else:
        df['Val'][ind] = x

CodePudding user response：

Use boolean mask to filter out your dataframe then pivot to reshape it:

# Rename columns
df.columns = ['Date', 'Value']

# Find Student rows
m = df[0].str.startswith('Student')

# Create the future column
df['Student'] = df[0].mask(~m).ffill()

# Remove Student rows
df = df[~m]

# Reshape your dataframe
df = df.pivot('Date', 'Student', 'Value').rename_axis(columns=None).reset_index()

Output:

>>> df
         Date Student-John Student-Sally
0  01/01/2021          334            76
1  01/02/2021          456           NaN
2  01/04/2021          NaN           789

Setup:

import pandas as pd
import numpy as np

data = {0: ['Student-John', '01/01/2021', '01/02/2021',
            'Student-Sally', '01/01/2021', '01/04/2021'],
        1: [np.nan, '334', '456', np.nan, '76', '789']}
df = pd.DataFrame(data)
print(df)

# Output
               0    1
0   Student-John  NaN
1     01/01/2021  334
2     01/02/2021  456
3  Student-Sally  NaN
4     01/01/2021   76
5     01/04/2021  789

CodePudding user response：

A Naive Solution will look like:

import pandas as pd

details = {
    'Date' : ['Student-John','01/01/2021','01/02/2021','Student-Sally','01/01/2021','01/02/2021'],
    'val' : ['', 'Y', 'N', '', 'N', 'N'],
}

df = pd.DataFrame(details)
print(df)

            Date val
0   Student-John    
1     01/01/2021   Y
2     01/02/2021   N
3  Student-Sally    
4     01/01/2021   N
5     01/02/2021   N

# Creating a new data frame - df1
# adding new dates to dates list and new column names
# to columns list, temp list contains Y/N values
# Adding dates and Y/N values to dataframe whenever
# we find a new column name

det = {}
df1 = pd.DataFrame(det)
temp_list = []
date_list = []
col_name = 'empty'
for ind in df.index:
  if df['val'][ind] == '':
    df1[col_name] = temp_list
    df1['Date'] = date_list
    temp_list = []
    date_list = []
    col_name = df['Date'][ind]
  else:
    temp_list.append(df['val'][ind])
    date_list.append(df['Date'][ind])
    if ind == len(df)-1:
       df1[col_name] = temp_list

del df1['empty']
print(df1)

         Date Student-John Student-Sally
0  01/01/2021            Y             N
1  01/02/2021            N             N