Split columns base on conditions in Python-CodePudding

I have a dataset where there are multiple values in one column, the problem is there might be some null values in these columns. And I need to create three different columns from this column where the number of characters is not fixed nor the position.

The data before:

df=pd.DataFrame({'Date':['2-18-2019','2-18-2019','2-19-2019','2-19-2019','2-20-2019','2-21-2019','2-21-2019','2-22-2019'],'Item':['NY01','Ld01','Du02','Du01','Ps55','L55','Du85','L85'],'SizeAgeQuantity':['13 3/8 5 846','4 1/2 557 85','9 5/8 47 4464','30 58','32 304 304','32 304 304 ','7 6588 685','4118 587','29']})


   Date    |    Item    |    SizeAgeQuantity
2-18-2019  |    NY01    |     13 3/8 5 846         
2-18-2019  |    Ld01    |     4 1/2 557 85        
2-19-2019  |    Du02    |     9 5/8 47 4464         
2-19-2019  |    Du01    |         30 58      
2-20-2019  |    Ps55    |     32 304 304      
2-21-2019  |    L55     |     7  6588 685  
2-21-2019  |    Du85    |        4118 587       
2-22-2019  |    L85     |        29

The result I am looking for is like this:

   Date    |    Item    |    Size    |    Age   |   Quantity
2-18-2019  |    NY01    |   13 3/8   |     5    |     846         
2-18-2019  |    Ld01    |    4 1/2   |    557   |     85        
2-19-2019  |    Du02    |    9 5/8   |    47    |     4464         
2-19-2019  |    Du01    |     30     |    58    |  
2-20-2019  |    Ps55    |     32     |    304   |     304      
2-21-2019  |    L55     |     7      |    6588  |     685  
2-21-2019  |    Du85    |            |    4118  |     587       
2-22-2019  |    L85     |            |    29    |

The only consistent thing is the column "Size" should only have a value from the following set ("4 1/2","7", "9 5/8", "13 3/8", "18", "30", "32")

I have tried the following code: df['Size'], df['FrakS'], df['Age'], df['Quantity'] = df['SizeAgeQuantity'].str.split(' ', 3).str

But the result comes as the following:

   Date    |    Item    |    Size    |   FrakS   |    Age   |   Quantity
2-18-2019  |    NY01    |     13     |    3/8    |     5    |     846         
2-18-2019  |    Ld01    |     4      |    1/2    |    557   |     85        
2-19-2019  |    Du02    |     9      |    5/8    |    47    |     4464         
2-19-2019  |    Du01    |     30     |    58     |          |  
2-20-2019  |    Ps55    |     32     |    304    |    304   |           
2-21-2019  |    L55     |     7      |    658    |    685   |       
2-21-2019  |    Du85    |    4118    |    587    |          |          
2-22-2019  |    L85     |     29     |           |          |

If anyone can help me I would really appreciate it

CodePudding user response：

try:

df['SizeAgeQuantity'] = df['SizeAgeQuantity'].str.split()

size_set = ["4 1/2","7", "9 5/8", "13 3/8", "18", "30", "32"]

def f_size(x):
    if (len(x)==4) and ('/' in x[1]):
        return ' '.join(x[:2])
    elif (len(x)==3):
        return x[0]

    #_______________________________________________________

    elif (len(x)==2) and (x[0] in size_set): #[size, ...]
        return x[0] 
    elif (len(x)==2) and (x[1] in size_set): #[..., size]
        return x[1] 
    elif (len(x)==1) and (x[0] in size_set): #[size]
        return x[0] 
    #_______________________________________________________
    elif (len(x)==2) and (x[0] not in size_set): #[age, qty]
        return None
    elif (len(x)==1) and (x[0] not in size_set): #[age]
        return None
    #_______________________________________________________

    else:
        return "unknown"

#when we don't have size(the values from the set) and there is only one number we will assume it is for age

def f_age(x):
    if (len(x)==4) and ('/' in x[1]):
        return x[2]
    elif (len(x)==3):
        return x[1]
    #_____________________________________________________
    elif (len(x)==2) and (x[0] in size_set): #[size, ...]
        return "unknown"
    elif (len(x)==2) and (x[1] in size_set): #[..., size]
        return "unknown"
    elif (len(x)==1) and (x[0] in size_set): #[size]
        return None
    #_______________________________________________________
    elif (len(x)==2) and (x[0] not in size_set): #[age, qty]
        return x[0]
    elif (len(x)==1) and (x[0] not in size_set): #[age]
        return x[0]
    #_______________________________________________________
    else:
        return "unknown"
def f_qty(x):
    if (len(x)==4) and ('/' in x[1]):
        return x[-1]
    elif (len(x)==3):
        return x[-1]
    #_______________________________________________________

    elif (len(x)==2) and (x[0] in size_set): #[size, ...]
        return "unknown"
    elif (len(x)==2) and (x[1] in size_set): #[..., size]
        return "unknown"
    elif (len(x)==1) and (x[0] in size_set): #[size]
        return None
    #_______________________________________________________

    elif (len(x)==2) and (x[0] not in size_set): #[age, qty]
        return x[1]
    elif (len(x)==1) and (x[0] not in size_set): #[age]
        return None

    
    else:
        return "unknown"

df['size'] = df['SizeAgeQuantity'].map(f_size)
df['age'] = df['SizeAgeQuantity'].map(f_age)
df['qty'] = df['SizeAgeQuantity'].map(f_qty)

df
    Date        Item    SizeAgeQuantity     size    age     qty
0   2-18-2019   NY01    [13, 3/8, 5, 846]   13 3/8  5       846
1   2-18-2019   Ld01    [4, 1/2, 557, 85]   4 1/2   557     85
2   2-19-2019   Du02    [9, 5/8, 47, 4464]  9 5/8   47      4464
3   2-19-2019   Du01    [30, 58]            30      unknown unknown
4   2-20-2019   Ps55    [32, 304, 304]      32      304     304
5   2-21-2019   L55     [7, 6588, 685]      7       6588    685
6   2-21-2019   Du85    [4118, 587]         None    4118    587
7   2-22-2019   L85     [29]                None    29      None

CodePudding user response：

This should do the trick:

import pandas as pd
import numpy as np

df = pd.DataFrame({
    'Date':['2-18-2019','2-18-2019','2-19-2019','2-19-2019','2-20-2019','2-21-2019','2-21-2019','2-22-2019'],
    'Item':['NY01','Ld01','Du02','Du01','Ps55','L55','Du85','L85'],
    'SizeAgeQuantity':['13 3/8 5 846','4 1/2 557 85','9 5/8 47 4464','30 58','32 304 304','7 6588 685','4118 587','29']})

size_list = ["4 1/2", "7", "9 5/8", "13 3/8", "18", "30", "32"]

columns = []

for row in df.SizeAgeQuantity:
    values = row.split()
    # if there's a "/" in the row, 
    # combine values 1 and 2
    if "/" in row:
        size = " ".join(values[:2])
        del values[0:2]
        values.insert(0, size)
    
    # add nan padding to the values list
    values = values   [np.nan] * (3-len(values))
    
    # if the 1st value is not size, shift list right
    if values[0] not in size_list:
        values = values[-1:]   values[:-1]
    
    columns.append(values)

saq = pd.DataFrame(columns, columns=["Size", "Age", "Quantity"])
out = pd.concat([df.drop("SizeAgeQuantity", axis=1), saq], axis=1)
print(out)

Out:

        Date  Item    Size   Age Quantity
0  2-18-2019  NY01  13 3/8     5      846
1  2-18-2019  Ld01   4 1/2   557       85
2  2-19-2019  Du02   9 5/8    47     4464
3  2-19-2019  Du01      30    58      NaN
4  2-20-2019  Ps55      32   304      304
5  2-21-2019   L55       7  6588      685
6  2-21-2019  Du85     NaN  4118      587
7  2-22-2019   L85     NaN    29      NaN