assigning class name to list column in pandas-CodePudding

I have a DataFrame column whose values are lists:

categories

[0, 0, 2, 2, 2]
[0, 0, 2, 2]
[0, 0, 2, 2, 2]
[1, 1, 2, 2]
[2, 2, 0, 0]
[1, 0, 2, 3]

Here is the sample list:

li = [[0, 0, 2, 2, 2], [0, 0, 2, 2], [0, 0, 2, 2, 2], [1, 1, 2, 2], [2, 2, 0, 0], [1, 0, 2, 2], [0, 0, 2, 2], [0, 0, 2, 2, 2], [0, 0, 2, 2], [2, 2, 0, 0], [2, 2, 0, 0], [0, 0, 2, 2], [0, 0, 2, 2], [0, 0, 2, 2], [0, 0, 2, 2], [2, 2, 0, 0], [2, 2], [1, 1, 2], [0, 2, 2, 0], [0, 0, 2, 2], [0, 1], [0, 0], [0, 0, 2, 2], [0, 0], [0, 0, 2, 2], [0, 2, 2, 0], [2, 2, 2, 2], [0, 0, 2, 2], [0, 0, 2, 2], [0, 0, 2, 2], [2, 2, 0, 0], [0, 0, 2, 2], [2, 2, 0, 1], [2, 2, 0, 0], [0, 0, 2, 2], [0, 0, 2, 2], [0, 0, 2, 2], [0, 0, 2, 2], [0, 0, 2, 2], [0, 0, 2, 2], [0, 0, 2, 2, 2], [2, 1], [0, 0, 2, 2, 2], [2, 2, 0, 0], [2, 0], [2, 2, 0, 0], [0, 2], [0, 2, 2], [0, 0, 2, 2], [0, 2, 2, 0], [0, 0, 2, 2], [0, 0, 2, 2], [0, 0, 2, 2], [0, 0, 2, 2, 2], [0, 0, 2], [0, 0, 2, 2], [0, 0, 2, 2], [0, 0, 2, 2], [0, 0, 2, 2], [3, 2, 0, 0], [0, 0], [0, 0, 2, 2], [0, 0, 2, 2, 2, 2], [0, 0, 2, 2], [0, 0, 2, 2], [0, 0, 2, 2], [1, 3], [0, 0, 2, 2], [0, 0, 2, 2], [0, 0, 2, 2], [0, 0, 2, 2], [0, 0, 2, 2], [0, 2, 0, 2], [0, 0, 2, 2], [2, 2, 0, 0], [2, 2, 0, 0], [2, 2], [0, 0, 2, 2], [0, 2], [0, 0, 2, 2], [0, 0, 2, 2], [2, 2, 0], [2, 2, 0, 0], [0, 0, 2, 2], [0, 0, 2], [2], [0, 0, 2, 2], [2, 2, 2, 1, 1], [0, 0], [0, 3], [2, 2], [1, 2], [1,3]]

I want to create a new column (class_name) based on the following rules.

The rules are ordered by priority and should be applied one after the other; a row keeps the label of the first rule it matches.

If both 1 and 3 are present, set class_name to class1
On the remaining rows, wherever 1 is present, set class_name to class2
On the remaining rows, wherever 3 is present, set class_name to class3
On the remaining rows, if both 0 and 2 are present, set class_name to class4
On the remaining rows, wherever 0 is present, set class_name to class5
On the remaining rows, wherever 2 is present, set class_name to class6

What I have tried so far

# Start with every row unlabelled.
df.loc[:, "class_name"] = None

for index, row in df.iterrows():
    # Only touch rows that no earlier rule has labelled yet.
    # `is None` is the idiomatic identity test for the None singleton (PEP 8).
    if row["class_name"] is None:
        categories = list(row["categories"])
        # Highest-priority rule: both 1 and 3 occur in the list.
        if 1 in categories and 3 in categories:
            df.loc[index, "class_name"] = "class1"

Similarly, I have a separate loop for each condition, but it's too slow. Is there a way to do it without looping?

CodePudding user response：

I think I understood the question correctly. I tried something like this, and it seems to work fine.

import pandas as pd
li = [[0, 0, 2, 2, 2], [0, 0, 2, 2], [0, 0, 2, 2, 2], [1, 1, 2, 2], [2, 2, 0, 0], [1, 0, 2, 2], [0, 0, 2, 2], [0, 0, 2, 2, 2], [0, 0, 2, 2], [2, 2, 0, 0], [2, 2, 0, 0], [0, 0, 2, 2], [0, 0, 2, 2], [0, 0, 2, 2], [0, 0, 2, 2], [2, 2, 0, 0], [2, 2], [1, 1, 2], [0, 2, 2, 0], [0, 0, 2, 2], [0, 1], [0, 0], [0, 0, 2, 2], [0, 0], [0, 0, 2, 2], [0, 2, 2, 0], [2, 2, 2, 2], [0, 0, 2, 2], [0, 0, 2, 2], [0, 0, 2, 2], [2, 2, 0, 0], [0, 0, 2, 2], [2, 2, 0, 1], [2, 2, 0, 0], [0, 0, 2, 2], [0, 0, 2, 2], [0, 0, 2, 2], [0, 0, 2, 2], [0, 0, 2, 2], [0, 0, 2, 2], [0, 0, 2, 2, 2], [2, 1], [0, 0, 2, 2, 2], [2, 2, 0, 0], [2, 0], [2, 2, 0, 0], [0, 2], [0, 2, 2], [0, 0, 2, 2], [0, 2, 2, 0], [0, 0, 2, 2], [0, 0, 2, 2], [0, 0, 2, 2], [0, 0, 2, 2, 2], [0, 0, 2], [0, 0, 2, 2], [0, 0, 2, 2], [0, 0, 2, 2], [0, 0, 2, 2], [3, 2, 0, 0], [0, 0], [0, 0, 2, 2], [0, 0, 2, 2, 2, 2], [0, 0, 2, 2], [0, 0, 2, 2], [0, 0, 2, 2], [1, 3], [0, 0, 2, 2], [0, 0, 2, 2], [0, 0, 2, 2], [0, 0, 2, 2], [0, 0, 2, 2], [0, 2, 0, 2], [0, 0, 2, 2], [2, 2, 0, 0], [2, 2, 0, 0], [2, 2], [0, 0, 2, 2], [0, 2], [0, 0, 2, 2], [0, 0, 2, 2], [2, 2, 0], [2, 2, 0, 0], [0, 0, 2, 2], [0, 0, 2], [2], [0, 0, 2, 2], [2, 2, 2, 1, 1], [0, 0], [0, 3], [2, 2], [1, 2], [1,3]]

df=pd.DataFrame(data={'category':li})

def check(x):
    """Return the class label for one list of category codes.

    Rules are evaluated in priority order and the first match wins:

    * 1 and 3 both present -> 'class1'
    * 1 present            -> 'class2'
    * 3 present            -> 'class3'
    * 0 and 2 both present -> 'class4'
    * 0 present            -> 'class5'
    * 2 present            -> 'class6'
    * none of the above    -> 'no_class'

    ``x`` only needs to support the ``in`` operator.
    """
    # Anything containing 1 is class1 or class2, depending on whether 3 is there too.
    if 1 in x:
        return 'class1' if 3 in x else 'class2'
    # Reaching here means 1 is absent, so a 3 on its own is class3.
    if 3 in x:
        return 'class3'
    # Neither 1 nor 3: decide between the 0/2 based labels.
    if 0 in x:
        return 'class4' if 2 in x else 'class5'
    return 'class6' if 2 in x else 'no_class'

# Series.apply calls `check` once per list; no wrapping lambda is needed.
df['check'] = df['category'].apply(check)

print(df)
'''
    category    check
66  [1, 3]          class1
92  [1, 3]          class1
3   [1, 1, 2, 2]    class2
5   [1, 0, 2, 2]    class2
17  [1, 1, 2]       class2
20  [0, 1]          class2
32  [2, 2, 0, 1]    class2
41  [2, 1]          class2
87  [2, 2, 2, 1, 1] class2
91  [1, 2]          class2
59  [3, 2, 0, 0]    class3
89  [0, 3]          class3
0   [0, 0, 2, 2, 2] class4

'''

CodePudding user response：

Okay, so I ran your problem using a couple of different methods. The fastest of them all was calling pandas.Series.apply directly on the column. Here's the code:

from __future__ import annotations
import pandas as pd


def class_name(row: list) -> str | None:
    """Map one list of category codes to its class label.

    The rules are prioritised; the first one that matches decides the label:
    1 and 3 -> "class1", 1 -> "class2", 3 -> "class3",
    0 and 2 -> "class4", 0 -> "class5", 2 -> "class6".

    Returns:
        The label, or ``None`` when the list contains none of 0, 1, 2, 3.
    """
    has_one = 1 in row
    has_three = 3 in row
    if has_one:
        return "class1" if has_three else "class2"
    if has_three:
        return "class3"

    # Neither 1 nor 3 is present, so only the 0/2 rules remain.
    has_zero = 0 in row
    has_two = 2 in row
    if has_zero:
        return "class4" if has_two else "class5"
    return "class6" if has_two else None


# == How to use ============================

# Apply the classifier to each list in the "categories" column.
df["class_name"] = df["categories"].apply(class_name)
# Result on a DataFrame with 186000 rows:
"""
CPU times: user 80.9 ms, sys: 904 µs, total: 81.8 ms
Wall time: 82 ms
"""

Other Implementations I Tried

I've also tried some other implementations for comparison. Here they are:

from __future__ import annotations
import pandas as pd


# == Code to Generate Sample DataFrame ==============

li = [
    [0, 0, 2, 2, 2],
    [0, 0, 2, 2],
    [0, 0, 2, 2, 2],
    [1, 1, 2, 2],
    [2, 2, 0, 0],
    [1, 0, 2, 2],
    [0, 0, 2, 2],
    [0, 0, 2, 2, 2],
    [0, 0, 2, 2],
    [2, 2, 0, 0],
    [2, 2, 0, 0],
    [0, 0, 2, 2],
    [0, 0, 2, 2],
    [0, 0, 2, 2],
    [0, 0, 2, 2],
    [2, 2, 0, 0],
    [2, 2],
    [1, 1, 2],
    [0, 2, 2, 0],
    [0, 0, 2, 2],
    [0, 1],
    [0, 0],
    [0, 0, 2, 2],
    [0, 0],
    [0, 0, 2, 2],
    [0, 2, 2, 0],
    [2, 2, 2, 2],
    [0, 0, 2, 2],
    [0, 0, 2, 2],
    [0, 0, 2, 2],
    [2, 2, 0, 0],
    [0, 0, 2, 2],
    [2, 2, 0, 1],
    [2, 2, 0, 0],
    [0, 0, 2, 2],
    [0, 0, 2, 2],
    [0, 0, 2, 2],
    [0, 0, 2, 2],
    [0, 0, 2, 2],
    [0, 0, 2, 2],
    [0, 0, 2, 2, 2],
    [2, 1],
    [0, 0, 2, 2, 2],
    [2, 2, 0, 0],
    [2, 0],
    [2, 2, 0, 0],
    [0, 2],
    [0, 2, 2],
    [0, 0, 2, 2],
    [0, 2, 2, 0],
    [0, 0, 2, 2],
    [0, 0, 2, 2],
    [0, 0, 2, 2],
    [0, 0, 2, 2, 2],
    [0, 0, 2],
    [0, 0, 2, 2],
    [0, 0, 2, 2],
    [0, 0, 2, 2],
    [0, 0, 2, 2],
    [3, 2, 0, 0],
    [0, 0],
    [0, 0, 2, 2],
    [0, 0, 2, 2, 2, 2],
    [0, 0, 2, 2],
    [0, 0, 2, 2],
    [0, 0, 2, 2],
    [1, 3],
    [0, 0, 2, 2],
    [0, 0, 2, 2],
    [0, 0, 2, 2],
    [0, 0, 2, 2],
    [0, 0, 2, 2],
    [0, 2, 0, 2],
    [0, 0, 2, 2],
    [2, 2, 0, 0],
    [2, 2, 0, 0],
    [2, 2],
    [0, 0, 2, 2],
    [0, 2],
    [0, 0, 2, 2],
    [0, 0, 2, 2],
    [2, 2, 0],
    [2, 2, 0, 0],
    [0, 0, 2, 2],
    [0, 0, 2],
    [2],
    [0, 0, 2, 2],
    [2, 2, 2, 1, 1],
    [0, 0],
    [0, 3],
    [2, 2],
    [1, 2],
    [1, 3],
]


def make_df(size=1):
    """Build a benchmark DataFrame by repeating the sample list ``size`` times.

    The result has a single ``categories`` column holding the entries of the
    module-level ``li`` in order, repeated ``size`` times back to back.
    """
    repeated = []
    for _ in range(size):
        repeated.extend(li)
    return pd.DataFrame({'categories': repeated})


# == Implementation 1 ============================

def class_name2(row: pd.Series) -> str | None:
    """Classify one DataFrame row by the codes in its ``categories`` cell.

    Intended for ``df.apply(class_name2, axis=1)``, where each call receives
    a whole row. The row is indexed by column name, so it is annotated as a
    ``pd.Series`` rather than a plain ``list``.

    Args:
        row: A row-like object supporting ``row["categories"]``; that cell
            must support the ``in`` operator.

    Returns:
        The label of the first matching rule, checked in priority order
        (1 and 3 -> "class1", 1 -> "class2", 3 -> "class3",
        0 and 2 -> "class4", 0 -> "class5", 2 -> "class6"),
        or ``None`` if no rule matches.
    """
    categories = row["categories"]

    if 1 in categories and 3 in categories:
        return "class1"
    if 1 in categories:
        return "class2"
    if 3 in categories:
        return "class3"
    if 0 in categories and 2 in categories:
        return "class4"
    if 0 in categories:
        return "class5"
    if 2 in categories:
        return "class6"
    return None

# Fresh benchmark frame built from 2000 repetitions of the sample list.
df = make_df(2000)
# axis=1 passes each row to class_name2, which looks up row["categories"] itself.
df["class_name"] = df.apply(class_name2, axis=1)
# Result:
"""
CPU times: user 1.69 s, sys: 17 ms, total: 1.71 s
Wall time: 1.71 s
"""


# == Implementation 2 ============================
# This is your original implementation

# Fresh benchmark frame; every row starts unlabelled.
df = make_df(2000)
df.loc[:, "class_name"] = None

# Row-by-row pass: a matching row triggers one label-based write through df.loc.
for index, row in df.iterrows():
    # The column was set to None just above, so this guard presumably passes
    # for every row (Implementation 3 drops it).
    if row["class_name"] == None:
        categories = list(row["categories"])
        # Rules are checked in priority order; the first match wins.
        if 1 in categories and 3 in categories:
            df.loc[index, "class_name"] = "class1"
        elif 1 in categories:
            df.loc[index, "class_name"] = "class2"
        elif 3 in categories:
            df.loc[index, "class_name"] = "class3"
        elif 0 in categories and 2 in categories:
            df.loc[index, "class_name"] = "class4"
        elif 0 in categories:
            df.loc[index, "class_name"] = "class5"
        elif 2 in categories:
            df.loc[index, "class_name"] = "class6"
        # No else branch: a row matching no rule keeps None.

# Result:
"""
CPU times: user 24.2 s, sys: 65.6 ms, total: 24.3 s
Wall time: 24.5 s
"""


# == Implementation 3 ============================
# This is your original implementation without the if statement

# Fresh benchmark frame; every row starts unlabelled.
df = make_df(2000)
df.loc[:, "class_name"] = None

# Same row-by-row loop, minus the per-row check of the existing label.
for index, row in df.iterrows():
    categories = list(row["categories"])
    # Rules are checked in priority order; the first match wins.
    if 1 in categories and 3 in categories:
        df.loc[index, "class_name"] = "class1"
    elif 1 in categories:
        df.loc[index, "class_name"] = "class2"
    elif 3 in categories:
        df.loc[index, "class_name"] = "class3"
    elif 0 in categories and 2 in categories:
        df.loc[index, "class_name"] = "class4"
    elif 0 in categories:
        df.loc[index, "class_name"] = "class5"
    elif 2 in categories:
        df.loc[index, "class_name"] = "class6"
    # No else branch: a row matching no rule keeps None.

# Result:
"""
CPU times: user 24 s, sys: 91.2 ms, total: 24.1 s
Wall time: 24.3 s
"""

# == Implementation 4 ============================
# This is your original implementation without the if statement
# and the list conversion

# Fresh benchmark frame; every row starts unlabelled.
df = make_df(2000)
df.loc[:, "class_name"] = None

# Same row-by-row loop, testing membership on the cell value directly
# instead of on a list() copy of it.
for index, row in df.iterrows():
    categories = row["categories"]
    # Rules are checked in priority order; the first match wins.
    if 1 in categories and 3 in categories:
        df.loc[index, "class_name"] = "class1"
    elif 1 in categories:
        df.loc[index, "class_name"] = "class2"
    elif 3 in categories:
        df.loc[index, "class_name"] = "class3"
    elif 0 in categories and 2 in categories:
        df.loc[index, "class_name"] = "class4"
    elif 0 in categories:
        df.loc[index, "class_name"] = "class5"
    elif 2 in categories:
        df.loc[index, "class_name"] = "class6"
    # No else branch: a row matching no rule keeps None.

# Result:
"""
CPU times: user 23.4 s, sys: 80 ms, total: 23.5 s
Wall time: 24.2 s
"""

# == Implementation 5 ============================
# Using `swifter`. Install swifter before trying this one:
# pip install swifter

import swifter


def class_name(row: list) -> str | None:
    """Return the class label for a list of category codes, or ``None``.

    Each rule pairs the codes that must all be present with the label to
    assign. Rules are tried from highest to lowest priority and the first
    one satisfied wins.
    """
    prioritized_rules = (
        ((1, 3), "class1"),
        ((1,), "class2"),
        ((3,), "class3"),
        ((0, 2), "class4"),
        ((0,), "class5"),
        ((2,), "class6"),
    )
    for required_codes, label in prioritized_rules:
        if all(code in row for code in required_codes):
            return label
    return None


# Fresh benchmark frame built from 2000 repetitions of the sample list.
df = make_df(2000)
# Same per-list classifier, routed through the `.swifter` accessor rather than
# calling Series.apply directly.
df["class_name"] = df["categories"].swifter.apply(class_name)

# Result:
"""
CPU times: user 572 ms, sys: 11 ms, total: 582 ms
Wall time: 930 ms
"""

Summary

Here's a summary of all the results:

Implementation	Total Time	Times Faster
Best	82 ms	300x
Implementation 1	1.71 s	14.3x
Implementation 2	24.5 s	1x
Implementation 3	24.3 s	1.008x
Implementation 4	24.2 s	1.012x
Implementation 5	930 ms	26x