Categorize data from sentence column-CodePudding

I am trying to add a column of one-word categories, derived by analyzing another column that contains a sentence in each row.

I tried the following code but it kept giving me errors!

def loan_cat(row):
    """Return a one-word loan category for a single DataFrame row.

    Intended for ``df.apply(loan_cat, axis=1)``: ``row`` is one row of the
    frame and only its ``'purpose'`` entry is inspected. Keywords are tested
    in a fixed order and the first one found as a substring decides the
    category.

    Args:
        row: A mapping-like row (e.g. a ``pandas.Series``) with a
            ``'purpose'`` entry holding the free-text loan purpose.

    Returns:
        The matching category name, or ``'undefined'`` when no keyword
        matches or the purpose is not a string (e.g. a missing value).
    """
    # Use the row handed in by ``apply`` instead of looping over the global
    # frame: the previous loop iterated over an integer row count, which
    # raises TypeError, and would have returned on its first pass anyway.
    data = row['purpose']
    if not isinstance(data, str):
        # Missing values (NaN/None) cannot be searched for substrings.
        return 'undefined'

    # Ordered (keywords, category) pairs; earlier entries take precedence.
    # ``'education' | 'university' in data`` applied bitwise-or to two
    # strings (TypeError), so each keyword gets its own membership test.
    rules = (
        (('house',), 'house'),
        (('education', 'university'), 'education'),
        (('wedding',), 'wedding'),
        (('car',), 'car'),
        (('real',), 'real estate'),
        (('property',), 'property'),
    )
    for keywords, category in rules:
        if any(keyword in data for keyword in keywords):
            return category
    return 'undefined'
    
# Call loan_cat once per row (axis=1) and store the results in a new
# 'purpose_1' column.
df['purpose_1'] = df.apply(loan_cat, axis=1)

Is there a better way to analyze and categorize the data?

CodePudding user response：

Use a dict that maps each keyword to its category:

import pandas

# Example loan purposes to categorize.
data = pandas.Series(
    [
        "purchase a house",
        "purchase car",
        "supplemental education",
        "burger",
        "attend university",
    ]
)

# Keyword -> category; several keywords may share one category.
arr = dict(
    house="house",
    education="education",
    university="education",
    car="car",
)


def foo(s, d):
    """Return the value of the first key in ``d`` that occurs in ``s``.

    Keys are tried in the mapping's iteration order; ``"NA"`` is returned
    when none of them is contained in ``s``.
    """
    hits = (label for keyword, label in d.items() if keyword in s)
    return next(hits, "NA")


# Categorize every entry: each string is passed to foo together with the
# keyword mapping. The commented lines below show the resulting Series.
data.apply(lambda x: foo(x, arr))
# 0        house
# 1          car
# 2    education
# 3           NA
# 4    education
# dtype: object

CodePudding user response：

I figured out the answer:

def loan_cat(value):
    """Map a free-text loan purpose to a category label.

    Keyword fragments are checked in order and the first one found inside
    ``value`` decides the label; ``'undefined'`` is returned when none match.
    """
    # Ordered (fragment, label) pairs; earlier entries win.
    fragment_labels = (
        ('hous', 'House'),
        ('educ', 'Education'),
        ('university', 'Education'),
        ('wedding', 'Wedding'),
        ('car', 'Car'),
        ('real', 'Real Estate'),
        ('property', 'Property'),
    )
    for fragment, label in fragment_labels:
        if fragment in value:
            return label
    return 'undefined'

# Categorize each purpose string; loan_cat is passed to apply directly.
df['purpose_cat'] = df['purpose'].apply(loan_cat)
# Show how many rows fall into each category.
print(df['purpose_cat'].value_counts())