#!/usr/bin/env python3
import pandas
import numpy
example_dataset = {
'Date' : ['01 Mar 2022', '02 Apr 2022', '10 Apr 2022', '15 Apr 2022'],
'Transaction Type' : ['Contactless payment', 'Payment to', 'Contactless payment', 'Contactless payment'],
'Description' : ['Tesco Store', 'Dentist', 'Cinema', 'Sainsburys'],
'Amount' : ['156.00', '55', '21.50', '176.10']
}
df = pandas.DataFrame(example_dataset)
df ['Date'] = pandas.to_datetime(df['Date'], format='%d %b %Y')
df['Category'] = 'tempvalue'
df['Category'] = numpy.where(df['Description'].str.contains('Tesco|Sainsbury'), 'Groceries', df['Category'])
df['Category'] = numpy.where(df['Description'].str.contains('Dentist|Cinema'), 'Stuff', df['Category'])
print (df)
Given the above code I have two related questions please:
- Is there a better way to create the Category column than with the temp value and later replacing it with specific values as shown? I ask as it feels messy.
- How could I have the terms to search for and the category to assign in a separate file? Is that possible? I ask as I want to make it easy for myself to add new terms and define the categories in the future.
CodePudding user response:
1. First Question
you don't need to pre-create the new column, you could do something like this:
#df['Category'] = 'tempvalue'
df['Category'] = numpy.where(df['Description'].str.contains('Tesco|Sainsbury'), 'Groceries',numpy.nan)
df['Category'] = numpy.where(df['Description'].str.contains('Dentist|Cinema'), 'Stuff',df['Category'])
2. Second Question
let's create a simple key-value file called categories.json
in the same directory of your script
{
"Tesco|Sainsbury":"Groceries",
"Dentist|Cinema":"Stuff"
}
you could do something like this to automate the categories assignement
import pandas
import numpy
import json
example_dataset = {
'Date' : ['01 Mar 2022', '02 Apr 2022', '10 Apr 2022', '15 Apr 2022'],
'Transaction Type' : ['Contactless payment', 'Payment to', 'Contactless payment', 'Contactless payment'],
'Description' : ['Tesco Store', 'Dentist', 'Cinema', 'Sainsburys'],
'Amount' : ['156.00', '55', '21.50', '176.10']
}
df = pandas.DataFrame(example_dataset)
df ['Date'] = pandas.to_datetime(df['Date'], format='%d %b %Y')
with open('categories.json') as file:
categories_dict = json.load(file)
df['Category'] = numpy.nan
for key,value in categories_dict.items():
df['Category'] = numpy.where(df['Description'].str.contains(key), value,df['Category'])
In this scenario I suggest to mantain the column initialization for simplicity
CodePudding user response:
You could write the search terms in a csv e.g. 'search_terms.csv' like:
SearchTerm,Value
Tesco|Sainsbury,Groceries
Dentist|Cinema,Stuff
and read it into a dataframe like:
df_search = pd.read_csv('search_terms.csv')
and build a dictionary like:
search_dict = df_search.set_index('SearchTerm')['Value'].to_dict()
Now initialize Category
column to something like:
df['Category'] = np.nan
and update Category
in place with loc
efficiently like:
for k in d:
df.loc[df['Description'].str.match(k),'Category'] = d[k]
Output df:
Date Transaction Type Description Amount Category
0 01 Mar 2022 Contactless payment Tesco Store 156.00 Groceries
1 02 Apr 2022 Payment to Dentist 55 Stuff
2 10 Apr 2022 Contactless payment Cinema 21.50 Stuff
3 15 Apr 2022 Contactless payment Sainsburys 176.10 Groceries
CodePudding user response:
I found this to be faster than Gam's answer, and also, in my opinion, cleaner code:
category_dict = {'Groceries':
['Tesco', 'Sainsbury'],
'Stuff':
['Dentist', 'Cinema']
}
def get_category(description):
for category, substrings in category_dict.items():
for substring in substrings:
if substring in description:
return category
df['Category'] = df['Description'].apply(get_category)
If you want the substrings as the keys, there's this:
category_dict ={'Tesco':'Groceries',
'Sainsbury':'Groceries',
'Dentist':'Stuff',
'Cinema':'Stuff'
}
def get_category(description):
for substring in category_dict:
if substring in description:
return category_dict[substring]
df['Category'] = df['Description'].apply(get_category)