I am writing a program to discretize a set of attributes via entropy discretization. The goal is to parse the dataset
A,Class
5,1
12.5,1
11.5,2
8.6,2
7,1
6,1
5.9,2
1.5,2
9,2
7.8,1
2.1,1
13.5,2
12.45,2
Into
A,Class
1,1
3,1
3,2
2,2
2,1
2,1
1,2
1,2
3,2
2,1
1,1
3,2
3,2
The specific problem that I am facing with my program is determining the frequencies of 1 and 2 in the class column.
# NOTE(review): in the full program below `s` comes from pd.read_csv, so
# s['Class'] selects a single column (a Series), not a DataFrame.
df = s['Class']
# NOTE(review): presumably this is the failing line -- a Series has no 'Class'
# column left to group by or select; confirm against the actual traceback.
df['freq'] = df.groupby('Class')['Class'].transform('count')
print("*****************")
print(df['freq'])
I would like to use a pandas method to return the frequency of 1 and 2 so that I can calculate probabilities p1 and p2.
import pandas as pd
import numpy as np
import entropy_based_binning as ebb
from math import log2
def main():
    """Load ``S1.csv`` and run the entropy discretization on it."""
    dataset = pd.read_csv('S1.csv')
    # The result is not used any further, so it is not kept.
    entropy_discretization(dataset)
# This method discretizes attribute A of s.
# A partition stops splitting when either
#   * the information gain is 0, i.e. the number of distinct classes is 1, or
#   * min f / max f < 0.5 and the number of distinct values is floor(n/2).
def entropy_discretization(s, threshold=6):
    """Split ``s`` on attribute ``A`` at ``threshold`` and report the split's score.

    Args:
        s: DataFrame with at least the columns ``A`` and ``Class``.
        threshold: Cut point on ``A``. Rows with ``A < threshold`` go to the
            first partition, rows with ``A >= threshold`` to the second.
            Defaults to 6, the value that used to be hard-coded.

    Returns:
        None. Only a single split is evaluated and printed so far; choosing
        the best threshold and keeping its partitions is not implemented yet.
    """
    # Step 1: the threshold is supplied by the caller (default 6).
    # Step 2: partition the data set into two partitions.
    s1 = s[s['A'] < threshold]
    print("s1 after spitting")
    print(s1)
    print("******************")
    s2 = s[s['A'] >= threshold]
    print("s2 after spitting")
    print(s2)
    print("******************")
    # Step 3: calculate the information gain.
    informationGain = information_gain(s1, s2, s)
    print(informationGain)
    # TODO: Step 5 -- repeat for every candidate threshold (while
    #       uniqueValue(s) holds) and pick the best score.
    # TODO: Step 6 -- keep the partitions of s that belong to the best
    #       threshold (see bestPartition).
def uniqueValue(s):
    """Return True while attribute ``A`` of ``s`` can still be split.

    Args:
        s: DataFrame with a column ``A``.

    Returns:
        True when ``A`` holds more than one distinct value, False otherwise.
        An empty partition (zero distinct values) also returns False, so a
        ``while uniqueValue(s)`` loop stops on it instead of running on.
    """
    # Only column A matters, so count its distinct values directly.
    return bool(s['A'].nunique() > 1)
def bestPartition(maxInformationGain):
    """Placeholder for choosing the best threshold; currently returns None.

    ``maxInformationGain`` is accepted but not used yet.
    """
    # Determine the best threshold_i (hard-coded placeholder for now).
    placeholder_threshold = 6
    return None
def information_gain(s1, s2, s):
    """Return the size-weighted entropy of splitting ``s`` into ``s1`` and ``s2``.

    The value is ``(|s1| / |s|) * entropy(s1) + (|s2| / |s|) * entropy(s2)``,
    where ``|x|`` is the number of records of ``x`` that have a value in
    column ``A``.

    Args:
        s1: Partition of ``s`` below the threshold.
        s2: Partition of ``s`` at or above the threshold.
        s: The data set that was split.

    Returns:
        The weighted entropy as a float; 0.0 when ``s`` has no records.
    """
    # Weights are based on record counts, not on the number of distinct
    # values of A, so duplicated A values carry their full weight.
    cardinalityS1 = s1['A'].count()
    print(f'The Cardinality of s1 is: {cardinalityS1}')
    cardinalityS2 = s2['A'].count()
    print(f'The Cardinality of s2 is: {cardinalityS2}')
    cardinalityS = s['A'].count()
    print(f'The Cardinality of s is: {cardinalityS}')
    if cardinalityS == 0:
        # Nothing to weigh; avoid dividing by zero.
        informationGain = 0.0
    else:
        # The two weighted terms are added; the original line lacked the '+'
        # and ended up calling a float.
        informationGain = (
            (cardinalityS1 / cardinalityS) * entropy(s1)
            + (cardinalityS2 / cardinalityS) * entropy(s2)
        )
    print(f'The total informationGain is: {informationGain}')
    return informationGain
def entropy(s):
    """Return the base-2 Shannon entropy of the ``Class`` column of ``s``.

    Args:
        s: DataFrame with a column ``Class``.

    Returns:
        ``-sum(p_i * log2(p_i))`` over the classes present in ``s``, where
        ``p_i`` is the relative frequency of class ``i``. 0.0 for an empty
        frame or a frame with a single class.
    """
    class_column = s['Class']
    # Number of distinct classes in s.
    numberOfClasses = class_column.nunique()
    print(f'Number of classes: {numberOfClasses}')
    # Absolute frequency of every class, indexed by class label.
    value_counts = class_column.value_counts()
    print(f'value_counts : {value_counts}')
    # Relative frequency p_i of every class (frequency / number of records).
    probabilities = class_column.value_counts(normalize=True)
    print("*****************")
    print(probabilities)
    # Skip zero probabilities: log2(0) is undefined and the term is 0 anyway.
    ent = sum((-p * log2(p) for p in probabilities if p > 0), 0.0)
    return float(ent)
Ideally, I'd like to print `Number of classes: 2`. This way I can loop over the classes and calculate the frequencies for the attribute `Class` from the dataset. I've reviewed the pandas documentation, but I got stuck trying to count the frequency of 1 and 2 from the class section. Any help would be greatly appreciated.
CodePudding user response:
Use `value_counts`:
>>> df.value_counts('Class')
Class
2 7
1 6
dtype: int64
Update:
How do I get the individual frequencies returned from the `value_counts` method?
counts = df.value_counts('Class')
print(counts[1]) # Freq of 1
6
print(counts[2]) # Freq of 2
7