Home > Software engineering >  Remove nan from pandas binner
Remove nan from pandas binner

Time:07-25

I have created the following pandas dataframe called train:

import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
import scipy.stats as stats

ds = {

    'matchKey' : [621062,   622750, 623508, 626451, 626611, 626796, 627114, 630055, 630225],
    'og_max_last_dpd' : [10,    10, -99999, 10, 10, 10, 10, 10, 10],
    'og_min_last_dpd' : [10,    10, -99999, 10, 10, 10, 10, 10, 10],
    'og_max_max_dpd' : [0,  0,  -99999, 1,  0,  5,  0,  4,  0],
    'Target':[1,0,1,0,0,1,1,1,0]

}


train = pd.DataFrame(data=ds)

The dataframe looks like this:

print(train)

   matchKey  og_max_last_dpd  og_min_last_dpd  og_max_max_dpd  Target
0    621062               10               10               0       1
1    622750               10               10               0       0
2    623508           -99999           -99999          -99999       1
3    626451               10               10               1       0
4    626611               10               10               0       0
5    626796               10               10               5       1
6    627114               10               10               0       1
7    630055               10               10               4       1
8    630225               10               10               0       0

I have then binned the column called og_max_max_dpd using this code:

def mono_bin(Y, X, char, n=20):
    X2 = X.fillna(-99999)
    r = 0
    while np.abs(r) < 1:
        d1 = pd.DataFrame({"X": X2, "Y": Y, "Bucket": pd.qcut(X2, n, duplicates="drop")})#,include_lowest=True
        d2 = d1.groupby("Bucket", as_index=True)
        r, p = stats.spearmanr(d2.mean().X, d2.mean().Y)
        n = n - 1
    d3 = pd.DataFrame(d2.min().X, columns=["min_"   X.name])
    d3["max_"   X.name] = d2.max().X
    d3[Y.name] = d2.sum().Y
    d3["total"] = d2.count().Y
    d3[Y.name   "_rate"] = d2.mean().Y
    d4 = (d3.sort_values(by="min_"   X.name)).reset_index(drop=True)
    # print("=" * 85)
    # print(d4)
    ninf = float("-inf")
    pinf = float(" inf")

    array = []
    for i in range(len(d4) - 1):
        array.append(d4["max_"   char].iloc[i])

    return [ninf]   array   [pinf]

binner = mono_bin(train['Target'], train['og_max_max_dpd'], 'og_max_max_dpd')

I have printed out the binner which looks like this:

print(binner)

[-inf, -99999.0, nan, 0.0, nan, nan, 1.0, nan, nan, 4.0, nan, inf]

I want to remove the nan from that list so that the binner looks like this:

[-inf, -99999.0, 0.0, 1.0, 4.0, inf]

Does anyone know how to remove the nan?

CodePudding user response:

You can simply use dropna to remove it from d4:

...
d3[Y.name   "_rate"] = d2.mean().Y
d4 = (d3.sort_values(by="min_"   X.name)).reset_index(drop=True)
d4.dropna(inplace=True)
# print("=" * 85)
# print(d4)
ninf = float("-inf")
...
  • Related