Found array with 0 sample(s) (shape=(0, 1)) while a minimum of 1 is required-CodePudding

This error keeps coming when I try to find MI values. My code is as follows

X_new = X.copy()
X_new = X_new.fillna(0)
y = data.SalePrice


def make_mi_scores(X, y):
    X = X.copy()
    for colname in X.select_dtypes(["object", "category"]):
        X[colname], _ = X[colname].factorize()       
    discrete_features = [pd.api.types.is_integer_dtype(t) for t in X.dtypes]   
    mi_scores = mutual_info_regression(X, y, discrete_features=discrete_features, random_state=0)
    mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.columns)
    mi_scores = mi_scores.sort_values(ascending=False)
    return mi_scores

def plot_mi_scores(scores):
    scores = scores.sort_values(ascending=True)
    width = np.arange(len(scores))
    ticks = list(scores.index)
    plt.barh(width, scores)
    plt.yticks(width, ticks)
    plt.title("Mutual Information Scores")
    
plt.figure(dpi=100, figsize=(8, 5))
plot_mi_scores(make_mi_scores(X_new,y))

if you want the full notebook here is a link https://www.kaggle.com/code/snigdhkarki/house-price-competition

The error is as follows

    ValueError                                Traceback (most recent call last)
/tmp/ipykernel_19/1575243112.py in <module>
     42 
     43 plt.figure(dpi=100, figsize=(8, 5))
---> 44 plot_mi_scores(make_mi_scores(X_new,y))

/tmp/ipykernel_19/1575243112.py in make_mi_scores(X, y)
     28     print(X.isnull().any().any())
     29     print(y.isnull().any().any())
---> 30     mi_scores = mutual_info_regression(X, y, discrete_features=discrete_features, random_state=0)
     31     mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.columns)
     32     mi_scores = mi_scores.sort_values(ascending=False)

/opt/conda/lib/python3.7/site-packages/sklearn/feature_selection/_mutual_info.py in mutual_info_regression(X, y, discrete_features, n_neighbors, copy, random_state)
    382            of a Random Vector", Probl. Peredachi Inf., 23:2 (1987), 9-16
    383     """
--> 384     return _estimate_mi(X, y, discrete_features, False, n_neighbors, copy, random_state)
    385 
    386 

/opt/conda/lib/python3.7/site-packages/sklearn/feature_selection/_mutual_info.py in _estimate_mi(X, y, discrete_features, discrete_target, n_neighbors, copy, random_state)
    300     mi = [
    301         _compute_mi(x, y, discrete_feature, discrete_target, n_neighbors)
--> 302         for x, discrete_feature in zip(_iterate_columns(X), discrete_mask)
    303     ]
    304 

/opt/conda/lib/python3.7/site-packages/sklearn/feature_selection/_mutual_info.py in <listcomp>(.0)
    300     mi = [
    301         _compute_mi(x, y, discrete_feature, discrete_target, n_neighbors)
--> 302         for x, discrete_feature in zip(_iterate_columns(X), discrete_mask)
    303     ]
    304 

/opt/conda/lib/python3.7/site-packages/sklearn/feature_selection/_mutual_info.py in _compute_mi(x, y, x_discrete, y_discrete, n_neighbors)
    160         return mutual_info_score(x, y)
    161     elif x_discrete and not y_discrete:
--> 162         return _compute_mi_cd(y, x, n_neighbors)
    163     elif not x_discrete and y_discrete:
    164         return _compute_mi_cd(x, y, n_neighbors)

/opt/conda/lib/python3.7/site-packages/sklearn/feature_selection/_mutual_info.py in _compute_mi_cd(c, d, n_neighbors)
    137     radius = radius[mask]
    138 
--> 139     kd = KDTree(c)
    140     m_all = kd.query_radius(c, radius, count_only=True, return_distance=False)
    141     m_all = np.array(m_all) - 1.0

sklearn/neighbors/_binary_tree.pxi in sklearn.neighbors._kd_tree.BinaryTree.__init__()

/opt/conda/lib/python3.7/site-packages/sklearn/utils/validation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator)
    806                 "Found array with %d sample(s) (shape=%s) while a"
    807                 " minimum of %d is required%s."
--> 808                 % (n_samples, array.shape, ensure_min_samples, context)
    809             )
    810 

ValueError: Found array with 0 sample(s) (shape=(0, 1)) while a minimum of 1 is required.

There were only a few places where this question was asked and even in those places I was unable to find any answers to my question

CodePudding user response：

The issue is arising where you are calling mutual_info_regression over here -

mi_scores = mutual_info_regression(X, y, discrete_features=discrete_features, random_state=0)

As per sklearn's documentation, the parameter discrete_features should be a boolean mask that has True for discrete variables and False otherwise.

I checked your Kaggle code and it seems like your technique for identifying discrete and continuous features in your data frame is wrong.

A simple hack to get the code running would be to identify all features as continuous using the following code -

discrete_features = [False]*73
# 73 is the number of columns X has

However, the result might be wrong if the mutual_info_regression algorithm requires you to accurately identify discrete and continuous features.