This error keeps coming when I try to find MI values. My code is as follows
X_new = X.copy()
X_new = X_new.fillna(0)
y = data.SalePrice
def make_mi_scores(X, y):
X = X.copy()
for colname in X.select_dtypes(["object", "category"]):
X[colname], _ = X[colname].factorize()
discrete_features = [pd.api.types.is_integer_dtype(t) for t in X.dtypes]
mi_scores = mutual_info_regression(X, y, discrete_features=discrete_features, random_state=0)
mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.columns)
mi_scores = mi_scores.sort_values(ascending=False)
return mi_scores
def plot_mi_scores(scores):
scores = scores.sort_values(ascending=True)
width = np.arange(len(scores))
ticks = list(scores.index)
plt.barh(width, scores)
plt.yticks(width, ticks)
plt.title("Mutual Information Scores")
plt.figure(dpi=100, figsize=(8, 5))
plot_mi_scores(make_mi_scores(X_new,y))
if you want the full notebook here is a link https://www.kaggle.com/code/snigdhkarki/house-price-competition
The error is as follows
ValueError Traceback (most recent call last)
/tmp/ipykernel_19/1575243112.py in <module>
42
43 plt.figure(dpi=100, figsize=(8, 5))
---> 44 plot_mi_scores(make_mi_scores(X_new,y))
/tmp/ipykernel_19/1575243112.py in make_mi_scores(X, y)
28 print(X.isnull().any().any())
29 print(y.isnull().any().any())
---> 30 mi_scores = mutual_info_regression(X, y, discrete_features=discrete_features, random_state=0)
31 mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.columns)
32 mi_scores = mi_scores.sort_values(ascending=False)
/opt/conda/lib/python3.7/site-packages/sklearn/feature_selection/_mutual_info.py in mutual_info_regression(X, y, discrete_features, n_neighbors, copy, random_state)
382 of a Random Vector", Probl. Peredachi Inf., 23:2 (1987), 9-16
383 """
--> 384 return _estimate_mi(X, y, discrete_features, False, n_neighbors, copy, random_state)
385
386
/opt/conda/lib/python3.7/site-packages/sklearn/feature_selection/_mutual_info.py in _estimate_mi(X, y, discrete_features, discrete_target, n_neighbors, copy, random_state)
300 mi = [
301 _compute_mi(x, y, discrete_feature, discrete_target, n_neighbors)
--> 302 for x, discrete_feature in zip(_iterate_columns(X), discrete_mask)
303 ]
304
/opt/conda/lib/python3.7/site-packages/sklearn/feature_selection/_mutual_info.py in <listcomp>(.0)
300 mi = [
301 _compute_mi(x, y, discrete_feature, discrete_target, n_neighbors)
--> 302 for x, discrete_feature in zip(_iterate_columns(X), discrete_mask)
303 ]
304
/opt/conda/lib/python3.7/site-packages/sklearn/feature_selection/_mutual_info.py in _compute_mi(x, y, x_discrete, y_discrete, n_neighbors)
160 return mutual_info_score(x, y)
161 elif x_discrete and not y_discrete:
--> 162 return _compute_mi_cd(y, x, n_neighbors)
163 elif not x_discrete and y_discrete:
164 return _compute_mi_cd(x, y, n_neighbors)
/opt/conda/lib/python3.7/site-packages/sklearn/feature_selection/_mutual_info.py in _compute_mi_cd(c, d, n_neighbors)
137 radius = radius[mask]
138
--> 139 kd = KDTree(c)
140 m_all = kd.query_radius(c, radius, count_only=True, return_distance=False)
141 m_all = np.array(m_all) - 1.0
sklearn/neighbors/_binary_tree.pxi in sklearn.neighbors._kd_tree.BinaryTree.__init__()
/opt/conda/lib/python3.7/site-packages/sklearn/utils/validation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator)
806 "Found array with %d sample(s) (shape=%s) while a"
807 " minimum of %d is required%s."
--> 808 % (n_samples, array.shape, ensure_min_samples, context)
809 )
810
ValueError: Found array with 0 sample(s) (shape=(0, 1)) while a minimum of 1 is required.
There were only a few places where this question was asked and even in those places I was unable to find any answers to my question
CodePudding user response:
The issue is arising where you are calling mutual_info_regression over here -
mi_scores = mutual_info_regression(X, y, discrete_features=discrete_features, random_state=0)
As per sklearn's documentation, the parameter discrete_features should be a boolean mask that has True for discrete variables and False otherwise.
I checked your Kaggle code and it seems like your technique for identifying discrete and continuous features in your data frame is wrong.
A simple hack to get the code running would be to identify all features as continuous using the following code -
discrete_features = [False]*73
# 73 is the number of columns X has
However, the result might be wrong if the mutual_info_regression algorithm requires you to accurately identify discrete and continuous features.