ValueError: x and y must be the same size (Linear regression)-CodePudding

I'm trying to visualize my linear regression model. However, when I run it, it gives me a ValueError. I have tried different solutions and also looked at other topics with the same problem.

# Load the housing data from a comma-separated file.
df = pd.read_csv('housingmonthly.csv', sep=',')

# Select the feature columns and the target column.
X = df[['date', 'area', 'code','houses_sold', 'no_of_crimes']]
y = df['average_price']

# Re-assign X: the same feature columns, passed through pd.get_dummies.
# This overwrites the plain selection made above.
X = pd.get_dummies(df[['date', 'area', 'code', 'houses_sold', 'no_of_crimes']])

# Split features and target into train/test partitions with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Print the shapes of the four partitions.
print("Xtrain", X_train.shape, "y_train", 
      y_train.shape, "Xtest", X_test.shape, "y_test", y_test.shape)  


# NOTE(review): two model objects are created (regr here, lr below) and both
# are fitted on the same training data; presumably they are the same
# LinearRegression class reached through different imports -- confirm.
regr = linear_model.LinearRegression()

lr = LinearRegression()
lr.fit(X_train,y_train)
# Report lr.score on the training and the test partition.
print("Score on training set: {:.3f}".format(lr.score(X_train, y_train)))
print("Score on test set: {:.3f}".format(lr.score(X_test, y_test)))

regr.fit(X_train, y_train)

# Predictions for the test partition, one per row of X_test.
y_pred = regr.predict(X_test)

# X_test holds all feature columns while y_test is a single column, so the
# two arguments contain a different number of values. This is the call that
# raises "ValueError: x and y must be the same size".
plt.scatter(X_test, y_test, color="black")
plt.plot(X_test, y_pred, color="blue", linewidth=3)
# Empty tick sequences: no tick marks on either axis.
plt.xticks(())
plt.yticks(())

plt.show()

Stacktrace :

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
/var/folders/tl/80zdv_rx5sv1t7d5dgz86bzc0000gn/T/ipykernel_29101/3394670003.py in <module>
     15 print("Coefficient of determination: %.2f" % r2_score(y_test, y_pred))
         16 
    ---> 17 plt.scatter(X_test, y_test, color="black")
         18 plt.plot(X_test, y_pred, color="blue", linewidth=3)
         19 plt.Xticks(())

/opt/anaconda3/lib/python3.8/site-packages/matplotlib/pyplot.py in scatter(x, y, s, c, marker, cmap, norm, vmin, vmax, alpha, linewidths, verts, edgecolors, plotnonfinite, data, **kwargs)
   2888         verts=cbook.deprecation._deprecated_parameter,
   2889         edgecolors=None, *, plotnonfinite=False, data=None, **kwargs):
-> 2890     __ret = gca().scatter(
   2891         x, y, s=s, c=c, marker=marker, cmap=cmap, norm=norm,
   2892         vmin=vmin, vmax=vmax, alpha=alpha, linewidths=linewidths,

/opt/anaconda3/lib/python3.8/site-packages/matplotlib/__init__.py in inner(ax, data, *args, **kwargs)
   1445     def inner(ax, *args, data=None, **kwargs):
   1446         if data is None:
-> 1447             return func(ax, *map(sanitize_sequence, args), **kwargs)
   1448 
   1449         bound = new_sig.bind(ax, *args, **kwargs)

/opt/anaconda3/lib/python3.8/site-packages/matplotlib/cbook/deprecation.py in wrapper(*inner_args, **inner_kwargs)
    409                          else deprecation_addendum,
    410                 **kwargs)
--> 411         return func(*inner_args, **inner_kwargs)
    412 
    413     return wrapper

/opt/anaconda3/lib/python3.8/site-packages/matplotlib/axes/_axes.py in scatter(self, x, y, s, c, marker, cmap, norm, vmin, vmax, alpha, linewidths, verts, edgecolors, plotnonfinite, **kwargs)
   4439         y = np.ma.ravel(y)
   4440         if x.size != y.size:
-> 4441             raise ValueError("x and y must be the same size")
   4442 
   4443         if s is None:

This is the error message. I don't know how I should tackle this problem.

Thanks a lot

CodePudding user response：

I reproduced your code using the housing competition data (just to have a working example). Here is my code (I commented out the lines of your code that did not fit my data):

# Load the substitute data set used to reproduce the problem.
df = pd.read_csv('data/train.csv')

# Original column selection from the question, disabled because these columns
# are not part of this data set.
#X = df[['date', 'area', 'code','houses_sold', 'no_of_crimes']]
#y = df['average_price']
# Replacement feature and target columns from this data set.
X = df[['GarageType', 'Alley', 'LotShape']]
y = df['SalePrice']

# Re-assign X: the same feature columns, passed through pd.get_dummies.
# This overwrites the plain selection made above.
X = pd.get_dummies(df[['GarageType', 'Alley', 'LotShape']])

# Split features and target into train/test partitions with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Print the shapes of the four partitions.
print("Xtrain", X_train.shape, "y_train", 
      y_train.shape, "Xtest", X_test.shape, "y_test", y_test.shape)  


# The question's module-qualified constructor, replaced by the direct name.
#regr = linear_model.LinearRegression()
regr = LinearRegression()


# Second model instance, fitted on the same training data as regr below.
lr = LinearRegression()
lr.fit(X_train,y_train)
# Report lr.score on the training and the test partition.
print("Score on training set: {:.3f}".format(lr.score(X_train, y_train)))
print("Score on test set: {:.3f}".format(lr.score(X_test, y_test)))

regr.fit(X_train, y_train)

# Predictions for the test partition, one per row of X_test.
y_pred = regr.predict(X_test)

# Kept exactly as in the question: X_test holds all feature columns while
# y_test is a single column, so x and y differ in size and scatter raises
# "ValueError: x and y must be the same size".
plt.scatter(X_test, y_test, color="black")
plt.plot(X_test, y_pred, color="blue", linewidth=3)
# Empty tick sequences: no tick marks on either axis.
plt.xticks(())
plt.yticks(())

plt.show()

If I check the shape I get

In [6]: X_test.shape
Out[6]: (365, 12)

In [7]: y_test.shape
Out[7]: (365,)

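which are clearly not the same size: X_test contains 365 × 12 = 4380 values, while y_test contains only 365. plt.scatter requires x and y to have the same number of values, so you need a one-dimensional x that matches y_test. I guess you want to choose one column, like this: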

# Plot only the first feature column so that x has one value per entry of y_test.
plt.scatter(X_test[X_test.columns[0]], y_test, color="black")