I'm trying to visualize my linear regression model. However, when I run the code, it raises a ValueError. I have tried different solutions and also looked at other topics with the same problem.
# Load the housing data, fit a linear regression on it and try to plot the test predictions.
df = pd.read_csv('housingmonthly.csv', sep=',')
# NOTE: this X is overwritten by the get_dummies() call two lines below, so this assignment has no effect.
X = df[['date', 'area', 'code','houses_sold', 'no_of_crimes']]
y = df['average_price']
# Build the feature matrix from the five selected columns via get_dummies().
# Presumably 'date', 'area' and 'code' are string columns that get expanded into
# indicator columns -- confirm the dtypes. Either way X is a multi-column DataFrame.
X = pd.get_dummies(df[['date', 'area', 'code', 'houses_sold', 'no_of_crimes']])
# Hold out a test split; random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
print("Xtrain", X_train.shape, "y_train",
y_train.shape, "Xtest", X_test.shape, "y_test", y_test.shape)
# Two separate models are fitted on the same training data:
# `lr` is used for the scores, `regr` for the predictions that get plotted.
# NOTE(review): linear_model.LinearRegression and LinearRegression presumably
# refer to the same scikit-learn class -- confirm against the imports.
regr = linear_model.LinearRegression()
lr = LinearRegression()
lr.fit(X_train,y_train)
print("Score on training set: {:.3f}".format(lr.score(X_train, y_train)))
print("Score on test set: {:.3f}".format(lr.score(X_test, y_test)))
regr.fit(X_train, y_train)
y_pred = regr.predict(X_test)
# X_test is 2-D (one column per feature) while y_test is 1-D, so this call raises
# "ValueError: x and y must be the same size" (see the traceback below).
plt.scatter(X_test, y_test, color="black")
# The same multi-column X_test is used as the x values of the prediction line.
plt.plot(X_test, y_pred, color="blue", linewidth=3)
# Empty tick sequences hide the tick marks on both axes.
plt.xticks(())
plt.yticks(())
plt.show()
Stacktrace :
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
/var/folders/tl/80zdv_rx5sv1t7d5dgz86bzc0000gn/T/ipykernel_29101/3394670003.py in <module>
15 print("Coefficient of determination: %.2f" % r2_score(y_test, y_pred))
16
---> 17 plt.scatter(X_test, y_test, color="black")
18 plt.plot(X_test, y_pred, color="blue", linewidth=3)
19 plt.Xticks(())
/opt/anaconda3/lib/python3.8/site-packages/matplotlib/pyplot.py in scatter(x, y, s, c, marker, cmap, norm, vmin, vmax, alpha, linewidths, verts, edgecolors, plotnonfinite, data, **kwargs)
2888 verts=cbook.deprecation._deprecated_parameter,
2889 edgecolors=None, *, plotnonfinite=False, data=None, **kwargs):
-> 2890 __ret = gca().scatter(
2891 x, y, s=s, c=c, marker=marker, cmap=cmap, norm=norm,
2892 vmin=vmin, vmax=vmax, alpha=alpha, linewidths=linewidths,
/opt/anaconda3/lib/python3.8/site-packages/matplotlib/__init__.py in inner(ax, data, *args, **kwargs)
1445 def inner(ax, *args, data=None, **kwargs):
1446 if data is None:
-> 1447 return func(ax, *map(sanitize_sequence, args), **kwargs)
1448
1449 bound = new_sig.bind(ax, *args, **kwargs)
/opt/anaconda3/lib/python3.8/site-packages/matplotlib/cbook/deprecation.py in wrapper(*inner_args, **inner_kwargs)
409 else deprecation_addendum,
410 **kwargs)
--> 411 return func(*inner_args, **inner_kwargs)
412
413 return wrapper
/opt/anaconda3/lib/python3.8/site-packages/matplotlib/axes/_axes.py in scatter(self, x, y, s, c, marker, cmap, norm, vmin, vmax, alpha, linewidths, verts, edgecolors, plotnonfinite, **kwargs)
4439 y = np.ma.ravel(y)
4440 if x.size != y.size:
-> 4441 raise ValueError("x and y must be the same size")
4442
4443 if s is None:
This is the error message. I don't know how I should tackle this problem.
Thanks a lot
CodePudding user response:
I reproduced your code using the housing competition data (just to have a working example). Here is my code (I commented out the lines of your code that did not fit my data):
# Reproduction of the question's script on a different housing data set.
df = pd.read_csv('data/train.csv')
#X = df[['date', 'area', 'code','houses_sold', 'no_of_crimes']]
#y = df['average_price']
# NOTE: this X is overwritten by the get_dummies() call below, so this assignment has no effect.
X = df[['GarageType', 'Alley', 'LotShape']]
y = df['SalePrice']
# get_dummies() turns the three selected columns into a multi-column feature matrix
# (X_test ends up with 12 columns, as the shape check after this script shows).
X = pd.get_dummies(df[['GarageType', 'Alley', 'LotShape']])
# Hold out a test split; random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
print("Xtrain", X_train.shape, "y_train",
y_train.shape, "Xtest", X_test.shape, "y_test", y_test.shape)
#regr = linear_model.LinearRegression()
# Two separate models are fitted on the same training data:
# `lr` is used for the scores, `regr` for the predictions that get plotted.
regr = LinearRegression()
lr = LinearRegression()
lr.fit(X_train,y_train)
print("Score on training set: {:.3f}".format(lr.score(X_train, y_train)))
print("Score on test set: {:.3f}".format(lr.score(X_test, y_test)))
regr.fit(X_train, y_train)
y_pred = regr.predict(X_test)
# Left unchanged on purpose: X_test is 2-D and y_test is 1-D, which is the size
# mismatch behind the question's "x and y must be the same size" ValueError.
plt.scatter(X_test, y_test, color="black")
# The same multi-column X_test is used as the x values of the prediction line.
plt.plot(X_test, y_pred, color="blue", linewidth=3)
# Empty tick sequences hide the tick marks on both axes.
plt.xticks(())
plt.yticks(())
plt.show()
If I check the shape I get
In [6]: X_test.shape
Out[6]: (365, 12)
In [7]: y_test.shape
Out[7]: (365,)
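These are clearly not the same size: X_test has 12 columns, while y_test is one-dimensional. plt.scatter needs x and y to contain the same number of elements, so X_test has to be reduced to a single dimension as well. I guess you want to choose one column, like this: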
# Plot a single feature column (here the first one) so that x and y have the same size.
first_column = X_test.columns[0]
plt.scatter(X_test[first_column], y_test, color="black")