I am very new to scikit-learn's PolynomialFeatures and struggling with the following use case: I have x1 and x2 as independent variables, as well as a color variable that needs to be converted to one-hot encoded variables. These independent variables will predict y (the target variable). With x1, x2, color, and y known, I need to get the coefficients and the intercept. What I want is to apply degree = 3 only to x1 (not to x2 or the one-hot encoded color variables). I found some posts using the sklearn_pandas DataFrameMapper as a solution, but due to restrictions on my corporate computer I am not able to install that package. What would be an alternative solution?
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
x1 = [28.0, 29.0, 12.0, 12.0, 42.0, 35.0, 28.0, 30.0, 32.0, 46.0, 18.0, 28.0, 28.0, 64.0, 38.0, 18.0, 49.0, 37.0, 25.0, 24.0, 42.0, 50.0, 12.0, 64.0, 23.0, 35.0, 22.0, 16.0, 44.0, 77.0, 26.0, 44.0, 38.0, 37.0, 45.0, 42.0, 24.0, 42.0, 12.0, 46.0, 12.0, 26.0, 37.0, 15.0, 67.0, 36.0, 43.0, 36.0, 45.0, 82.0, 44.0, 30.0, 33.0, 51.0, 50.0]
x2 = [0.36, 0.53, 0.45, 0.48, 0.4, 0.44, 0.44, 0.6, 0.39, 0.39, 0.29, 0.52, 0.46, 0.55, 0.62, 0.53, 0.79, 0.57, 0.49, 0.23, 0.55, 0.54, 0.44, 0.74, 0.36, 0.46, 0.37, 0.38, 0.75, 0.8, 0.43, 0.43, 0.58, 0.38, 0.63, 0.39, 0.14, 0.26, 0.14, 0.62, 0.49, 0.46, 0.49, 0.53, 0.73, 0.48, 0.5, 0.47, 0.49, 0.83, 0.56, 0.22, 0.49, 0.43, 0.46]
y = [59.5833333333333, 59.5833333333333, 10.0, 10.0, 47.0833333333333, 51.2499999999999, 34.5833333333333, 88.75, 63.7499999999999, 34.5833333333333, 51.2499999999999, 10.0, 63.7499999999999, 51.0, 59.5833333333333, 47.0833333333333, 49.5625, 43.5624999999999, 63.7499999999999, 10.0, 76.25, 47.0833333333333, 10.0, 51.2499999999999, 47.0833333333333, 10.0, 35.0, 51.2499999999999, 76.25, 100.0, 51.2499999999999, 59.5833333333333, 63.7499999999999, 76.25, 100.0, 51.2499999999999, 10.0, 22.5, 10.0, 88.75, 10.0, 59.5833333333333, 47.0833333333333, 34.5833333333333, 51.2499999999999, 63.7499999999999, 63.7499999999999, 10.0, 76.25, 62.1249999999999, 47.0833333333333, 10.0, 76.25, 47.0833333333333, 88.75]
color = ['green','red','blue','purple','black','white','orange','grey ','gold','yellow','white','orange','grey ','green','red','purple','orange','grey ','gold','yellow','white','orange','grey ','green','red','blue','black','white','orange','grey ','gold','yellow','white','orange','grey ','green','red','blue','purple','orange','grey ','gold','green','red','blue','purple','black','white','orange','grey ','gold','yellow','white','orange','grey ']
df_final = pd.DataFrame({
    'x1': x1,
    'x2': x2,
    'y': y,
    'color': color})
columns_to_category = ['color']
df_final[columns_to_category] = df_final[columns_to_category].astype('category') # change data types to category
df_final = pd.get_dummies(df_final, columns=columns_to_category) # one-hot encode the categories
col = [x for x in df_final.columns if 'color' in x]
x = df_final[['x1','x2'] + col]
y = df_final['y']
poly = PolynomialFeatures(degree = 3) # I only need to apply degree = 3 to x1, what would be the solution for this?
X_poly = poly.fit_transform(x)
lin2 = LinearRegression()
lin2.fit(X_poly, y)
intercept = lin2.intercept_ # to get the intercept
coeff = lin2.coef_ # to get the coefficient
r_squared_lin2 = lin2.score(X_poly, y)
print(r_squared_lin2)
CodePudding user response:
You can use scikit-learn's ColumnTransformer to apply the PolynomialFeatures transformer and the OneHotEncoder only to specific columns. Note that if you set include_bias=True in the PolynomialFeatures transformer then you need to set fit_intercept=False in the LinearRegression estimator, and vice versa.
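The full solution below uses include_bias=False together with fit_intercept=True. For illustration, here is a minimal sketch of the flipped configuration, assuming the same imports and data as the code below (pipeline_bias is just my name for it); the constant term is then learned as the coefficient of the bias column rather than stored in intercept_, and both configurations should produce the same fitted values:
preprocessor_bias = ColumnTransformer(
    transformers=[
        ('encoder', OneHotEncoder(sparse=False), ['color']),
        ('transformer', PolynomialFeatures(degree=3, include_bias=True), ['x1']),  # bias column included here
    ],
    remainder='passthrough'
)
pipeline_bias = Pipeline([
    ('preprocessor', preprocessor_bias),
    ('regressor', LinearRegression(fit_intercept=False))  # the bias column supplies the constant term
])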
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
x1 = [28.0, 29.0, 12.0, 12.0, 42.0, 35.0, 28.0, 30.0, 32.0, 46.0, 18.0, 28.0, 28.0, 64.0, 38.0, 18.0, 49.0, 37.0, 25.0, 24.0, 42.0, 50.0, 12.0, 64.0, 23.0, 35.0, 22.0, 16.0, 44.0, 77.0, 26.0, 44.0, 38.0, 37.0, 45.0, 42.0, 24.0, 42.0, 12.0, 46.0, 12.0, 26.0, 37.0, 15.0, 67.0, 36.0, 43.0, 36.0, 45.0, 82.0, 44.0, 30.0, 33.0, 51.0, 50.0]
x2 = [0.36, 0.53, 0.45, 0.48, 0.4, 0.44, 0.44, 0.6, 0.39, 0.39, 0.29, 0.52, 0.46, 0.55, 0.62, 0.53, 0.79, 0.57, 0.49, 0.23, 0.55, 0.54, 0.44, 0.74, 0.36, 0.46, 0.37, 0.38, 0.75, 0.8, 0.43, 0.43, 0.58, 0.38, 0.63, 0.39, 0.14, 0.26, 0.14, 0.62, 0.49, 0.46, 0.49, 0.53, 0.73, 0.48, 0.5, 0.47, 0.49, 0.83, 0.56, 0.22, 0.49, 0.43, 0.46]
y = [59.5833333333333, 59.5833333333333, 10.0, 10.0, 47.0833333333333, 51.2499999999999, 34.5833333333333, 88.75, 63.7499999999999, 34.5833333333333, 51.2499999999999, 10.0, 63.7499999999999, 51.0, 59.5833333333333, 47.0833333333333, 49.5625, 43.5624999999999, 63.7499999999999, 10.0, 76.25, 47.0833333333333, 10.0, 51.2499999999999, 47.0833333333333, 10.0, 35.0, 51.2499999999999, 76.25, 100.0, 51.2499999999999, 59.5833333333333, 63.7499999999999, 76.25, 100.0, 51.2499999999999, 10.0, 22.5, 10.0, 88.75, 10.0, 59.5833333333333, 47.0833333333333, 34.5833333333333, 51.2499999999999, 63.7499999999999, 63.7499999999999, 10.0, 76.25, 62.1249999999999, 47.0833333333333, 10.0, 76.25, 47.0833333333333, 88.75]
color = ['green','red','blue','purple','black','white','orange','grey ','gold','yellow','white','orange','grey ','green','red','purple','orange','grey ','gold','yellow','white','orange','grey ','green','red','blue','black','white','orange','grey ','gold','yellow','white','orange','grey ','green','red','blue','purple','orange','grey ','gold','green','red','blue','purple','black','white','orange','grey ','gold','yellow','white','orange','grey ']
df = pd.DataFrame({'x1': x1, 'x2': x2, 'y': y, 'color': color})
X = df[['x1', 'x2', 'color']]
y = df['y']
preprocessor = ColumnTransformer(
    transformers=[
        ('encoder', OneHotEncoder(sparse=False), ['color']),  # the sparse parameter is renamed sparse_output in scikit-learn >= 1.2
        ('transformer', PolynomialFeatures(degree=3, include_bias=False), ['x1']),
    ],
    remainder='passthrough'
)
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression(fit_intercept=True))
])
pipeline.fit(X, y)
print(pipeline.score(X, y))
# 0.5552322374079989
print(pipeline['regressor'].intercept_)
# -39.54122167504586
print(pipeline['regressor'].coef_)
# [ 2.60299525e-01 -2.18746546e+01  1.03128330e+01 -3.13760382e+00
#   1.45075308e+01 -1.90458338e+00  6.44800139e+00  2.91843209e+00
#   8.65334498e+00 -1.61836001e+01  3.67529674e+00 -5.30354716e-02
#   2.24998469e-04  3.99616163e+01]
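To see which feature each coefficient belongs to, one option (assuming scikit-learn >= 1.0, where get_feature_names_out is available) is to pull the generated feature names from the fitted preprocessor:
feature_names = pipeline['preprocessor'].get_feature_names_out()  # requires scikit-learn >= 1.0
for name, coef in zip(feature_names, pipeline['regressor'].coef_):
    print(f'{name}: {coef:.6g}')
The one-hot columns appear as encoder__color_<value>, the polynomial terms as transformer__x1, transformer__x1^2, and transformer__x1^3, and the untouched x2 as remainder__x2.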