from pyspark.ml.regression import LinearRegression
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.types import DoubleType
# Compute the Variance Inflation Factor (VIF) for each column listed in
# `names` by regressing it on all remaining columns of `trainingDF`.
# VIF = 1 / (1 - R^2) of that auxiliary regression.
for i in range(0, len(names)):
    # Cast every column to double up front (this is what the original
    # `df2[names[i]].cast(DoubleType())` line was trying to achieve) so both
    # VectorAssembler and LinearRegression receive numeric input.
    df2 = trainingDF.select(
        *[trainingDF[c].cast(DoubleType()).alias(c) for c in trainingDF.columns]
    )
    # Predictors: every column except the one currently treated as the target.
    numericCols = [c for c in df2.columns if c not in {names[i]}]
    assembler = VectorAssembler(inputCols=numericCols, outputCol="features")
    df = assembler.transform(df2)
    # LinearRegression reads the target from a column named "label" by default.
    df = df.withColumnRenamed(names[i], "label")
    lr = LinearRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)
    lrModel = lr.fit(df)
    # Score the data so a "prediction" column exists; RegressionEvaluator
    # compares "label" against "prediction" and its evaluate() takes a
    # DataFrame, never a single Column.
    predictions = lrModel.transform(df)
    evaluator = RegressionEvaluator(
        labelCol="label", predictionCol="prediction", metricName="r2"
    )
    rsq = evaluator.evaluate(predictions)
    # Guard against a perfect fit (R^2 == 1), which would divide by zero.
    vif = float("inf") if rsq == 1 else round(1 / (1 - rsq), 2)
    print(vif)
I am getting an error on the following line. I want to cast all the columns of df2 to double before evaluating:
evaluator.evaluate(df2[names[i]].cast(DoubleType()))
CodePudding user response:
According to the documentation, the method evaluate
takes a pyspark.sql.DataFrame
object as the first parameter, but you have provided a column (df2[names[i]].cast(DoubleType())
is of Column data type, not DataFrame).
evaluate(dataset: pyspark.sql.dataframe.DataFrame, params: Optional[ParamMap] = None) -> float

Evaluates the output with optional parameters.

New in version 1.4.0.

Parameters:
    dataset : pyspark.sql.DataFrame
        a dataset that contains labels/observations and predictions
    params : dict, optional
        an optional param map that overrides embedded params

Returns:
    float
        metric