引き続き、とりあえず機械学習。
今回は決定木による回帰を試します。
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.feature.StringIndexer
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.regression.DecisionTreeRegressor
// Decision-tree regression pipeline predicting "price".
// Expects `trainDF` and `testDF` to be DataFrames defined earlier in the session.
// Each statement is kept on a single line so the script can be pasted into spark-shell as-is.

// String-typed columns are treated as categorical features.
val categoricalCols = trainDF.dtypes.collect { case (field, "StringType") => field }
// One indexed output column per categorical column, e.g. "foo" -> "fooindex".
val indexOutputCols = categoricalCols.map(_ + "index")
// Maps each category string to a numeric index; "skip" drops rows whose category was not seen during fit.
val stringIndexer = new StringIndexer().setInputCols(categoricalCols).setOutputCols(indexOutputCols).setHandleInvalid("skip")

// Double-typed columns are the numeric features; the label column "price" is excluded.
val numericCols = trainDF.dtypes.collect { case (field, "DoubleType") if field != "price" => field }

// Fix: assemble the *indexed* categorical columns, not the raw string columns.
// VectorAssembler does not accept StringType inputs, so passing `categoricalCols`
// made pipeline.fit fail and left the StringIndexer output unused.
val assemblerInputs = indexOutputCols ++ numericCols
val vecAssembler = new VectorAssembler().setInputCols(assemblerInputs).setOutputCol("features")

// maxBins is raised from Spark's default of 32 to 40.
// NOTE(review): presumably some categorical column has more than 32 distinct values
// (maxBins must be at least the largest category count) -- confirm against the data.
val dt = new DecisionTreeRegressor().setLabelCol("price").setMaxBins(40)

// Stage order matters: index strings -> assemble feature vector -> fit the tree.
val stages = Array(stringIndexer, vecAssembler, dt)
val pipeline = new Pipeline().setStages(stages)

val pipelineModel = pipeline.fit(trainDF)
val predDF = pipelineModel.transform(testDF)

// Root-mean-squared error of the "prediction" column against "price" on the test set.
val regressionEvaluator = new RegressionEvaluator().setPredictionCol("prediction").setLabelCol("price").setMetricName("rmse")
val rmse = regressionEvaluator.evaluate(predDF)