from __future__ import print_function

from pyspark.ml import Pipeline
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession.builder.appName("DecisionTreeRegressionExample").getOrCreate()

    # Load the data stored in LIBSVM format as a DataFrame.
    data = spark.read.format("libsvm").load("../data/mllib/sample_libsvm_data.txt")
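
    # The extracted post jumps straight to the evaluator below, which uses a `predictions`
    # DataFrame that is never built here. A minimal sketch of the intermediate steps,
    # following the standard Spark decision-tree regression example:

    # Automatically identify categorical features, and index them.
    featureIndexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(data)

    # Split the data into training and test sets (30% held out for testing).
    (trainingData, testData) = data.randomSplit([0.7, 0.3])

    # Train a DecisionTree model.
    dt = DecisionTreeRegressor(featuresCol="indexedFeatures")

    # Chain indexer and tree in a Pipeline.
    pipeline = Pipeline(stages=[featureIndexer, dt])

    # Train the model; this also runs the indexer.
    model = pipeline.fit(trainingData)

    # Make predictions on the test set.
    predictions = model.transform(testData)

    # Select example rows to display.
    predictions.select("prediction", "label", "features").show(5)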

    # Select (prediction, true label) and compute the test RMSE.
    evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="rmse")
    rmse = evaluator.evaluate(predictions)
    print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)
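
    # Sketch (following the upstream example): print a summary of the fitted tree,
    # which produces the DecisionTreeRegressionModel line shown in the output below.
    treeModel = model.stages[1]
    print(treeModel)

    spark.stop()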
The results are as follows:

Root Mean Squared Error (RMSE) on test data = 0.297044
DecisionTreeRegressionModel (uid=DecisionTreeRegressor_4089a3fc367ac7a943d9) of depth 2 with 5 nodes
from __future__ import print_function

from pyspark.ml import Pipeline
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession.builder.appName("RandomForestRegressorExample").getOrCreate()

    # Load the data stored in LIBSVM format as a DataFrame.
    data = spark.read.format("libsvm").load("../data/mllib/sample_libsvm_data.txt")
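
    # As above, the steps that build `predictions` are missing from the extracted text;
    # a minimal sketch following the standard Spark random-forest regression example:

    # Automatically identify categorical features, and index them.
    featureIndexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(data)

    # Split the data into training and test sets (30% held out for testing).
    (trainingData, testData) = data.randomSplit([0.7, 0.3])

    # Train a RandomForest model.
    rf = RandomForestRegressor(featuresCol="indexedFeatures")

    # Chain indexer and forest in a Pipeline.
    pipeline = Pipeline(stages=[featureIndexer, rf])

    # Train the model; this also runs the indexer.
    model = pipeline.fit(trainingData)

    # Make predictions on the test set.
    predictions = model.transform(testData)

    # Select example rows to display.
    predictions.select("prediction", "label", "features").show(5)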

    # Select (prediction, true label) and compute the test RMSE.
    evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="rmse")
    rmse = evaluator.evaluate(predictions)
    print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)
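
    # Sketch (following the upstream example): print a summary of the fitted forest
    # and shut down the session.
    rfModel = model.stages[1]
    print(rfModel)

    spark.stop()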
from __future__ import print_function

from pyspark.ml import Pipeline
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession.builder.appName("GradientBoostedTreeRegressorExample").getOrCreate()

    # Load the data stored in LIBSVM format as a DataFrame.
    data = spark.read.format("libsvm").load("../data/mllib/sample_libsvm_data.txt")
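
    # Again, the steps that build `predictions` are missing from the extracted text;
    # a minimal sketch following the standard Spark gradient-boosted-tree regression example:

    # Automatically identify categorical features, and index them.
    featureIndexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(data)

    # Split the data into training and test sets (30% held out for testing).
    (trainingData, testData) = data.randomSplit([0.7, 0.3])

    # Train a GBT model.
    gbt = GBTRegressor(featuresCol="indexedFeatures", maxIter=10)

    # Chain indexer and GBT in a Pipeline.
    pipeline = Pipeline(stages=[featureIndexer, gbt])

    # Train the model; this also runs the indexer.
    model = pipeline.fit(trainingData)

    # Make predictions on the test set.
    predictions = model.transform(testData)

    # Select example rows to display.
    predictions.select("prediction", "label", "features").show(5)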

    # Select (prediction, true label) and compute the test RMSE.
    evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="rmse")
    rmse = evaluator.evaluate(predictions)
    print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)
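
    # Sketch (following the upstream example): print a summary of the fitted ensemble
    # and shut down the session.
    gbtModel = model.stages[1]
    print(gbtModel)

    spark.stop()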
from __future__ import print_function

from pyspark.ml.regression import AFTSurvivalRegression
from pyspark.ml.linalg import Vectors
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession.builder.appName("AFTSurvivalRegressionExample").getOrCreate()
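
    # The body of this example is missing from the extracted text; a sketch following the
    # official Spark AFTSurvivalRegression example (the toy training data and the quantile
    # probabilities below are taken from that upstream example, not from this post):
    training = spark.createDataFrame([
        (1.218, 1.0, Vectors.dense(1.560, -0.605)),
        (2.949, 0.0, Vectors.dense(0.346, 2.158)),
        (3.627, 0.0, Vectors.dense(1.380, 0.231)),
        (0.273, 1.0, Vectors.dense(0.520, 1.151)),
        (4.199, 0.0, Vectors.dense(0.795, -0.226))], ["label", "censor", "features"])

    quantileProbabilities = [0.3, 0.6]
    aft = AFTSurvivalRegression(quantileProbabilities=quantileProbabilities,
                                quantilesCol="quantiles")

    # Fit the model and print the learned parameters.
    model = aft.fit(training)
    print("Coefficients: " + str(model.coefficients))
    print("Intercept: " + str(model.intercept))
    print("Scale: " + str(model.scale))
    model.transform(training).show(truncate=False)

    spark.stop()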
from pyspark.ml.regression import IsotonicRegression
from pyspark.sql import SparkSession

# Isotonic regression example. The import and setup lines are reconstructed here so the
# snippet runs standalone; the data path follows the Spark distribution's sample data.
spark = SparkSession.builder.appName("IsotonicRegressionExample").getOrCreate()
dataset = spark.read.format("libsvm").load("../data/mllib/sample_isotonic_regression_libsvm_data.txt")

# Train an isotonic regression model and inspect the learned piecewise-constant function.
model = IsotonicRegression().fit(dataset)
print("Boundaries in increasing order: %s\n" % str(model.boundaries))
print("Predictions associated with the boundaries: %s\n" % str(model.predictions))

model.transform(dataset).show(5)

spark.stop()
The results are as follows:
Boundaries in increasing order: [0.01,0.17,0.18,0.27,0.28,0.29,0.3,0.31,0.34,0.35,0.36,0.41,0.42,0.71,0.72,0.74,0.75,0.76,0.77,0.78,0.79,0.8,0.81,0.82,0.83,0.84,0.85,0.86,0.87,0.88,0.89,1.0]
Predictions associated with the boundaries: [0.15715271294117644,0.15715271294117644,0.189138196,0.189138196,0.20040796,0.29576747,0.43396226,0.5081591025000001,0.5081591025000001,0.54156043,0.5504844466666667,0.5504844466666667,0.563929967,0.563929967,0.5660377366666667,0.5660377366666667,0.56603774,0.57929628,0.64762876,0.66241713,0.67210607,0.67210607,0.674655785,0.674655785,0.73890872,0.73992861,0.84242733,0.89673636,0.89673636,0.90719021,0.9272055075,0.9272055075]
+----------+--------------+-------------------+
|     label|      features|         prediction|
+----------+--------------+-------------------+
|0.24579296|(1,[0],[0.01])|0.15715271294117644|
|0.28505864|(1,[0],[0.02])|0.15715271294117644|
|0.31208567|(1,[0],[0.03])|0.15715271294117644|
|0.35900051|(1,[0],[0.04])|0.15715271294117644|
|0.35747068|(1,[0],[0.05])|0.15715271294117644|
+----------+--------------+-------------------+
only showing top 5 rows