
regression

December 27, 2021

[2]: from pyspark.sql import SparkSession


spark = SparkSession.builder.appName("Python Spark regression example") \
    .config("spark.some.config.option", "some-value").getOrCreate()

[3]: df = spark.read.format('csv') \
    .options(header='true', inferSchema='true').load("data.csv")

[4]: import pandas as pd


pd.DataFrame(df.take(3), columns=df.columns)

[4]:       TV  Radio  Newspaper  Sales
     0  230.1   37.8       69.2   22.1
     1   44.5   39.3       45.1   10.4
     2   17.2   45.9       69.3    9.3

[5]: df.describe().toPandas()

[5]:   summary                  TV               Radio           Newspaper               Sales
     0   count                 200                 200                 200                 200
     1    mean            147.0425  23.264000000000024  30.553999999999995  14.022500000000003
     2  stddev   85.85423631490805  14.846809176168728   21.77862083852283   5.217456565710477
     3     min                 0.7                 0.0                 0.3                 1.6
     4     max               296.4                49.6               114.0                27.0

[6]: df.printSchema()

root

|-- TV: double (nullable = true)
|-- Radio: double (nullable = true)
|-- Newspaper: double (nullable = true)
|-- Sales: double (nullable = true)

[7]: df.show(5)

+-----+-----+---------+-----+
| TV|Radio|Newspaper|Sales|
+-----+-----+---------+-----+
|230.1| 37.8| 69.2| 22.1|
| 44.5| 39.3| 45.1| 10.4|
| 17.2| 45.9| 69.3| 9.3|
|151.5| 41.3| 58.5| 18.5|
|180.8| 10.8| 58.4| 12.9|
+-----+-----+---------+-----+
only showing top 5 rows
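
The next cell calls a helper transData that is not defined anywhere in this
notebook. A minimal sketch consistent with the (features, label) output it
produces, packing every column except the last into a dense feature vector,
could look like this (the exact implementation is an assumption):

from pyspark.ml.linalg import Vectors

# Assumed helper (definition missing from the notebook): collect all columns
# except the last into a dense feature vector, keep the last as the label.
def transData(data):
    return data.rdd.map(lambda row: [Vectors.dense(row[:-1]), row[-1]]) \
                   .toDF(['features', 'label'])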

[32]: transformed = transData(df)


transformed.show(5)

+-----------------+-----+
| features|label|
+-----------------+-----+
|[230.1,37.8,69.2]| 22.1|
| [44.5,39.3,45.1]| 10.4|
| [17.2,45.9,69.3]| 9.3|
|[151.5,41.3,58.5]| 18.5|
|[180.8,10.8,58.4]| 12.9|
+-----------------+-----+
only showing top 5 rows

[34]: from pyspark.ml import Pipeline


from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator

# Automatically identify categorical features and index them.
# maxCategories=4 means features with more than 4 distinct values are
# treated as continuous.
featureIndexer = VectorIndexer(inputCol="features",
                               outputCol="indexedFeatures",
                               maxCategories=4).fit(transformed)

data = featureIndexer.transform(transformed)
data.show(5, True)

+-----------------+-----+-----------------+
| features|label| indexedFeatures|
+-----------------+-----+-----------------+
|[230.1,37.8,69.2]| 22.1|[230.1,37.8,69.2]|
| [44.5,39.3,45.1]| 10.4| [44.5,39.3,45.1]|
| [17.2,45.9,69.3]| 9.3| [17.2,45.9,69.3]|
|[151.5,41.3,58.5]| 18.5|[151.5,41.3,58.5]|
|[180.8,10.8,58.4]| 12.9|[180.8,10.8,58.4]|
+-----------------+-----+-----------------+
only showing top 5 rows

Every feature here takes more than maxCategories = 4 distinct values, so the
indexer treats them all as continuous and indexedFeatures is identical to
features.

[35]: # Split the data into training and test sets (20% held out for testing)
(trainingData, testData) = transformed.randomSplit([0.8, 0.2])
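
randomSplit draws randomly on each run, so the exact split (and the numbers
below) will vary between executions. Passing a seed makes the split
reproducible; the seed value here is an arbitrary example:

# Reproducible variant of the split above; the seed value is arbitrary.
(trainingData, testData) = transformed.randomSplit([0.8, 0.2], seed=1234)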

[36]: # Import LinearRegression class


from pyspark.ml.regression import LinearRegression

# Define LinearRegression algorithm


lr = LinearRegression()

[38]: import warnings


warnings.filterwarnings('ignore')
# Chain indexer and linear regression in a Pipeline
pipeline = Pipeline(stages=[featureIndexer, lr])

model = pipeline.fit(trainingData)

21/12/14 14:37:13 WARN Instrumentation: [8d638690] regParam is zero, which
might cause numerical instability and overfitting.
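
The warning is harmless here: regParam defaults to 0, so this is a plain
ordinary-least-squares fit. If L2 regularization is wanted, it can be set on
the estimator; the value below is an arbitrary example, not tuned:

# Optional: ridge-style (L2) regularization silences the warning.
# regParam=0.01 is an arbitrary example value, not tuned for this data.
lr = LinearRegression(regParam=0.01, elasticNetParam=0.0)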

[39]: def modelsummary(model):
          # Print estimates, standard errors, t values and p-values
          # for a fitted LinearRegressionModel.
          import numpy as np
          print("Note: the last row is the information for the intercept")
          print("##", "-------------------------------------------------")
          print("##", " Estimate | Std.Error | t Values | P-value")
          coef = np.append(list(model.coefficients), model.intercept)
          Summary = model.summary

          for i in range(len(Summary.pValues)):
              print("##", '{:10.6f}'.format(coef[i]),
                    '{:10.6f}'.format(Summary.coefficientStandardErrors[i]),
                    '{:8.3f}'.format(Summary.tValues[i]),
                    '{:10.6f}'.format(Summary.pValues[i]))

          print("##", '---')
          print("##", "Mean squared error: % .6f" % Summary.meanSquaredError,
                ", RMSE: % .6f" % Summary.rootMeanSquaredError)
          print("##", "Multiple R-squared: %f" % Summary.r2,
                ", Total iterations: %i" % Summary.totalIterations)

[40]: modelsummary(model.stages[-1])

Note: the last row is the information for the intercept


## -------------------------------------------------
## Estimate | Std.Error | t Values | P-value
## 0.044758 0.001555 28.783 0.000000
## 0.186763 0.009541 19.575 0.000000
## 0.006556 0.007003 0.936 0.350575
## 2.921133 0.343975 8.492 0.000000
## ---
## Mean squared error: 2.828389 , RMSE: 1.681782
## Multiple R-squared: 0.897012 , Total iterations: 0

(Total iterations is 0 because, with regParam = 0 and only three features,
Spark's default "auto" solver fits the model by solving the normal equations
directly rather than iterating.)

[41]: # Make predictions.


predictions = model.transform(testData)
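
To spot-check individual predictions before computing aggregate metrics, the
predictions DataFrame can be displayed directly (an extra step not in the
original notebook):

# Show predicted vs. actual values for a handful of test rows.
predictions.select("features", "label", "prediction").show(5)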

[42]: from pyspark.ml.evaluation import RegressionEvaluator


# Select (prediction, true label) and compute test error
evaluator = RegressionEvaluator(labelCol="label",
                                predictionCol="prediction",
                                metricName="rmse")

rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

Root Mean Squared Error (RMSE) on test data = 1.66064

[43]: y_true = predictions.select("label").toPandas()


y_pred = predictions.select("prediction").toPandas()

import sklearn.metrics
r2_score = sklearn.metrics.r2_score(y_true, y_pred)

print('r2_score: {0}'.format(r2_score))

r2_score: 0.8900904334948799
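
The same score can also be computed without the pandas round-trip by reusing
Spark's RegressionEvaluator with the r2 metric (an alternative sketch, not in
the original notebook):

# Alternative: compute R^2 directly in Spark, avoiding the conversion to pandas.
r2_evaluator = RegressionEvaluator(labelCol="label",
                                   predictionCol="prediction",
                                   metricName="r2")
print("r2 on test data = %g" % r2_evaluator.evaluate(predictions))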

