Learning models using the MLlib library II¶
The goal of this exercise is to learn how to work with the MLlib library for creating and validating machine learning models.
In particular, it demonstrates the use of clustering models and MLlib Pipelines.
At the beginning, as in the previous notebooks, we will configure the environment and the Spark application.
!pip install findspark
!pip install pyspark
# create spark app
import findspark
findspark.init()
import pyspark
# import the SparkSession type from the 'pyspark.sql' module
from pyspark.sql import SparkSession
# create the 'spark' object and set the application name via the 'appName' parameter (in a distributed environment
# multiple applications can run simultaneously and need to be named so that we can distinguish them)
spark = SparkSession.builder.appName("mllib_example_2").getOrCreate()
sc = spark.sparkContext
# we can now use the 'spark' interface object to create and process data frames
Clustering¶
The first simple example demonstrates data clustering using the K-Means algorithm.
The data processing procedure is almost the same as in the classification task. We will again use the Iris data, but this time only the four predictor attributes (in descriptive modeling we do not need the target attribute), and we will transform them into a vector dataset. It is also not necessary to split the set into training and testing parts.
from pyspark.sql import Row
raw_data = sc.textFile("iris.csv")
csv_data = raw_data.map(lambda x: x.split(","))
csv_data = csv_data.map(lambda line: [line[0], line[1], line[2], line[3]])
df_data = csv_data.map(lambda line: Row(
petal_length = float(line[0]),
petal_width = float(line[1]),
sepal_length = float(line[2]),
sepal_width = float(line[3])))
df = spark.createDataFrame(df_data)
from pyspark.ml.feature import VectorAssembler
vector_data = VectorAssembler(inputCols=["petal_length", "petal_width", "sepal_length", "sepal_width"], outputCol="features").transform(df)
Clustering models are trained in the same fashion as classification models. As parameters, we specify the column containing the feature vectors and the hyperparameters of the model. In this case, we will cluster the data into 3 clusters using the K-Means algorithm.
from pyspark.ml.clustering import KMeans
kmeans = KMeans(featuresCol="features", k=3, seed=1234)
kmeans_model = kmeans.fit(vector_data)
If we want to predict the cluster assignment for new/previously unseen samples, we use the transform() operation (similarly to predictive tasks). The resulting data frame also contains a prediction column with the ID of the cluster to which the given example belongs.
# during clustering, the 'transform' function adds to the data frame the indexes of the clusters to which the samples were assigned
clusters = kmeans_model.transform(vector_data)
clusters.show()
With K-Means, clusters are represented by the vectors of their representatives - centroids, which we can obtain from the model using the clusterCenters() operation. The following command prints the centroid of the first cluster (index 0).
print(kmeans_model.clusterCenters()[0])
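As an optional extension (not part of the original exercise), we can also print all centroids, check the cluster sizes, and - in Spark 2.3 and newer - compute the silhouette score using ClusteringEvaluator; a minimal sketch:
# print the centroid of every cluster
for i, center in enumerate(kmeans_model.clusterCenters()):
    print(i, center)
# number of examples assigned to each cluster
clusters.groupBy("prediction").count().show()
# silhouette score of the clustering (Spark 2.3+)
from pyspark.ml.evaluation import ClusteringEvaluator
silhouette = ClusteringEvaluator(featuresCol="features", predictionCol="prediction").evaluate(clusters)
print(silhouette)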
ML Pipelines¶
Spark also includes ML Pipelines - a higher-level API that allows you to create "pipelines", i.e. workflows of analytical tasks. These workflows consist of stages, each of which represents one step of the overall analytical process.
In the following example, we will show how to create a simple analytical workflow consisting of several preprocessing, modeling and model evaluation steps.
# import the types
from pyspark.sql.types import StructType, StructField, StringType, DoubleType
# define the data schema
schema = StructType([
    StructField("petal_length", DoubleType(), True),
    StructField("petal_width", DoubleType(), True),
    StructField("sepal_length", DoubleType(), True),
    StructField("sepal_width", DoubleType(), True),
    StructField("label", StringType(), True),
])
# we will also use the iris data from the previous exercise
# this time we'll just use the dataframe API and load the CSV directly into the dataframe
# if we have a csv file with a header, we use header=True
df = spark.read.csv('iris.csv', schema=schema, header=False)
df.head()
Using the dataframe API, we can conveniently look at different groupings and easily calculate descriptive characteristics for individual attributes.
df.select('petal_width', 'petal_length').describe().show()
df.groupBy('label').count().show()
We first divide the input data into a training and a test set.
train_data, test_data = df.randomSplit([0.7, 0.3], seed=123)
train_data.show()
We will now create an analytical Pipeline consisting of several stages:
- index the target attribute
- vectorize predicting attributes
- normalize predicting attributes
- train the Logistic Regression classifier on given data
Then we fit this Pipeline on the training data and evaluate the resulting model on the test data.
# import necessary types
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import StandardScaler
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression
# define stage 1: transform the target feature using StringIndexer
stage_1 = StringIndexer(inputCol='label', outputCol='label_index')
# define stage 2: assemble the predictor features into a single vector using VectorAssembler
stage_2 = VectorAssembler(inputCols=['petal_length', 'petal_width', 'sepal_length', 'sepal_width'],
                          outputCol='features')
# define stage 3: scale the predictor features using StandardScaler
stage_3 = StandardScaler(inputCol='features', outputCol='features_scaled')
# define stage 4: train the logistic regression classifier
stage_4 = LogisticRegression(featuresCol='features_scaled', labelCol='label_index')
# define the pipeline
pipeline = Pipeline(stages=[stage_1, stage_2, stage_3, stage_4])
# fit the pipeline on the training data and apply it to the test data
pipeline_model = pipeline.fit(train_data)
df_updated = pipeline_model.transform(test_data)
df_updated.show()
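The fitted PipelineModel keeps the trained stages in order, so we can also pull out the fitted logistic regression model (stage index 3) and inspect its parameters directly; a small sketch, assuming the pipeline defined above:
# the fitted stages of the PipelineModel are available via 'stages'
lr_model = pipeline_model.stages[3]
# coefficients and intercepts of the (multinomial) logistic regression model
print(lr_model.coefficientMatrix)
print(lr_model.interceptVector)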
Using Pipelines, we can easily create workflows that employ advanced methods for finding model hyperparameters, such as cross-validation.
The example below demonstrates optimizing the hyperparameters of a decision tree model on the Iris data using Pipelines.
# import the types
from pyspark.ml import Pipeline
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
# define stage 1: transform the target feature using StringIndexer
stage_1 = StringIndexer(inputCol='label', outputCol='label_index')
# define stage 2: assemble the predictor features into a single vector using VectorAssembler
stage_2 = VectorAssembler(inputCols=['petal_length', 'petal_width', 'sepal_length', 'sepal_width'],
                          outputCol='features')
# define stage 3: scale the predictor features using StandardScaler
stage_3 = StandardScaler(inputCol='features', outputCol='features_scaled')
# define stage 4: train the decision tree classifier
stage_4 = DecisionTreeClassifier(featuresCol='features_scaled', labelCol='label_index')
# define the pipeline
pipeline = Pipeline(stages=[stage_1, stage_2, stage_3, stage_4])
# to find the optimal hyperparameter values of the model by cross-validation, we specify the parameters and the values to be examined
# for the tree these are the impurity measure and the maximum depth; we define the grid of their values
paramGrid = ParamGridBuilder().addGrid(stage_4.impurity, ['entropy', 'gini']).addGrid(stage_4.maxDepth, [2, 3, 4]).build()
# set up cross-validation - the estimator will be a pipeline
# pay attention to setting the target attribute column in the evaluator (by default it uses the 'label' column) and to setting the evaluation metric
crossVal = CrossValidator(estimator=pipeline, estimatorParamMaps=paramGrid, evaluator=MulticlassClassificationEvaluator(labelCol='label_index', metricName='f1'), numFolds=10)
# train the CV model on the training data
cvModel = crossVal.fit(train_data)
We can view the average metrics achieved during cross-validation using avgMetrics. In this case, we defined an evaluator that uses the F1 metric, so each average value represents the mean F1 score of one parameter combination across the folds.
cvModel.avgMetrics
If we want an overview of the models evaluated during cross-validation and the combinations of their parameters, we can use getEstimatorParamMaps(). The output is a list of the parameter maps used in model training.
cvModel.getEstimatorParamMaps()
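To make the output easier to read, we can pair each evaluated parameter combination with its average cross-validation score; a small sketch:
# print each parameter combination together with its average F1 score from cross-validation
for params, metric in zip(cvModel.getEstimatorParamMaps(), cvModel.avgMetrics):
    print({p.name: v for p, v in params.items()}, "->", metric)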
When solving analytical tasks, we are usually most interested in which combination of hyperparameters performed best with respect to the metric specified in the evaluator.
We can get the best of the evaluated models from cross-validation using bestModel. If we validated an entire Pipeline consisting of several steps, we also need to specify which step of the pipeline we are interested in. In this case, we will look at the fourth step of the sequence (index 3), where the classification model was trained.
# we can extract the best model from the given pipeline step and use it like a standard MLlib model
bestTreeModel = cvModel.bestModel.stages[3]
If we are interested in the hyperparameters of the best model, we can obtain them using the getter functions of the corresponding model parameters. In this case, we searched for the best combination of the maxDepth and impurity parameters, which correspond to the functions getMaxDepth() and getImpurity().
print(bestTreeModel.getMaxDepth())
print(bestTreeModel.getImpurity())
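If we want to see the whole structure of the selected tree, not only its hyperparameters, the trained model also offers a textual dump (a small optional addition):
# textual representation of the trained decision tree
print(bestTreeModel.toDebugString)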
We can then evaluate the selected model on the test data. Since we used a Pipeline, we apply the whole best pipeline to the test data.
# store the best Pipeline in the bestModel (with the best performing classifier)
bestModel = cvModel.bestModel
# apply the test data
predictions = bestModel.transform(test_data)
The predicted values (in addition to the original and transformed columns) are stored in predictions.
predictions.show()
And from them we can then calculate various metrics for evaluating the quality of the classification. Compared to the approach from the previous exercise, we will now show how to calculate the selected metrics using MLlib functions.
# import the types for a particular classification task
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
# calculate the metrics
evaluator_a = MulticlassClassificationEvaluator(labelCol="label_index", predictionCol="prediction", metricName="accuracy")
evaluator_f1 = MulticlassClassificationEvaluator(labelCol="label_index", predictionCol="prediction", metricName="f1")
evaluator_r = MulticlassClassificationEvaluator(labelCol="label_index", predictionCol="prediction", metricName="weightedRecall")
evaluator_p = MulticlassClassificationEvaluator(labelCol="label_index", predictionCol="prediction", metricName="weightedPrecision")
print("accuracy :{}".format(evaluator_a.evaluate(predictions)))
print("f1 :{}".format(evaluator_f1.evaluate(predictions)))
print("weighted recall :{}".format(evaluator_r.evaluate(predictions)))
print("weighted precision :{}".format(evaluator_p.evaluate(predictions)))
If we would like to visualize the confusion matrix, note that some MLlib metrics (such as the confusion matrix) are implemented only in the RDD API (not in the dataframe API). Therefore, we need to transform the columns of predictions and actual target values into an RDD and calculate the metrics from it.
from pyspark.mllib.evaluation import MulticlassMetrics
# we select only the prediction and indexed target columns from the predictions data frame
# (both columns are already of double type, as required by the RDD-based metrics)
preds_and_labels = predictions.select(['prediction','label_index'])
# use the MulticlassMetrics - but from the RDD MLlib API
metrics = MulticlassMetrics(preds_and_labels.rdd.map(tuple))
# compute the confusion matrix
print(metrics.confusionMatrix().toArray())
In the calculated confusion matrix, the values of the target attribute are ordered by index from the lowest to the highest. If we want to find out which index corresponds to which original, non-indexed value, we have to transform the indexes back to the original labels. We can do this using the IndexToString() transformation.
from pyspark.ml.feature import IndexToString
index_to_string = IndexToString(inputCol="label_index", outputCol="categoryValue")
index_to_string.transform(preds_and_labels).drop("prediction").distinct().show()
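Alternatively, the mapping of indexes to the original class names can be read directly from the fitted StringIndexer, which is the first stage of the best pipeline; its labels are listed in index order, i.e. in the same order as the rows and columns of the confusion matrix (a small sketch):
# the fitted StringIndexer is the first stage of the best pipeline
string_indexer_model = cvModel.bestModel.stages[0]
# class names in index order 0, 1, 2, ... (in newer Spark versions also available as labelsArray)
print(string_indexer_model.labels)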
Task 6.1¶
Transform your KDD Cup task into a binary classification: mark normal communication as class 0.0 and all attacks as class 1.0. Encode the categorical attributes appropriately. Train a logistic regression model on the transformed data and find the best combination of hyperparameters (e.g. maxIter, regParam, etc. - see the documentation) using 10-fold cross-validation. Evaluate its accuracy using metrics for binary classification. Implement the task using ML Pipelines.
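A possible skeleton for the task (only a sketch, not a complete solution): the dataframe kdd_df, the column names protocol_type, service, flag, label, the numeric feature list and the value 'normal.' are assumptions that must be adapted to your KDD Cup data, and the multi-column StringIndexer/OneHotEncoder API used here requires Spark 3.x.
from pyspark.sql.functions import when, col
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
# 1. binary target: 0.0 for normal communication, 1.0 for any attack ('normal.' is an assumed label value)
kdd_df = kdd_df.withColumn('binary_label', when(col('label') == 'normal.', 0.0).otherwise(1.0))
train_data, test_data = kdd_df.randomSplit([0.7, 0.3], seed=123)
# 2. encode the categorical attributes, assemble the features, define the classifier
numeric_cols = ['duration', 'src_bytes', 'dst_bytes']  # example numeric columns, adjust to your data
indexer = StringIndexer(inputCols=['protocol_type', 'service', 'flag'],
                        outputCols=['protocol_idx', 'service_idx', 'flag_idx'])
encoder = OneHotEncoder(inputCols=['protocol_idx', 'service_idx', 'flag_idx'],
                        outputCols=['protocol_vec', 'service_vec', 'flag_vec'])
assembler = VectorAssembler(inputCols=['protocol_vec', 'service_vec', 'flag_vec'] + numeric_cols,
                            outputCol='features')
lr = LogisticRegression(featuresCol='features', labelCol='binary_label')
pipeline = Pipeline(stages=[indexer, encoder, assembler, lr])
# 3. 10-fold cross-validation over selected hyperparameters
grid = ParamGridBuilder().addGrid(lr.maxIter, [10, 50]).addGrid(lr.regParam, [0.0, 0.01, 0.1]).build()
cv = CrossValidator(estimator=pipeline, estimatorParamMaps=grid,
                    evaluator=BinaryClassificationEvaluator(labelCol='binary_label'),
                    numFolds=10)
cv_binary_model = cv.fit(train_data)
# 4. evaluate on the test data (areaUnderROC is the default metric of BinaryClassificationEvaluator)
predictions = cv_binary_model.transform(test_data)
print(BinaryClassificationEvaluator(labelCol='binary_label').evaluate(predictions))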