Learning models using the MLlib library¶
The goal of the exercise is to learn how to work with the MLlib library for creating and validating machine learning models.
MLlib is a distributed library of algorithms that supports learning and validating predictive and descriptive models on large-scale data. MLlib provides algorithms for learning classification and prediction models, algorithms for clustering, generating association rules and frequent itemsets, and collaborative filtering models for recommender systems. In addition to the learning algorithms, the library also provides functions and objects for validating models and for exporting models to PMML (Predictive Model Markup Language).
!pip install findspark
!pip install pyspark
# creating a spark application
import findspark
findspark.init()
import pyspark
# import the SparkSession type from the 'pyspark.sql' module into the script
from pyspark.sql import SparkSession
# create the 'spark' object and set the name of the application via the 'appName' parameter (in a distributed environment several
# applications can run simultaneously, so they need to be named so that we can distinguish between them)
spark = SparkSession.builder.appName("mllib_example").getOrCreate()
sc = spark.sparkContext
# we can now use the 'spark' interface object to create and process data frames
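As a quick check that the session has been created (an optional step, not required by the exercise), we can print the version of the running Spark installation:
# print the Spark version of the active session
print(spark.version)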
In the first sample task, we will work with the Iris dataset. Iris is a simple dataset for classification into three classes; the examples describe three species of plants (irises) by the dimensions of their flowers (four numerical input attributes).
# we will import the necessary types
from pyspark.sql import Row
import urllib.request
# we will download the data from the Internet and save it in the working directory
urllib.request.urlretrieve("http://people.tuke.sk/martin.sarnovsky/tsvd/files/iris.csv", "iris.csv")
# we load the data and remap it to objects of type 'Row'
raw_data = sc.textFile("iris.csv")
csv_data = raw_data.map(lambda x: x.split(","))
Using the take() operation, we can look at the first few records in the dataset.
csv_data.take(5)
In the following example, we will learn a linear SVM (Support Vector Machine) model. This model can only be used for classification into two classes, so for the Iris dataset we will train a classifier that distinguishes the species iris-versicolor from the other species.
So first we will have to transform the target attribute to binary.
# as class 1 we mark examples of the iris-versicolor species and as class 0 we mark all other examples
csv_data = csv_data.map(lambda line: [line[0], line[1], line[2], line[3],
1.0 if line[4] == "iris-versicolor" else 0.0])
Using the take() operation, we can look again at what a few examples from the transformed RDD look like.
csv_data.take(5)
We then create a data frame from the loaded RDD using the createDataFrame() operation. Before the conversion, we map each record to an object of type 'Row' using a lambda expression in the map() method, where we name and type the individual attributes.
df_data = csv_data.map(lambda line: Row(
petal_length = float(line[0]),
petal_width = float(line[1]),
sepal_length = float(line[2]),
sepal_width = float(line[3]),
label = line[4]))
df = spark.createDataFrame(df_data)
We can then apply all operations for Spark data frames to the df data frame. Using head() we can list the first few records.
df.head(5)
We can also easily inspect, for example, the distribution of the values of the target attribute.
df.groupBy('label').count().show()
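In a similar way we can obtain basic summary statistics of the numeric input attributes, e.g. using the standard describe() operation (an optional check, not required by the exercise):
# basic statistics (count, mean, standard deviation, min, max) of the numeric input columns
df.describe("petal_length", "petal_width", "sepal_length", "sepal_width").show()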
For model learning, it is necessary to transform the data into vectors. All input (predictor) attributes have to be combined into a single numerical vector using the 'VectorAssembler' object. This creates a 'features' column in the data frame, which represents the vector of predictor attribute values.
# we import VectorAssembler
from pyspark.ml.feature import VectorAssembler
# the following command combines all input attributes into a numeric vector, which it stores in a new column 'features'
vector_data = VectorAssembler(inputCols=["petal_length", "petal_width", "sepal_length", "sepal_width"],
outputCol="features").transform(df)
vector_data.head()
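To verify the structure of the transformed data frame (an optional check), we can list its columns, including the new 'features' vector column:
# list the schema of the data frame with the added 'features' column
vector_data.printSchema()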
Now that we have the data in vector form, we can divide the input dataset into a training and a test set. We perform the split using the randomSplit() operation, to which we pass the split ratio as a parameter.
# we divide the data into a training (70%) and a test (30%) set by random selection
training_data, test_data = vector_data.randomSplit([0.7, 0.3], seed=123)
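We can check the sizes of the resulting sets (an optional sanity check) by counting their rows:
# number of examples in the training and test sets
print("Training set size: {0}".format(training_data.count()))
print("Test set size: {0}".format(test_data.count()))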
In 'training_data' and 'test_data' we have stored the training and test sets. Now we can continue with training the classification model. In the following example, we will train a linear Support Vector Classifier. Model training is similar to, for example, the scikit-learn library. First, we create an object of the given model, where we specify the parameters of the model (input data columns, target attribute, hyperparameters of the model). We then fit the model to the training data using the fit() operation. We can use the trained model to classify the test data using the transform() operation.
# we import the necessary libraries
from pyspark.ml.classification import LinearSVC
# first we create the 'LinearSVC' object and set the parameters of the algorithm
svm_classifier = LinearSVC(
featuresCol="features", # parameter features - a data column containing a vector of input attributes
labelCol="label") # parameter labelCol - data column containing the target attribute (class indexes)
# we train the model with the 'fit' function, to which we pass the training data
svm_model = svm_classifier.fit(training_data)
# we classify the test data using the 'transform' function
predictions = svm_model.transform(test_data)
Look at the predictions data frame. Note that after the classification of the test set, the original test data is preserved in this data frame and new columns have been added that contain the predicted class and the raw prediction scores. The predicted class is stored in the 'prediction' column. We can then count the number of misclassified examples by comparing where 'prediction' is not equal to the target attribute 'label'.
predictions.head()
test_error = predictions.filter(predictions["prediction"] != predictions["label"]).count() / float(test_data.count())
print("Testing error: {0:.4f}".format(test_error))
More information about the algorithms and their settings can be found in the documentation at https://spark.apache.org/docs/latest/api/python/reference/pyspark.mllib.html.
In the next example, we apply the classification models to the KDD Cup data from the previous exercise.
If you don't have the data in your work directory from past exercises, uncomment the line to download it.
# we will import the necessary modules
from pyspark.sql import Row
import urllib.request
# we download the data and load it as lists of strings
# urllib.request.urlretrieve("http://kdd.ics.uci.edu/databases/kddcup99/kddcup.data_10_percent.gz", "kddcup.data_10_percent.gz")
raw_data = sc.textFile("./kddcup.data_10_percent.gz")
csv_data = raw_data.map(lambda x: x.split(","))
We will create a data frame from selected attributes - for the sake of clarity, in this task we will keep only 5 selected predictor attributes (plus the target attribute). We will create the data frame from them using the createDataFrame() operation.
df_data = csv_data.map(lambda line: Row(
duration= float(line[0]),
protocol_type = line[1],
src_bytes = float(line[4]),
dst_bytes = float(line[5]),
land = float(line[6]),
attack_type = line[41]))
df = spark.createDataFrame(df_data)
# print the first record
df.first()
As we can see, the data frame contains categorical variables in addition to the numeric ones. These must be transformed in a suitable way before the data itself is vectorized. Let's look at their values.
df.groupBy('protocol_type').count().show()
df.groupBy('attack_type').count().show(40)
To transform categorical attributes into numeric ones, we can use several functions from MLlib (you should have tried them in the tasks from the previous exercises). For attributes with no natural ordering, it is convenient to use OneHotEncoder, which transforms a categorical attribute into a set of binary attributes corresponding to its values, or StringIndexer, which implements simple indexing of categorical variables by assigning a numerical index to each value. Care should be taken with attributes where such a transformation can introduce an unwanted ordering.
from pyspark.ml.feature import StringIndexer
# first we create an index of values by calling the 'fit' function
attack_type_index = StringIndexer(inputCol="attack_type", outputCol="attack_type_index").fit(df)
# after applying the transformation, a new numeric attribute 'attack_type_index' is added to the data frame
df = attack_type_index.transform(df)
df.head()
# a list of nominal values ordered by the assigned indices can be obtained from the index object via the 'labels' attribute
# e.g. the following command will list the number of classes, i.e. number of values of target attribute 'attack_type'
print("Number of classes: {0}".format(len(attack_type_index.labels)))
To use OneHotEncoder, however, we must first convert the attribute to a numeric one with StringIndexer and only then encode it with OneHotEncoder.
from pyspark.ml.feature import OneHotEncoder
protocol_type_index = StringIndexer(inputCol="protocol_type", outputCol="protocol_type_index").fit(df)
df = protocol_type_index.transform(df)
encoder = OneHotEncoder(inputCol="protocol_type_index", outputCol="protocol_encoded").fit(df)
df = encoder.transform(df)
We will check the result of the operation - we will see that, in addition to protocol_type_index (which is just the numerically indexed protocol_type attribute), the data frame also contains protocol_encoded, which is the binary-encoded attribute in the form of a sparse vector.
df.head()
# we can remove the original nominal columns (i.e. the data frame will then contain only numeric columns)
# df = df.drop("protocol_type")
# df = df.drop("attack_type")
# df.show()
We will convert the transformed data into vector form, again using VectorAssembler. As parameters, we must correctly specify the input attributes, i.e. the numeric ones and the transformed ones.
from pyspark.ml.feature import VectorAssembler
# the following command combines all input attributes into a numeric vector, which it stores in a new column 'features'
vector_data = VectorAssembler(inputCols=["dst_bytes", "duration", "land", "src_bytes", "protocol_encoded"],
outputCol="features").transform(df)
vector_data.head()
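The individual preprocessing steps (indexing, encoding, vector assembly) can also be chained into a single Pipeline object, which is convenient when the same preprocessing has to be applied repeatedly, e.g. to new data. The following is a minimal sketch of the same preprocessing expressed as a pipeline; the stage objects are created again here only for illustration, and the pipeline is applied to a data frame containing just the original columns:
from pyspark.ml import Pipeline
# a data frame with only the original (untransformed) columns
raw_df = df.select("duration", "protocol_type", "src_bytes", "dst_bytes", "land", "attack_type")
# the pipeline chains the indexing, encoding and assembling stages into a single estimator
preprocessing = Pipeline(stages=[
    StringIndexer(inputCol="attack_type", outputCol="attack_type_index"),
    StringIndexer(inputCol="protocol_type", outputCol="protocol_type_index"),
    OneHotEncoder(inputCol="protocol_type_index", outputCol="protocol_encoded"),
    VectorAssembler(inputCols=["dst_bytes", "duration", "land", "src_bytes", "protocol_encoded"],
                    outputCol="features")])
# 'fit' learns all the stages, 'transform' applies them to the data
pipeline_model = preprocessing.fit(raw_df)
pipeline_model.transform(raw_df).head()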
Now we divide the data into a training (80%) and a test (20%) set by random selection.
training_data, testing_data = vector_data.randomSplit([0.8, 0.2], seed=1234)
And now we will learn a classification model - in this case, a decision tree.
As in the previous example, we set the predictor attributes and the target attribute. This time we will also set the hyperparameters of the model - the maximum depth and the splitting criterion.
from pyspark.ml.classification import DecisionTreeClassifier, DecisionTreeClassificationModel
tree_classifier = DecisionTreeClassifier(
featuresCol="features", # data column containing vector of input attributes
labelCol="attack_type_index", # data column containing target attribute (class indexes)
impurity="entropy", # the information gain criterion is used to select the attributes when splitting
maxDepth=5) # limit the maximum depth of the generated tree
# we create a classification model by calling the 'fit' function on the training data
tree_model = tree_classifier.fit(training_data)
# we can save the created model to a file using the 'save' function
tree_model.save("decision_tree_1.model")
# you can reload the saved model from a file with the function 'DecisionTreeClassificationModel.load'
tree_model = DecisionTreeClassificationModel.load("decision_tree_1.model")
We can visualize the trained model, or extract classification rules from it and print them on the screen. We will use the toDebugString attribute for this.
print(tree_model.toDebugString)
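Besides the textual representation, the trained model also exposes, for example, the relative importance of the individual input attributes via the featureImportances attribute (a short illustrative check):
# a sparse vector of relative attribute importances, in the order given to VectorAssembler
print(tree_model.featureImportances)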
If we want to evaluate the model, we need to evaluate it on the test set. We will do this in the same way as in the previous example, by applying transform() to the test data to obtain the predictions.
We can then use these to calculate the error (or other metrics).
# we classify the test data using the 'transform' function
# after classification, new columns are added to the data frame that contain the predicted class and probabilities
predictions = tree_model.transform(testing_data)
# the predicted class is stored in the 'prediction' column, we count the number of misclassified examples
# where 'prediction' is not equal to target attribute 'attack_type_index'
test_error = predictions.filter(predictions["prediction"] != predictions["attack_type_index"]).count() / float(testing_data.count())
print("Testing error: {0:.4f}".format(test_error))
Task 5.1¶
Transform your KDD Cup task into a binary classification. Mark normal communication as class 0.0 and all attacks as class 1.0. Train a logistic regression model on the transformed data and evaluate its accuracy.
Task 5.2¶
Study the documentation for the OneVsRest object and create an LR (Logistic Regression) multi-class classifier, which you can test on the KDD Cup data.
Task 5.3¶
Train a decision tree model on the Iris data. Use the predictions.stat.crosstab("prediction", "label").collect() operation on the data frame to compute the contingency table between the predicted class and the actual class. From the contingency table, calculate the accuracy for each class.