Data processing in the Apache Spark environment¶
!pip install findspark
!pip install pyspark # install the libraries needed for Apache Spark
# creating a spark application
import findspark
findspark.init()
import pyspark
from pyspark import SparkContext
sc = SparkContext("local", "test app")
Working with RDD - Resilient Distributed Dataset¶
The basic object of the Apache Spark programming interface is the so-called SparkContext. In the PySpark interpreter, the context is automatically set in the sc variable.
The basic data type in Spark is the RDD (Resilient Distributed Dataset), which represents a large collection of elements that can be processed in parallel on several processors or distributed across several computing nodes. The programmer works with an RDD as if it were a local variable, even though the individual operations on the data are executed in a distributed way on multiple computers.
You can create an RDD collection e.g. from a local array of elements using the 'parallelize' method.
data = ["spark", "rdd", "example", "sample", "example"]
rdd = sc.parallelize(data)
Two types of commands can be executed over an RDD:
- actions return the result of the processing (e.g. a computed number, etc.)
- transformations transform the original RDD collection into a new RDD collection; multiple transformations can be chained
Transformations are not performed immediately after the command is entered in the code. A chain of transformations must be terminated by one of the actions, and only when that final action is called are all the transformations successively applied to the data (so-called 'lazy execution').
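For illustration, a minimal sketch of lazy execution using the 'rdd' collection created above (the name 'upper_rdd' is just illustrative):
# the 'map' transformation is only recorded, no computation happens yet
upper_rdd = rdd.map(lambda x: x.upper())
# only calling an action triggers the evaluation of the whole chain of transformations
upper_rdd.count() # = 5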
The following commands list basic Spark actions¶
the 'count' action returns the number of elements in the collection
rdd.count() # = 5
if we want to get the elements of the RDD collection as an array of values, we can use the 'collect' action
rdd.collect() # = ["spark", "rdd", "example", "sample", "example"]
'first' returns only the first element of the RDD
rdd.first() # = "spark"
'take(n)' returns the first n elements of the RDD as an array of values
rdd.take(4) # = ["spark", "rdd", "example", "sample"]
an array of n randomly selected elements can be obtained with the 'takeSample' action; the first parameter indicates whether the selection is with repetition (True) or without (False), the second is the number of elements, and as the optional third parameter 'seed' we can pass the initialization of the random number generator (otherwise each call may return a different selection)
rdd.takeSample(True, 3)
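For example, fixing the 'seed' makes the selection reproducible between runs (a small sketch, the seed value 42 is arbitrary):
rdd.takeSample(False, 3, 42) # 3 elements without repetition, fixed seed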
using the 'reduce' action, you can gradually apply the specified function to all values of the RDD collection, reducing them to a single value; a simple function can be written directly as a lambda expression. The following call joins the values with the + operator, and since the values are strings, the result is a single string created by concatenating all the elements
print(rdd.reduce(lambda x, y: x + y))
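'reduce' can of course also be applied to numerical values; a minimal sketch summing a small collection of numbers:
numbers = sc.parallelize([1, 2, 3, 4, 5])
print(numbers.reduce(lambda x, y: x + y)) # = 15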
if we want to sequentially call the given function for each element of the RDD collection, we can use the 'foreach' action
# first we define a simple function that converts the string to uppercase and displays it on the screen
def print_upper(x):
    print(x.upper())
# we call the function print_upper for each element of the RDD collection
rdd.foreach(print_upper)
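Note that the function passed to 'foreach' runs in the worker processes, so on a cluster the printed output ends up in the executor logs rather than in the driver console. A minimal sketch that prints on the driver instead (suitable only for small collections):
# collect the (small) RDD to the driver and print the elements locally
for word in rdd.collect():
    print(word.upper())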
if the RDD collection contains key:value pairs, we can count the number of values for each key using the 'countByKey' action
# first we create an RDD with key:value pairs
kv_pairs = sc.parallelize([("a", 1), ("b", 1), ("a", 1), ("a", 1), ("b", 1), ("b", 1), ("b", 1), ("b", 1)])
# and we calculate the number of values for each key, the result of the operation is a map
kv_pairs.countByKey().items()
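For the collection above, the result should look roughly as follows ('countByKey' returns a dictionary on the driver, so the order of the keys may differ):
# = dict_items([('a', 3), ('b', 5)])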
The following commands list basic Spark transformations¶
the 'map' transformation creates an RDD collection whose elements are obtained by applying the specified function to each element of the original RDD collection. The example below creates an RDD collection of pairs where the first component of each pair is the original string from the 'rdd' collection and the second component is its length
rdd2 = rdd.map(lambda x: (x, len(x)))
# elements of the transformed collection are obtained by the action 'collect'
rdd2.collect()
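For the 'rdd' collection created above, the result should be:
# = [('spark', 5), ('rdd', 3), ('example', 7), ('sample', 6), ('example', 7)]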
The 'flatMap' transformation works similarly to 'map', but each element of the original RDD collection can generate several elements of the transformed collection (the specified function should return an array of the generated values)
# compare: the following command generates an RDD collection with 3 elements, where each element is an array
sc.parallelize([1, 2, 3]).map(lambda x: [x, x, x]).collect()
# = [[1, 1, 1], [2, 2, 2], [3, 3, 3]]
# with a command that generates 3 (identical) transformed elements for each element of the original RDD collection, i.e. the resulting
# collection will have 3x3=9 elements
sc.parallelize([2, 3, 4]).flatMap(lambda x: [x, x, x]).collect()
# = [2, 2, 2, 3, 3, 3, 4, 4, 4]
the 'filter' transformation creates a new RDD collection by selecting only elements from the original collection that meet the condition of the specified function
rdd = sc.parallelize([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
filtered_rdd = rdd.filter(lambda x: x % 2 == 0) # we only select even numbers
filtered_rdd.collect()
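The result should be:
# = [2, 4, 6, 8, 10]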
the 'sample' transformation creates an RDD collection with a random selection of elements from the original collection; 'sample' has 3 arguments - the first specifies whether the selection is with or without repetition, the second the fraction of elements to select, and the last the initialization of the random number generator (seed)
rdd = sc.parallelize(range(1, 11)) # we generate a sequence of numbers from 1 to 10
sample_rdd = rdd.sample(True, 0.2) # randomly select approximately 20% of the elements with repetition
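'sample' is only a transformation, so an action is needed to see the result; the fraction is only an expected value, and the exact number of selected elements varies between runs:
sample_rdd.collect() # approximately 20% of the elements, the exact content varies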
you can perform set operations on RDD collections, such as union or intersection, using the 'union' or 'intersection' transformations
rdd1 = sc.parallelize(range(1, 15))
rdd2 = sc.parallelize(range(10, 21))
rdd1.union(rdd2).collect()
# = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]
# ('union' does not remove duplicates, so the values 10 to 14 appear twice)
rdd1.intersection(rdd2).collect()
# = [10, 11, 12, 13, 14] (the order of the elements may vary)
if we have an RDD collection with key:value pairs, we can group the elements using the 'groupByKey' transformation; the result is an RDD collection of pairs (key, iterator of values). The iterator is a special Spark type that allows sequentially traversing all grouped elements; it can e.g. be converted to a Python list with the 'list' function or iterated in a loop
# first we generate an RDD collection of pairs of type (first letter of the string, string)
rdd = sc.parallelize(["spark", "rdd", "example", "sample", "example"]).map(lambda word: (word[0], word))
# group the words according to the key (first letter)
group_rdd = rdd.groupByKey()
# 'group_rdd' is a collection of pairs (first letter, iterator of words starting with that letter)
# if we want to convert the iterator, i.e. the second component of the pair, to a list, we have to apply the 'list' function to it,
# which we can do with the 'mapValues' transformation
group_list = group_rdd.mapValues(lambda x: list(x))
# the result is an RDD collection of pairs of type (first letter, list of words starting with that letter)
group_list.collect()
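For the words above, the result should look roughly as follows (the order of the groups and of the words within a group may vary):
# = [('s', ['spark', 'sample']), ('r', ['rdd']), ('e', ['example', 'example'])]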
the 'reduceByKey' transformation aggregates all values with the same key using the specified function; the elements of the resulting collection are pairs (key, aggregated value), e.g.:
kv_pairs = sc.parallelize([("a", 4), ("b", 2), ("a", 7), ("a", 4), ("b", 3)])
kv_pairs_count = kv_pairs.reduceByKey(lambda x, y: x + y) # contains elements ("a", 15), ("b", 5)
kv_pairs_count.collect()
if we want to sort the key:value RDD collection by key, we can use the 'sortByKey' transformation
kv_pairs.sortByKey().collect()
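For the 'kv_pairs' collection above, the result should contain all 'a' pairs before the 'b' pairs, e.g.:
# = [('a', 4), ('a', 7), ('a', 4), ('b', 2), ('b', 3)] (the order of pairs with the same key may differ)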
You can read more about operations on RDDs at https://spark.apache.org/docs/3.3.0/programming-guide.html#rdd-operations
In the following example, we apply RDD operations on data from a real KDD Cup dataset.¶
first we download the dataset file directly in Python from the Internet and save it in the working directory
import urllib.request
urllib.request.urlretrieve("https://peter.bednar.website.tuke.sk/tsvd/data/kddcup.data_10_percent.gz", "kddcup.data_10_percent.gz")
the data set contains records of network communication between devices; individual connections are characterized by a set of attributes (both nominal and numerical - see http://kdd.ics.uci.edu/databases/kddcup99/kddcup99.html), and the last attribute is the type of communication ('normal.' denotes standard communication, the other values denote various types of attacks)
# we load the data as an RDD collection from the file and display the first 5 records
rawdata = sc.textFile("./kddcup.data_10_percent.gz")
rawdata.take(5)
# [u'0,tcp,http,SF,181,5450,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,8,8,0.00,0.00,0.00,0.00,1.00,0.00,0.00,
# 9,9,1.00,0.00,0.11,0.00,0.00,0.00,0.00,0.00,normal.',
# u'0,tcp,http,SF,239,486,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,8,8,0.00,0.00,0.00,0.00,1.00,0.00,0.00,
# 19,19,1.00,0.00,0.05,0.00,0.00,0.00,0.00,0.00,normal.',
# u'0,tcp,http,SF,235,1337,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,8,8,0.00,0.00,0.00,0.00,1.00,0.00,
# 0.00,29,29,1.00,0.00,0.03,0.00,0.00,0.00,0.00,0.00,normal.',
# u'0,tcp,http,SF,219,1337,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,6,6,0.00,0.00,0.00,0.00,1.00,
# 0.00,0.00,39,39,1.00,0.00,0.03,0.00,0.00,0.00,0.00,0.00,normal.',
# u'0,tcp,http,SF,217,2032,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,6,6,0.00,0.00,0.00,0.00,1.00,
# 0.00,0.00,49,49,1.00,0.00,0.02,0.00,0.00,0.00,0.00,0.00,normal.']
# from the listing it can be seen that the data is initially represented as an RDD collection of strings read from the file line by line
# we can use 'count' to count the number of records
rawdata.count()
# we can use csv reader to split rows into values
import csv
rdd = sc.textFile("./kddcup.data_10_percent.gz")
rdd = rdd.mapPartitions(lambda x: csv.reader(x))
# if the file contains a header on the first line, we can remove it using the 'filter' transformation
header = rdd.first()
rdd = rdd.filter(lambda x: x != header)
# the following commands count how many records have the target attribute with the value 'normal'
# first we select all lines that contain the string 'normal'
normal_records = rawdata.filter(lambda x: "normal" in x)
# and we will find out their count
print(normal_records.count())
# on the other hand, we can obtain an RDD that contains only the records about non-standard communication, e.g. as follows:
attack_raw_data = rawdata.subtract(normal_records)
# using the 'map' transformation, we split the rows into an array of values with a comma as a separator
csv_data = rawdata.map(lambda line: line.split(","))
# we can use 'map' to rearrange the data, e.g. we generate a key:value RDD collection where the key is the
# target attribute (attribute index 41) and the value is an array of the other attribute values (indexes 0 to 40)
def create_kv(line):
    elems = line.split(",") # split the row into an array of values
    tag = elems[41] # the tag is the target attribute
    return (tag, elems[0:41]) # we return the pair (tag, values)
# we apply the mapping function to the RDD collection
key_csv_data = rawdata.map(create_kv)
# with 'sample' we can randomly select a subset of the data; we select approximately 10% of the records without repetition
# (1234 is the seed of the random number generator)
rawdata_sample = rawdata.sample(False, 0.1, 1234)
sample_size = rawdata_sample.count()
total_size = rawdata.count()
# we will print the number of selected records and the total number of records
print("sample size is {0} of {1}".format(sample_size, total_size))
Task 3.1¶
In the PySpark environment, write commands that:
- For a selected array of numerical values, calculate the sum of the even numbers.
- For a selected array of key-value pairs, where the value is a number, calculate the sum of the squares of the values for each key.
Task 3.2¶
Write code in Python using Spark transformations and actions that counts the number of word occurrences in an input text file.
Task 3.3¶
For the data from the KDD Cup dataset from the example in the exercise, write code that prints, for the nominal attributes (attributes with indices 1, 2, 3), the number of their distinct values.
Task 3.4¶
For the data from the KDD Cup dataset from the exercise example, write code that calculates, for all records with the target attribute (index 41) equal to 'normal.', the average and the total connection duration (attribute index 0).