Data stream processing using Apache Spark Streaming¶
The goal of this exercise is to learn how to work with unbounded, continuous data streams.
Spark Streaming allows you to process data streams in the Spark environment. Spark discretizes a stream into so-called micro-batches, which are internally represented as RDDs. The RDD operations from the previous exercises can then be applied to these stream RDDs. In addition, Spark Streaming extends the set of operations with operations performed over time windows.
The first example is Streaming Wordcount - a simple task for counting words using the Spark Streaming API.
!pip install findspark
!pip install pyspark
# create a spark app
import findspark
findspark.init()
import pyspark
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
# Create a Spark context object and a Streaming context. In addition to the Spark context parameter, the Streaming context takes a parameter that specifies the micro-batch interval.
# In this case StreamingContext(sc, 1) creates a Streaming context from the Spark context sc with a micro-batch interval of 1 second
batchIntervalSeconds = 1
sc = SparkContext(appName = "NetworkWordCount")
ssc = StreamingContext(sc, batchIntervalSeconds)
# Now we will create a DStream - a discretized stream - which reads data
# we will use the socketTextStream operation, which will expect input data on localhost, on port 9999
lines = ssc.socketTextStream("localhost", 9999)
# The DStream contains text data received from localhost on port 9999. Each record contains
# a line of text. We will then process them with operations similar to the wordcount script
# for static data.
# First, we split each line into words on spaces and count the occurrences of each word
wordCounts = lines.flatMap(lambda line: line.split(" ")).map(lambda word: (word, 1)).reduceByKey(lambda x,y: x + y)
# print the output using pprint()
wordCounts.pprint()
To test the script, open a new terminal window in the Jupyter launcher and run Netcat (a simple networking utility) in it. We will use it to generate a stream of text data sent to a specific network port. Start Netcat with the command: nc -lk 9999
Then start the streaming computation. The start() operation starts it, awaitTermination() with the specified parameter (a timeout) waits the given time, and stop() then terminates the streaming computation. In this case it will run for 10 seconds.
Run the cell, switch to the terminal window and type any words/characters into Netcat (send them with Enter). Then switch back to the notebook and check the query output.
ssc.start()
ssc.awaitTermination(10)
ssc.stop()
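Note that a stopped StreamingContext cannot be restarted, and by default stop() also stops the underlying SparkContext. A small sketch (shown as comments, since the context above has already been stopped) of how the example could be rerun while keeping the SparkContext alive:
# stop only the streaming computation and keep the SparkContext running
# ssc.stop(stopSparkContext=False)
# a new run then requires a fresh StreamingContext created from the existing SparkContext
# ssc = StreamingContext(sc, batchIntervalSeconds)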
Working with structured streams¶
In the following task, we will show how to transform stream data into a structured form and how to work with it using data frames and SQL operations. Spark 2.0.0 and above supports so-called Structured Streaming, which is a way of processing streams using data frames and SQL operations, so we can process structured stream data. Internally, Spark treats such a stream as an "unbounded table" to which new records are continuously appended.
The programmer then specifies the computation in much the same way as when working with ordinary data frames. The difference is in how queries are specified - the query itself is also represented as a continuous data stream, which Spark runs as an incremental query on the unbounded input table. The query then generates the result. Every time the table is updated with new data (e.g. every second), the result is incrementally recomputed in individual batches. The following example shows the previous job rewritten using Spark Structured Streaming.
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode
from pyspark.sql.functions import split
# To use Structured Streaming, we will use the Spark Session object
spark = SparkSession.builder.appName("StructuredNetworkWordCount").getOrCreate()
# Similar to the first example, we will read data from localhost on port 9999
# The difference is that this time we will not create a 'DStream' (RDD) but a data frame
# Using 'readStream', we specify the type of the stream source ('socket') and its parameters ('host' and 'port'); the load() function then loads the stream into a data frame
lines = spark.readStream.format("socket").option("host", "localhost").option("port", 9999).load()
# We transform the data frame with the loaded data into another data frame by splitting each line on the separator.
# The explode function creates a new record for each element of the given column - it transforms a data frame
# of text lines into a data frame of words
words = lines.select(
explode(
split(lines.value, " ")
).alias("word")
)
# Next, we can use operations for data frames and count the number of occurrences of words (by grouping by words and counting)
wordCounts = words.groupBy("word").count()
# With Structured Streaming, the result of the operation is again a stream - a streaming query. So we create it and it will
# continuously perform the specified calculation on the given stream
query = wordCounts.writeStream.outputMode("complete").queryName("counts").format("memory").start()
query.awaitTermination(20)
query.stop()
spark.table("counts").show()
# After executing this code, the calculation starts in the background and the results are written to an in-memory table named "counts".
# In this case, the "complete" output mode is set.
With Structured Streaming, the result of the operation is also a stream - a streaming query that continuously performs the specified calculation on the input stream. The "complete" output mode means that the entire updated result table is written out after each update (the other supported output modes are "append" and "update").
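The StreamingQuery object returned by start() also exposes basic monitoring information. A minimal sketch (assuming the query object from the example above, ideally inspected before it is stopped):
# isActive indicates whether the streaming query is still running in the background
print(query.isActive)
# status describes what the query is currently doing
print(query.status)
# lastProgress contains metrics of the most recently processed micro-batch
print(query.lastProgress)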
Sensor data stream processing¶
In the following example, we will work with data from the PubNub service. The public stream from this source contains sensor data collected within an IoT platform. Using the PubNub connector, we will connect to a public source of sensor stream data, and each sensor record will be received as a JSON object with the following attributes:
'timestamp', 'ambient_temperature', 'humidity', 'photosensor', 'sensor_uuid', 'radiation_level'
First, we start the connector, which will allow us to load data from the source. In the Jupyter launcher, open a terminal window, go to your working directory and download the Subscriber.py script using the command: wget http://people.tuke.sk/martin.sarnovsky/tsvd/files/Subscriber.py
This script periodically downloads data from the PubNub source and writes it into the stream/ subdirectory of your working directory. For each record, the Subscriber.py script creates one JSON file. Before you can run it, you need to install the necessary Python package for PubNub. Install it in the terminal using the command: pip install pubnub==3.9.0
The Subscriber does not need to be running all the time for the examples below. You just use it to download some data; Spark will then read these records in sequence as a data stream.
The following script contains several tasks that process a stream of such objects into a data frame using the Structured Streaming API and presents several queries.
from pyspark import SparkContext
from pyspark.sql import Row, SQLContext
from pyspark.streaming import StreamingContext
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
# Let's create Spark Context and Spark Session objects
#sc = SparkContext(appName="pubnub_dir_streaming_app")
#spark = SparkSession.builder.appName("StreamingPubNub").getOrCreate()
# We specify the JSON schema of the objects that we will process. All attributes (except 'timestamp') use the String type, since the received JSON objects contain the measured values as strings
schema = StructType([
StructField("timestamp", TimestampType(), True),
StructField('ambient_temperature', StringType(), True),
StructField('humidity', StringType(), True),
StructField('photosensor', StringType(), True),
StructField('sensor_uuid', StringType(), True),
StructField('radiation_level', StringType(), True)
])
# We create a streaming data frame from a directory source to which JSON files are continuously added. When creating the data frame, we define the schema of the JSON objects that we will process,
# set the 'maxFilesPerTrigger' option, which specifies how many files are processed per micro-batch, and point the JSON source at the streamed objects in the 'stream/' directory
df = spark.readStream.schema(schema).option("maxFilesPerTrigger",1).json("stream/")
# In the next step, we cast the attributes of the data frame to the required types (sensor values from strings to numeric types)
df = df.withColumn("ambient_temperature", df["ambient_temperature"].cast(FloatType()))
df = df.withColumn("humidity", df["humidity"].cast(FloatType()))
df = df.withColumn("photosensor", df["photosensor"].cast(FloatType()))
df = df.withColumn("radiation_level", df["radiation_level"].cast(FloatType()))
# If we want to inspect a streaming data frame, we must define a streaming query that continuously evaluates it.
# So we create 'df_stream' from the data frame, write the results to an in-memory table named "stream" (the "memory" sink) and start it
df_stream = df.writeStream.format("memory").queryName("stream").start()
# This query will be active for 10 seconds
df_stream.awaitTermination(10)
df_stream.stop()
spark.table("stream").show()
Several queries against the data frame created this way are shown below.
The first selects from the df data frame those sensors whose measured humidity value is >= 10.
df_stream = df.writeStream.format("memory").queryName("stream").start()
query = df.select(df["sensor_uuid"]).where(df["humidity"] >= 10.0)
query_stream = query.writeStream.format("memory").queryName("humidity").start()
df_stream.awaitTermination(10)
df_stream.stop()
query_stream.stop()
spark.table("humidity").show()
The second filters from the df data frame the sensors with a measured radiation level >= 200.
df_stream = df.writeStream.format("memory").queryName("stream").start()
query = df.filter(df['radiation_level'] >= 200.0)
query_stream = query.writeStream.format("memory").queryName("radiation").start()
df_stream.awaitTermination(10)
df_stream.stop()
query_stream.stop()
spark.table("radiation").show()
The third is an aggregation query - it groups the sensors in the micro-batches by their ID and counts their occurrences. For aggregation queries, it is also possible to specify 'outputMode' - the way the result is presented.
df_stream = df.writeStream.format("memory").queryName("stream").start()
query = df.groupBy("sensor_uuid").count()
query_stream = query.writeStream.outputMode("complete").format("memory").queryName("uuid_counts").start()
df_stream.awaitTermination(10)
df_stream.stop()
query_stream.stop()
spark.table("uuid_counts").show()
The last query is the same as the previous one, but extended with time-window operations, where we can specify a time window over which the query is evaluated.
df_stream = df.writeStream.format("memory").queryName("stream").start()
query = df.groupBy(df['sensor_uuid'], window(df['timestamp'], "5 seconds")).count()
query_stream = query.writeStream.format("memory").outputMode("complete").queryName("windows").start()
df_stream.awaitTermination(10)
df_stream.stop()
query_stream.stop()
spark.table("windows").show()
More information about stream processing and the available operations can be found in the Spark Streaming Programming Guide. More about Spark Structured Streaming can be found in the Structured Streaming Programming Guide.
Task 7.1¶
Consult the documentation for sliding window operations and modify the NetworkWordcount.py script to count the occurrences of words in the last 20 seconds, evaluated every 5 seconds (one possible starting point is sketched below). Run the script and test it. Also, in the first example, experiment with the micro-batch interval specified in the Streaming Context.
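A minimal sketch of one possible starting point, using the reduceByKeyAndWindow operation (the checkpoint directory name is an arbitrary choice; window operations with an inverse reduce function require checkpointing to be enabled):
# enable checkpointing, required by windowed stateful operations (directory name is arbitrary)
ssc.checkpoint("checkpoint")
pairs = lines.flatMap(lambda line: line.split(" ")).map(lambda word: (word, 1))
# count the words received in the last 20 seconds, re-evaluated every 5 seconds
windowedWordCounts = pairs.reduceByKeyAndWindow(lambda x, y: x + y, lambda x, y: x - y, 20, 5)
windowedWordCounts.pprint()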
Task 7.2¶
Study the Structured Streaming documentation and implement the same modification for the Streaming Wordcount task from the second example using Structured Streaming (a sketch of one possible approach is shown below).
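A minimal sketch of one possible approach (assuming the socket source from the second example; the includeTimestamp option adds the time at which each line was received, which can then be used for windowing):
from pyspark.sql.functions import explode, split, window
# read lines from the socket together with the timestamp of their arrival
lines = spark.readStream.format("socket").option("host", "localhost").option("port", 9999).option("includeTimestamp", "true").load()
words = lines.select(explode(split(lines.value, " ")).alias("word"), lines.timestamp)
# count the words in 20-second windows sliding every 5 seconds
windowedCounts = words.groupBy(window(words.timestamp, "20 seconds", "5 seconds"), words.word).count()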
Task 7.3¶
In the third example, choose a numerical attribute and discretize it. Using the categorical attribute created this way, write a streaming query that groups the sensors by its values and calculates the average values of the other numeric attributes for these groups (a sketch of one possible approach is shown below).
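A minimal sketch of one possible approach (the choice of attribute, the bin boundary, and the label values are arbitrary illustrations):
from pyspark.sql.functions import when, avg
# discretize 'ambient_temperature' into two illustrative bins
binned = df.withColumn("temperature_level", when(df["ambient_temperature"] >= 20.0, "high").otherwise("low"))
# average the remaining numeric attributes for each bin (an aggregation, so the "complete" output mode is needed when streaming it)
averages = binned.groupBy("temperature_level").agg(avg("humidity"), avg("photosensor"), avg("radiation_level"))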