Final Print Py Spark
import sys
A module is a python file which holds functions, classes, methods and variables.
import time
print(time.time())
import time as t
print(t.time())
import datetime
print(datetime.date.today())
from datetime import date
print(date.today())
so when we import a specific function or class, we can call it directly without the module prefix.
Module
How to import it
How to call the functions within that module
__main__
if we run the python file directly then the global variable __name__ is set to "__main__"
but if the file is imported then __name__ is set to the name of the file (the module name)
file1
======
print("name of this module is ", __name__)
file2
=======
import file1
if __name__ == "__main__":
    print("Executed when invoked directly")
else:
    print("Executed when imported")
# python comment
// scala comment
a=5
In Python, unlike statically typed languages like C or Java, there is no need to explicitly declare
the data type of a variable. In dynamically typed languages like Python, the interpreter infers the
datatype at runtime.
Named Function
================
def sum(a, b):
    return a + b

total = sum(3, 4)
print(total)
1. Case
scala uses camel case  => var totalCount = 10
python uses snake case => total_count
# this is a comment
in scala
import org.apache.spark.SparkContext
object First extends App {
  val arr = Array(1, 2, 3, 4, 5)
  arr.foreach(println)
}
in python
arr = [1, 2, 3, 4, 5]
for a in arr:
    print(a)
4. scala vs python
scala code
==========
import org.apache.spark.SparkContext
//common lines
val sc = new SparkContext("local[*]","wordcount")
val input = sc.textFile("/Users/trendytech/Desktop/data/input/file.txt")
val wordCounts = input.flatMap(x => x.split(" ")).map(x => (x, 1)) //split into words, map to (word, 1)
//takes two rows, does aggregation and returns one row
val finalCount = wordCounts.reduceByKey((x,y) => x+y)
//action
finalCount.collect.foreach(println)
Pyspark code
==============
# common lines
sc = SparkContext("local[*]", "wordcount")
input = sc.textFile("/Users/trendytech/Desktop/data/input/file.txt")
word_counts = input.flatMap(lambda x: x.split(" ")).map(lambda x: (x, 1))  # split into words, map to (word, 1)
final_count = word_counts.reduceByKey(lambda x, y: x + y)
result = final_count.collect()
for a in result:
    print(a)
=================
sc.setLogLevel("ERROR")
2. __name__
__main__
stdin.readline()
4. DAG
localhost:4040
hence the scala DAG matches our code, but the pyspark DAG does not.
if __name__ == "__main__":
    # common lines
    sc = SparkContext("local[*]", "wordcount")
    # sc.setLogLevel("ERROR")
    input = sc.textFile("/Users/trendytech/Desktop/data/search_data.txt")
    word_counts = input.flatMap(lambda x: x.split(" ")).map(lambda x: (x, 1))
    final_count = word_counts.reduceByKey(lambda x, y: x + y)
    result = final_count.collect()
    for a in result:
        print(a)
else:
    print("Not executed directly")
stdin.readline()
========
1. Lowercase
2. countByValue
if __name__ == "__main__":
    # common lines
    sc = SparkContext("local[*]", "wordcount")
    # sc.setLogLevel("ERROR")
    input = sc.textFile("/Users/trendytech/Desktop/data/search_data.txt")
    word_counts = input.flatMap(lambda x: x.split(" "))
    final_count = word_counts.countByValue()
    print(final_count)
else:
    print("Not executed directly")
3. sortByKey
if __name__ == "__main__":
    # common lines
    sc = SparkContext("local[*]", "wordcount")
    # sc.setLogLevel("ERROR")
    input = sc.textFile("/Users/trendytech/Desktop/data/search_data.txt")
    word_counts = input.flatMap(lambda x: x.split(" ")).map(lambda x: (x, 1)).reduceByKey(lambda x, y: x + y)
    result = word_counts.sortByKey().collect()
    for a in result:
        print(a)
else:
    print("Not executed directly")
4. sortBy
if __name__ == "__main__":
    # common lines
    sc = SparkContext("local[*]", "wordcount")
    # sc.setLogLevel("ERROR")
    input = sc.textFile("/Users/trendytech/Desktop/data/search_data.txt")
    word_counts = input.flatMap(lambda x: x.split(" ")).map(lambda x: (x, 1)).reduceByKey(lambda x, y: x + y)
    result = word_counts.sortBy(lambda x: x[1], False).collect()  # sort by the count
    for a in result:
        print(a)
else:
    print("Not executed directly")
stdin.readline()
sc = SparkContext("local[*]","customer-orders")
rdd1 = sc.textFile("/Users/trendytech/Desktop/data/customer-orders.csv")
result = rdd4.collect()
for a in result:
    print(a)
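The rdd2 to rdd4 steps are not captured in the notes. A minimal sketch of a typical completion, assuming the csv has the customer id in field 0 and the order amount in field 2 (both assumptions):

def parse_order(line):
    fields = line.split(",")
    return (int(fields[0]), float(fields[2]))   # assumed layout: (customer_id, amount)

rdd2 = rdd1.map(parse_order)                    # (customer_id, amount)
rdd3 = rdd2.reduceByKey(lambda x, y: x + y)     # total amount per customer
rdd4 = rdd3.sortBy(lambda x: x[1], False)       # highest spenders first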
2. Movie Rating
================
lines = sc.textFile("/Users/trendytech/Desktop/data/movie-data.data")
for a in result:
    print(a)
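The transformation that produces result is not in the notes. A minimal sketch, assuming movie-data.data is tab separated with the rating in field 2 and that we just want a ratings histogram (both assumptions):

ratings = lines.map(lambda line: line.split("\t")[2])   # assumed: rating is the 3rd field
rating_counts = ratings.countByValue()                  # local dict of rating -> count
result = sorted(rating_counts.items())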
def parseLine(line):
    fields = line.split(",")
    age = int(fields[2])
    numFriends = int(fields[3])
    return (age, numFriends)

sc = SparkContext("local[*]","FriendsByAge")
lines = sc.textFile("/Users/trendytech/Desktop/data/friends-data.csv")
rdd = lines.map(parseLine)
# (33,385) input
#(33,(385,1)) output
#(33,(3000,5))
#in scala we used to access the elements of a tuple using x._1, x._2; in python we use x[0], x[1]
result = averagesByAge.collect()
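The averagesByAge pipeline itself is not written out; a minimal sketch that matches the (33,(385,1)) -> (33,(3000,5)) comments above:

totals_by_age = rdd.mapValues(lambda x: (x, 1)) \
    .reduceByKey(lambda x, y: (x[0] + y[0], x[1] + y[1]))       # (age, (sumFriends, count))
averagesByAge = totals_by_age.mapValues(lambda x: x[0] / x[1])  # (age, average number of friends)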
Solution 1 :
rdd2.collect().foreach(println)
Solution 2 :
import org.apache.spark._
import org.apache.spark.SparkContext._
import org.apache.log4j._
import scala.math.min
def parseLine(line: String) = {
  val fields = line.split(",")
  val stationID = fields(0)
  val entryType = fields(2)
  val temperature = fields(3)
  (stationID, entryType, temperature)
}
Cont...
/** Our main function where the action happens */
def main(args: Array[String]) {
// Convert to (stationID, temperature)
val stationTemps = minTemps.map(x => (x._1, x._3.toFloat))
Problem Statement:
Score   Title
X1      Y1
X2      Y2
...     ...
Dataset Details:
Dataset1:
This will be a log of viewings, showing which user has viewed which chapter. These chapterIds
belong to courses, and chapterIds are unique.
But as we can see, there is no reference to courses in this raw data.
Dataset 3:
Note: For every exercise, the expected output for the sample raw data is provided. This is not the
actual output; it is provided for understanding purposes only.
You will work on the actual data, and the output will be based on the csv files provided to you.
**Business Rules:
**Hints
Exercise 1:
courseId count
1 1
1 1
2 1
3 1
3 1
Exercise 2:
You need to do a map to switch the key and the value. Now
that you have two RDDs, each keyed on chapterId, you can join
them together.
As each "row" in the RDD now represents "a chapter from this
course has been viewed", the chapterIds are no longer relevant.
You can get rid of them at this point - this will avoid dealing
with tricky nested tuples later on. At the same time, given
we're counting how many chapters of a course each user
watched, we will be counting shortly, so we can drop a "1" in
for each user/course combination.
Exercise 3:
***********************************************************************************
Pyspark week-10
================
sc = SparkContext("local[*]","KeywordAmount")
initial_rdd = sc.textFile("/Users/trendytech/Desktop/data/bigdata-campaign-data.csv")
result = sorted.take(20)
for x in result:
    print(x)
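The steps that produce `sorted` are not in the notes. A minimal sketch, assuming the campaign title is field 0 and the contribution amount is field 10 of the csv (both assumptions):

def map_func(line):
    fields = line.split(",")
    amount = float(fields[10])      # assumed position of the amount
    words = fields[0].split(" ")    # assumed position of the title / keywords
    return [(word.lower(), amount) for word in words]

mapped_rdd = initial_rdd.flatMap(map_func)
aggregated_rdd = mapped_rdd.reduceByKey(lambda x, y: x + y)
sorted = aggregated_rdd.sortBy(lambda x: x[1], False)   # note: shadows the python builtin, kept to match the notes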
=========
def loadBoringWords():
    boring_words = set(line.strip() for line in open("/Users/trendytech/Desktop/data/boringwords.txt"))
    return boring_words
sc = SparkContext("local[*]","KeywordAmount")
name_set = sc.broadcast(loadBoringWords())
initial_rdd = sc.textFile("/Users/trendytech/Desktop/data/bigdata-campaign-data.csv")
result = sorted.take(20)
for x in result:
    print(x)
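Again the middle steps are elided; a minimal sketch mirroring the previous example, now using the broadcast set to drop boring words (same field-position assumptions):

def map_func(line):
    fields = line.split(",")
    return [(word.lower(), float(fields[10])) for word in fields[0].split(" ")]

mapped_rdd = initial_rdd.flatMap(map_func)
filtered_rdd = mapped_rdd.filter(lambda x: x[0] not in name_set.value)   # lookup in the broadcast set
aggregated_rdd = filtered_rdd.reduceByKey(lambda x, y: x + y)
sorted = aggregated_rdd.sortBy(lambda x: x[1], False)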
Accumulator example
=====================
def blankLineChecker(line):
    if(len(line) == 0):
        myaccum.add(1)
sc = SparkContext("local[*]","AccumulatorExample")
myrdd = sc.textFile("/Users/trendytech/Desktop/data/samplefile.txt")
myaccum = sc.accumulator(0.0)
myrdd.foreach(blankLineChecker)
print(myaccum.value)
=======
you can use foreach on an rdd but not on a local variable, for example a list
a = rdd.collect()
# a is now a local python list, so we iterate over it with a normal for loop
======
from pyspark import SparkContext
sc = SparkContext("local[*]", "logLevelCount")
sc.setLogLevel("INFO")
if __name__ == "__main__":
    my_list = ["WARN: Tuesday 4 September 0405",
               "ERROR: Tuesday 4 September 0408",
               "ERROR: Tuesday 4 September 0408",
               "ERROR: Tuesday 4 September 0408",
               "ERROR: Tuesday 4 September 0408",
               "ERROR: Tuesday 4 September 0408"]
    original_logs_rdd = sc.parallelize(my_list)
else:
    original_logs_rdd = sc.textFile("/Users/trendytech/Desktop/data/logsample.txt")
    print("inside the else part")

result = resultant_rdd.collect()
for x in result:
    print(x)
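resultant_rdd is not defined in the notes; a minimal sketch that extracts the log level and counts it:

level_pairs = original_logs_rdd.map(lambda line: (line.split(":")[0], 1))   # ("WARN", 1) / ("ERROR", 1)
resultant_rdd = level_pairs.reduceByKey(lambda x, y: x + y)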
=========
groupByKey
reduceByKey
sc.setLogLevel("INFO")
base_rdd = sc.textFile("/Users/trendytech/Desktop/data/bigLog.txt")
grouped_rdd = mapped_rdd.groupByKey()
result = final_rdd.collect()
for x in result:
    print(x)
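mapped_rdd and final_rdd are not shown; a minimal sketch of the groupByKey variant around the lines above:

mapped_rdd = base_rdd.map(lambda line: (line.split(":")[0], line))   # key = log level, value = full line
final_rdd = grouped_rdd.map(lambda x: (x[0], len(x[1])))             # number of lines per level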
============
sc = SparkContext("local[*]", "LogLevelCount")
sc.setLogLevel("INFO")
base_rdd = sc.textFile("/Users/trendytech/Desktop/data/bigLog.txt")
result = reduced_rdd.collect()
for x in result:
    print(x)
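A minimal sketch of the missing mapped_rdd / reduced_rdd steps for the reduceByKey variant:

mapped_rdd = base_rdd.map(lambda line: (line.split(":")[0], 1))
reduced_rdd = mapped_rdd.reduceByKey(lambda x, y: x + y)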
==========
Miscellaneous things
=====================
1)
scala
=======
val a = 1 to 100
val base = sc.parallelize(a)
base.reduce((x,y) => x+y)
pyspark
=========
a = range(1,101)
base = sc.parallelize(a)
base.reduce(lambda x,y: x+y)
2)
input = sc.textFile("/Users/trendytech/Desktop/data/customer-orders.csv")
input.saveAsTextFile("/Users/trendytech/Desktop/data/output10")
3. Count - this is an action and works the same way as we saw in scala codes.
4. sc.defaultParallelism
rdd.getNumPartitions()
original_logs_rdd = sc.parallelize(my_list)
original_logs_rdd.getNumPartitions()
7) sc.defaultMinPartitions - 2
8) repartition
9) coalesce (see the sketch below)
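A quick illustration of points 4, 8 and 9 (the path and partition counts are only for illustration):

print(sc.defaultParallelism)
rdd = sc.textFile("/Users/trendytech/Desktop/data/customer-orders.csv")
print(rdd.getNumPartitions())
rdd_more = rdd.repartition(8)    # full shuffle, roughly equal-sized partitions
rdd_less = rdd_more.coalesce(2)  # avoids a full shuffle, partitions may be unequal
print(rdd_more.getNumPartitions(), rdd_less.getNumPartitions())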
Solution 1:
//Assignment - Problem 1
import org.apache.log4j.Level
import org.apache.log4j.Logger
import org.apache.spark.SparkConf
import org.apache.spark.sql.SaveMode
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types.DoubleType
import org.apache.spark.sql.types.IntegerType
import org.apache.spark.sql.types.StringType
import org.apache.spark.sql.types.StructField
import org.apache.spark.sql.types.StructType
val sparkConf = new SparkConf()
sparkConf.set("spark.master","local[2]")
//Step1 -creating a spark session
val spark = SparkSession.builder()
.config(sparkConf)
.getOrCreate()
//Step 2 - Setting the logging level to Error
Logger.getLogger("org").setLevel(Level.ERROR)
))
// Step 3 contd.. Loading the file and creation of dataframe using dataframe reader API, using explicitly specified schema
val windowdataDF = spark.read
  .format("csv")
  .schema(windowdataSchema)
  .option("path", "C:/xyz/TrendyTech/Spark_data/structuredAPI/windowdata.csv")
  .load()
//Step 4: Saving the data in Parquet format using Dataframe Writer API
//Data is two-level partitioned on the Country and weeknum columns; these columns have low cardinality
//Default output format is parquet
/* windowdataDF.write
  .partitionBy("Country", "weeknum")
  .mode(SaveMode.Overwrite)
  .option("path","C:/xyz/TrendyTech/Spark_data/structuredAPI/Output/windowdata_output")
  .save()
*/
//Step 5: Save the Dataframe to Avro Format and also partitioning data by Country column
windowdataDF.write
  .format("avro")
  .partitionBy("Country")
  .mode(SaveMode.Overwrite)
  .option("path","C:/xyz/TrendyTech/Spark_data/structuredAPI/Output/windowdata_avrooutput")
  .save()
Solution 2:
import org.apache.log4j.Level
import org.apache.log4j.Logger
import org.apache.spark.SparkConf
import org.apache.spark.sql.SaveMode
import org.apache.spark.sql.SparkSession
object WEEK11_SOLUTION_2_WINDOWDATA extends App {
// Setting the logging level to ERROR
Logger.getLogger("org").setLevel(Level.ERROR)
windowDataDF.write
  .format("json")
  .mode(SaveMode.Overwrite)
  .option("path", "G:/TRENDY~TECH/WEEK-11/Assignment_Dataset/OutPut_Prb2/windowData_jsonoutput")
  .save()
//windowDataDF.show()
spark.stop()
scala.io.StdIn.readLine()
}
Pyspark week 11
================
2. spark-submit
sc = SparkContext("local[*]","PremiumCustomers")
base_rdd = sc.textFile("/Users/trendytech/Desktop/data/customer-orders.csv")
doubled_amount = premium_customers.map(lambda x: (x[0], x[1]*2)).persist(StorageLevel.MEMORY_ONLY)
result = doubled_amount.collect()
for x in result:
    print(x)
print(doubled_amount.count())
stdin.readline()
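premium_customers is not defined in the notes; a minimal sketch, assuming the same customer-orders layout as before and that "premium" means a total spend above 5000 (assumptions):

from pyspark import StorageLevel   # needed for persist(StorageLevel.MEMORY_ONLY)

customer_amounts = base_rdd.map(lambda line: (int(line.split(",")[0]), float(line.split(",")[2])))
totals = customer_amounts.reduceByKey(lambda x, y: x + y)
premium_customers = totals.filter(lambda x: x[1] > 5000)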
spark-submit /Users/trendytech/PycharmProjects/pysparklearning/module1.py
Ratings.dat
Movies.dat
import org.apache.log4j.Logger
import org.apache.log4j.Level
import org.apache.spark.SparkContext
Logger.getLogger("org").setLevel(Level.ERROR)
val sc = new SparkContext("local[*]","joindemo")
topMoviesRdd.collect.foreach(println)
}
=========================
equivalent pyspark code
=========================
sc = SparkContext("local[*]","joindemo")
ratings_rdd = sc.textFile("/Users/trendytech/Desktop/data/ratings.dat")
movies_rdd= sc.textFile("/Users/trendytech/Desktop/data/movies.dat")
joined_rdd = movies_mapped_rdd.join(final_rdd)
result = top_movies_rdd.collect()
for x in result:
    print(x)
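Several of the RDDs above (final_rdd, movies_mapped_rdd, top_movies_rdd) are not defined in the notes. A minimal sketch, assuming the usual "::" delimited movielens layout and that "top" means movies rated more than 1000 times (assumptions):

ratings_pairs = ratings_rdd.map(lambda line: (line.split("::")[1], 1))        # (movieid, 1)
rating_counts = ratings_pairs.reduceByKey(lambda x, y: x + y)                 # (movieid, num_ratings)
final_rdd = rating_counts.filter(lambda x: x[1] > 1000)
movies_mapped_rdd = movies_rdd.map(lambda line: (line.split("::")[0], line.split("::")[1]))  # (movieid, title)
joined_rdd = movies_mapped_rdd.join(final_rdd)                                # (movieid, (title, num_ratings))
top_movies_rdd = joined_rdd.sortBy(lambda x: x[1][1], False)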
Structured API's
==================
my_conf = SparkConf()
my_conf.set("spark.app.name", "my first application")
my_conf.set("spark.master","local[*]")
spark = SparkSession.builder.config(conf=my_conf).getOrCreate()
orderDf = spark.read.csv("/Users/trendytech/Desktop/data/orders.csv")
orderDf.show()
spark.stop()
=======
find the total orders placed by each customer where customer id > 10000
my_conf = SparkConf()
my_conf.set("spark.app.name", "my first application")
my_conf.set("spark.master","local[*]")
spark = SparkSession.builder.config(conf=my_conf).getOrCreate()
orderDf = spark.read.option("header",True).option("inferSchema",True).csv("/Users/trendytech/Desktop/data/orders.csv")
groupedDf = orderDf.repartition(4) \
.where("order_customer_id > 10000") \
.select("order_id","order_customer_id") \
.groupBy("order_customer_id") \
.count()
groupedDf.show()
===============
so when we give a column name which does not exist, the error is shown at runtime and not at compile time.
2. standard way
orderDf = spark.read.format("csv")\
.option("header",True)\
.option("inferSchema",True)\
.option("path","/Users/trendytech/Desktop/data/orders.csv")\
.load()
orderDf = spark.read.format("json")\
.option("path","/Users/trendytech/Desktop/data/orders.json")\
.load()
Spark StructuredAPIs -Assignment Solutions
Assignment 1 :
Code:
import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
import org.apache.log4j.Level
import org.apache.log4j.Logger
import org.apache.spark.sql.functions._
//Load the department data into a Dataframe using dataframe reader API
// deptDf.show()
// deptDf.printSchema()
//Load the employee data into a Dataframe using dataframe reader API
// employeeDf.show()
// employeeDf.printSchema()
//Joining of two dataframes using left outer join, with the department dataframe on the left side
//Use first function so as to get other columns also along with aggregated columns
joinedDfNew.groupBy("deptid").agg(count("empname").as("empcount"),first("deptName").as("deptName")).dropDuplicates("deptName").show()
spark.stop()
}
Output:
Assignment 2
Find the top movies as shown in spark practical 18 using broadcast join. Use
Dataframes or Datasets to solve it this time.
Code:
import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
import org.apache.log4j.Level
import org.apache.log4j.Logger
import org.apache.spark.sql.functions._
//Transform to a Dataframe:
import spark.implicits._
// ratingsDf.show()
// ratingsDf.printSchema()
val moviesRDD = spark.sparkContext.textFile("C:/TrendyTech/SparkExamples/movies.dat")
val moviesNewDf = moviestransformedRDD.toDF().select("movieid","moviename")
// moviesNewDf.show()
//moviesNewDf.printSchema()
//transformedmovieDf.show()
// popularMoviesDf.show()
//Now we want to associate the Movie names also, so we use a broadcast join
val finalPopularMoviesDf = popularMoviesDf.join(broadcast(moviesNewDf),joinCondition,joinType).drop(popularMoviesDf.col("movieid")).sort(desc("avgMovieRating")) //joining the 2 dataframes using broadcast join where movies data is the smaller dataset
finalPopularMoviesDf.drop("movieViewCount","movieid","avgMovieRating").show(false)
spark.stop()
Output:
Assignment 3
File A is a text file of size 1.2 GB in HDFS at location /loc/x. It contains match by match
statistics of runs scored by all the batsmen in the history of cricket.
File B is a text file of size 1.2 MB present in local dir /loc/y. It contains the list of batsmen
playing in cricket world cup 2019.
File A:
1 Rohit_Sharma India 200 100.2
1 Virat_Kohli India 100 98.02
1 Steven_Smith Aus 77 79.23
35 Clive_Lloyd WI 29 37.00
243 Rohit_Sharma India 23 150.00
243 Faf_du_Plesis SA 17 35.06
File B:
Rohit_Sharma India
Steven_Smith Aus
Virat_Kohli India
Find the batsman participating in 2019 who has the best average of scoring runs in his
career. Solve this using Dataframes or Datasets.
Code:
import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
import org.apache.log4j.Level
import org.apache.log4j.Logger
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.types.StringType
import org.apache.spark.sql.types.StructField
import org.apache.spark.sql.Row
val batsmenHistoryRDD = spark.sparkContext.textFile("C:/TrendyTech/SparkExamples/FileA_BatsmenDetails_History.txt")
// Dataframe creation
import spark.implicits._
//batsmenHistoryDf.show()
//batsmenHistoryDf.printSchema()
//Calculating Average runs scored by a batsman in history, with highest average at top
val batsmenBestRunsAvgHistoryDf = batsmenHistoryDf.groupBy("Batsman").agg(avg("RunsScored").as("AverageRunsScored")).select("Batsman","AverageRunsScored")
//batsmenBestRunsAvgHistoryDf.sort(col("AverageRunsScored").desc).show()
//Alternative approach instead of using a case class, though a case class can also be used instead
//Programmatically create an explicit schema of the worldcup 2019 file:
batsmenWorldCupDf.show()
batsmenWorldCupDf.printSchema()
val finalBestBatsmenPlayingWorldCupDf = batsmenBestRunsAvgHistoryDf.join(broadcast(batsmenWorldCupDf),joinCondition,joinType).drop(batsmenBestRunsAvgHistoryDf.col("Batsman"))
finalBestBatsmenPlayingWorldCupDf.orderBy(desc("AverageRunsScored")).show()
spark.stop()
Output:
+-----------------+------------+
|AverageRunsScored| batsman|
+-----------------+------------+
| 111.5|Rohit_Sharma|
| 100.0| Virat_Kohli|
| 77.0|Steven_Smith|
+-----------------+------------+
**********************************************************************
Problem 1:
Sample output
depName,deptid,empcount
IT,11,1
HR,21,1
Marketing,31,1
Fin,41,2
Admin,51,0
Problem 2:
Find the top movies as shown in spark practical 18 using broadcast join. Use
Dataframes or Datasets to solve it this time.
Problem 3:
File A is a text file of size 1.2 GB in HDFS at location /loc/x. It contains match by
match statistics of runs scored by all the batsmen in the history of cricket.
File B is a text file of size 1.2 MB present in local dir /loc/y. It contains the list of
batsmen playing in cricket world cup 2019.
File A:
MatchNumber Batsman Team RunsScored StrikeRate
1 Rohit Sharma India 200 100.2
1 Virat Kohli India 100 98.02
1 Steven Smith Aus 77 79.23
35 Clive Lloyd WI 29 37.00
243 Rohit Sharma India 23 150.00
243 Faf du Plesis SA 17 35.06
File B:
Batsman Team
Rohit_Sharma India
Steven_Smith Aus
Virat_Kohli India
Question: Find the batsman participating in 2019 who has the best average of scoring runs in his career. Solve using Dataframes or Datasets.
1. reading the data - Reader API
Scala
======
orderDf.write.format("csv")
.mode(SaveMode.Overwrite)
.option("path","/Users/trendytech/Desktop/newfolder1")
.save()
pyspark
======
orderDf.write.format("csv")\
.mode("overwrite")\
.option("path","/Users/trendytech/Desktop/newfolder1")\
.save()
my_conf = SparkConf()
my_conf.set("spark.app.name", "my first application")
my_conf.set("spark.master","local[*]")
spark = SparkSession.builder.config(conf=my_conf).getOrCreate()
orderDf = spark.read.format("csv")\
.option("header",True)\
.option("inferSchema",True)\
.option("path","/Users/trendytech/Desktop/data/orders.csv")\
.load()
ordersRep = orderDf.repartition(4)
ordersRep.write.format("csv")\
.mode("overwrite")\
.option("path","/Users/trendytech/Desktop/newfolder1")\
.save()
====
overwrite
append
errorIfExists
ignore
=====
Parquet is the default file format in Apache Spark when we talk about the structured APIs
=====
3. bucketBy
4. maxRecordsPerFile
my_conf = SparkConf()
my_conf.set("spark.app.name", "my first application")
my_conf.set("spark.master","local[*]")
spark = SparkSession.builder.config(conf=my_conf).getOrCreate()
orderDf = spark.read.format("csv")\
.option("header",True)\
.option("inferSchema",True)\
.option("path","/Users/trendytech/Desktop/data/orders.csv")\
.load()
orderDf.write.format("csv").partitionBy("order_status")\
.mode("overwrite")\
.option("path","/Users/trendytech/Desktop/newfolder4")\
.save()
=======
Avro
3.1.2 pyspark
my_conf = SparkConf()
my_conf.set("spark.app.name", "my first application")
my_conf.set("spark.master","local[*]")
my_conf.set("spark.jars","/Users/trendytech/Downloads/spark-avro_2.12-3.1.2.jar")
spark = SparkSession.builder.config(conf=my_conf).getOrCreate()
orderDf = spark.read.format("csv")\
.option("header",True)\
.option("inferSchema",True)\
.option("path","/Users/trendytech/Desktop/data/orders.csv")\
.load()
orderDf.write.format("avro")\
.mode("overwrite")\
.option("path","/Users/trendytech/Desktop/newfolder4")\
.save()
====
Spark SQL
spark = SparkSession.builder.config(conf=my_conf).getOrCreate()
orderDf = spark.read.format("csv")\
.option("header",True)\
.option("inferSchema",True)\
.option("path","/Users/trendytech/Desktop/data/orders.csv")\
.load()
orderDf.createOrReplaceTempView("orders")
resultDf.show()
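The query behind resultDf is not captured in the notes; one plausible example, reusing the earlier exercise (total orders per customer with id > 10000):

resultDf = spark.sql("select order_customer_id, count(*) as cnt from orders where order_customer_id > 10000 group by order_customer_id")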
====
my_conf = SparkConf()
my_conf.set("spark.app.name", "my first application")
my_conf.set("spark.master","local[*]")
spark = SparkSession.builder.config(conf=my_conf).getOrCreate()
orderDf = spark.read.format("csv")\
.option("header",True)\
.option("inferSchema",True)\
.option("path","/Users/trendytech/Desktop/data/orders.csv")\
.load()
orderDf.createOrReplaceTempView("orders")
====
Table has 2 parts - the actual data and the metadata (schema, table name etc.)
my_conf = SparkConf()
my_conf.set("spark.app.name", "my first application")
my_conf.set("spark.master","local[*]")
spark = SparkSession.builder.config(conf=my_conf).getOrCreate()
orderDf = spark.read.format("csv")\
.option("header",True)\
.option("inferSchema",True)\
.option("path","/Users/trendytech/Desktop/data/orders.csv")\
.load()
orderDf.write.format("csv")\
.mode("overwrite")\
.saveAsTable("orders1")
========
my_conf = SparkConf()
my_conf.set("spark.app.name", "my first application")
my_conf.set("spark.master","local[*]")
spark = SparkSession.builder.config(conf=my_conf).getOrCreate()
orderDf = spark.read.format("csv")\
.option("header",True)\
.option("inferSchema",True)\
.option("path","/Users/trendytech/Desktop/data/orders.csv")\
.load()
==========
my_conf = SparkConf()
my_conf.set("spark.app.name", "my first application")
my_conf.set("spark.master","local[*]")
spark = SparkSession.builder.config(conf=my_conf).enableHiveSupport().getOrCreate()
orderDf = spark.read.format("csv")\
.option("header",True)\
.option("inferSchema",True)\
.option("path","/Users/trendytech/Desktop/data/orders.csv")\
.load()
orderDf.write.format("csv")\
.mode("overwrite")\
.saveAsTable("retail.orders3")
============
my_conf = SparkConf()
my_conf.set("spark.app.name", "my first application")
my_conf.set("spark.master","local[*]")
spark = SparkSession.builder.config(conf=my_conf).enableHiveSupport().getOrCreate()
orderDf = spark.read.format("csv")\
.option("header",True)\
.option("inferSchema",True)\
.option("path","/Users/trendytech/Desktop/data/orders.csv")\
.load()
orderDf.write.format("csv")\
.mode("overwrite")\
.bucketBy(4,"order_customer_id")\
.sortBy("order_customer_id")\
.saveAsTable("retail.orders4")
============
Spark DF session 12
===================
my_conf = SparkConf()
my_conf.set("spark.app.name", "my first application")
my_conf.set("spark.master","local[*]")
spark = SparkSession.builder.config(conf=my_conf).getOrCreate()
lines_df = spark.read.text("/Users/trendytech/Desktop/data/orders_new.csv")
#lines_df.printSchema()
#lines_df.show()
final_df = lines_df.select(regexp_extract('value',myregex,1).alias("order_id"),regexp_extract('value',myregex,2).alias("date"),regexp_extract('value',myregex,3).alias("customer_id"),regexp_extract('value',myregex,4).alias("status"))
final_df.printSchema()
final_df.show()
final_df.select("order_id").show()
final_df.groupby("status").count().show()
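myregex is never defined in the notes; a hypothetical definition (the real pattern depends on how orders_new.csv is laid out), plus the import the snippet needs:

from pyspark.sql.functions import regexp_extract
myregex = r'^(\S+)\s+(\S+)\s+(\S+)\s+(\S+)'   # 4 capture groups: order_id, date, customer_id, status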
===============
spark df session 13
==============
Column String
Column object
my_conf = SparkConf()
my_conf.set("spark.app.name", "my first application")
my_conf.set("spark.master","local[*]")
spark = SparkSession.builder.config(conf=my_conf).getOrCreate()
orderDf = spark.read.format("csv")\
.option("header",True)\
.option("inferSchema",True)\
.option("path","/Users/trendytech/Desktop/data/orders.csv")\
.load()
orderDf.select("order_id","order_date").show()
orderDf.select(col("order_id")).show()
==========
Spark DF Session 14
=====================
if the age is greater than 18 we have to populate the 4th column named Adult with "Y"
my_conf = SparkConf()
my_conf.set("spark.app.name", "my first application")
my_conf.set("spark.master","local[*]")
spark = SparkSession.builder.config(conf=my_conf).getOrCreate()
df = spark.read.format("csv")\
.option("inferSchema",True)\
.option("path","/Users/trendytech/Desktop/data/dataset1")\
.load()
df1 = df.toDF("name","age","city")
def ageCheck(age):
    if(age > 18):
        return "Y"
    else:
        return "N"
parseAgeFunction = udf(ageCheck,StringType())
df2 = df1.withColumn("adult",parseAgeFunction("age"))
df2.printSchema()
df2.show()
===========
spark = SparkSession.builder.config(conf=my_conf).getOrCreate()
df = spark.read.format("csv")\
.option("inferSchema",True)\
.option("path","/Users/trendytech/Desktop/data/dataset1")\
.load()
df1 = df.toDF("name","age","city")
def ageCheck(age):
    if(age > 18):
        return "Y"
    else:
        return "N"
spark.udf.register("parseAgeFunction",ageCheck,StringType())
for x in spark.catalog.listFunctions():
    print(x)
df2 = df1.withColumn("adult",expr("parseAgeFunction(age)"))
df2.show()
============
Spark DF session 15
=====================
create a dataframe from this local list and give column names
my_conf = SparkConf()
my_conf.set("spark.app.name", "my first application")
my_conf.set("spark.master","local[*]")
spark = SparkSession.builder.config(conf=my_conf).getOrCreate()
myList = [(1,"2013-07-25",11599,"CLOSED"),
(2,"2014-07-25",256,"PENDING_PAYMENT"),
(3,"2013-07-25",11599,"COMPLETE"),
(4,"2019-07-25",8827,"CLOSED")]
ordersDf = spark.createDataFrame(myList)\
.toDF("orderid","orderdate","customerid","status")
newDf = ordersDf\
.withColumn("date1",unix_timestamp(col("orderdate"))) \
.withColumn("newid", monotonically_increasing_id()) \
.dropDuplicates(["orderdate","customerid"])\
.drop("orderid")\
.sort("orderdate")
ordersDf.printSchema()
ordersDf.show()
newDf.show()
===========
Spark DF session 16
=====================
Aggregate transformations
1. Simple aggregations
2. Grouping aggregations
3. window aggregates
//simple aggregates
my_conf = SparkConf()
my_conf.set("spark.app.name", "my first application")
my_conf.set("spark.master","local[*]")
spark = SparkSession.builder.config(conf=my_conf).getOrCreate()
invoiceDF = spark.read
.format("csv")
.option("header",true)
.option("inferSchema",true)
.option("path","/Users/trendytech/Desktop/order_data.csv")
.load()
//spark SQL
invoiceDF.createOrReplaceTempView("sales")
spark.sql("select count(*),sum(Quantity),avg(UnitPrice),count(distinct(InvoiceNo)) from
sales").show
spark.stop()
}
my_conf = SparkConf()
my_conf.set("spark.app.name", "my first application")
my_conf.set("spark.master","local[*]")
spark = SparkSession.builder.config(conf=my_conf).getOrCreate()
invoiceDF = spark.read\
.format("csv")\
.option("header",True)\
.option("inferSchema",True) \
.option("path","/Users/trendytech/Desktop/data/order_data.csv") \
.load()
invoiceDF.select(
count("*").alias("RowCount"),
sum("Quantity").alias("TotalQuantity"),
avg("UnitPrice").alias("AvgPrice"),
countDistinct("InvoiceNo").alias("CountDistinct")).show()
invoiceDF.selectExpr(
"count(*) as RowCount",
"sum(Quantity) as TotalQuantity",
"avg(UnitPrice) as AvgPrice",
"count(Distinct(InvoiceNo)) as CountDistinct").show()
invoiceDF.createOrReplaceTempView("sales")
spark.sql("select count(*),sum(Quantity),avg(UnitPrice),count(distinct(InvoiceNo)) from
sales").show()
=============
my_conf = SparkConf()
my_conf.set("spark.app.name", "my first application")
my_conf.set("spark.master","local[*]")
spark = SparkSession.builder.config(conf=my_conf).getOrCreate()
invoiceDF = spark.read\
.format("csv")\
.option("header",True)\
.option("inferSchema",True) \
.option("path","/Users/trendytech/Desktop/data/order_data.csv") \
.load()
summaryDF.show()
#string expression
summaryDf1 = invoiceDF.groupBy("Country","InvoiceNo")\
    .agg(expr("sum(Quantity) as TotalQuantity"),
         expr("sum(Quantity * UnitPrice) as InvoiceValue"))
summaryDf1.show()
#spark SQL
invoiceDF.createOrReplaceTempView("sales")
my_conf = SparkConf()
my_conf.set("spark.app.name", "my first application")
my_conf.set("spark.master","local[*]")
spark = SparkSession.builder.config(conf=my_conf).getOrCreate()
invoiceDF = spark.read\
.format("csv")\
.option("header",True)\
.option("inferSchema",True) \
.option("path","/Users/trendytech/Desktop/data/order_data.csv") \
.load()
summaryDF.show()
#string expression
summaryDf1 = invoiceDF.groupBy("Country","InvoiceNo")\
    .agg(expr("sum(Quantity) as TotalQuantity"),
         expr("sum(Quantity * UnitPrice) as InvoiceValue"))
summaryDf1.show()
#spark SQL
invoiceDF.createOrReplaceTempView("sales")
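summaryDF itself is never defined in these notes; one possible definition using column objects instead of string expressions:

from pyspark.sql.functions import sum, expr
summaryDF = invoiceDF.groupBy("Country", "InvoiceNo") \
    .agg(sum("Quantity").alias("TotalQuantity"),
         sum(expr("Quantity * UnitPrice")).alias("InvoiceValue"))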
==============
my_conf = SparkConf()
my_conf.set("spark.app.name", "my first application")
my_conf.set("spark.master", "local[*]")
spark = SparkSession.builder.config(conf=my_conf).getOrCreate()
invoiceDF = spark.read \
.format("csv") \
.option("header", True) \
.option("inferSchema", True) \
.option("path", "/Users/trendytech/Desktop/data/windowdata.csv") \
.load()
myWindow = Window.partitionBy("country")\
.orderBy("weeknum")\
.rowsBetween(Window.unboundedPreceding, Window.currentRow)
mydf = invoiceDF.withColumn("RunningTotal",sum("invoicevalue").over(myWindow))
mydf.show()
==========
spark2-shell --master yarn
rdd1 = sc.textFile("bigLogNew.txt")
rdd3 = rdd2.groupByKey()
rdd4.collect()
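rdd2 and rdd4 are not written out; a minimal sketch of the grouping pipeline assumed here:

rdd2 = rdd1.map(lambda line: (line.split(":")[0], line))   # key = WARN / ERROR
rdd4 = rdd3.map(lambda x: (x[0], len(x[1])))               # count of lines per level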
=========
import random

def randomGenerator():
    return random.randint(1, 60)

def myFunction(x):
    if(x[0][0:4] == "WARN"):
        return ("WARN", x[1])
    else:
        return ("ERROR", x[1])
rdd1 = sc.textFile("bigLogNew.txt")
rdd3 = rdd2.groupByKey()
rdd4.cache()
rdd6.collect()
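The salted-key steps (rdd2, rdd4, rdd5) are not in the notes; a minimal sketch of how the salting is usually wired up with the two helper functions above:

rdd2 = rdd1.map(lambda line: (line.split(":")[0] + str(randomGenerator()), line))  # e.g. "WARN37", spreads the skewed keys
rdd4 = rdd3.map(lambda x: (x[0], len(x[1])))     # partial count per salted key
rdd5 = rdd4.map(myFunction)                      # strip the salt back to WARN / ERROR
rdd6 = rdd5.reduceByKey(lambda x, y: x + y)      # final count per level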
Broadcast join can be used when we have 1 large table and 1 small table.
Input:
ERROR: Thu Jun 04 10:37:51 BST 2015
WARN: Sun Nov 06 10:37:51 GMT 2016
WARN: Mon Aug 29 10:37:51 BST 2016
ERROR: Thu Dec 10 10:37:51 GMT 2015
ERROR: Fri Dec 26 10:37:51 GMT 2014
ERROR: Thu Feb 02 10:37:51 GMT 2017
WARN: Fri Oct 17 10:37:51 BST 2014
ERROR: Wed Jul 01 10:37:51 BST 2015
WARN: Thu Jul 27 10:37:51 BST 2017
WARN: Thu Oct 19 10:37:51 BST 2017
output:
(ERROR,Thu Jun 04 10:37:51 BST 2015)
(WARN, Sun Nov 06 10:37:51 GMT 2016)
MAP TRANSFORMATION
RDD2 is Large
(ERROR,Thu Jun 04 10:37:51 BST 2015)
(WARN, Sun Nov 06 10:37:51 GMT 2016)
RDD3 IS SMALL
("ERROR",0)
("WARN",1)
val a = Array(
("ERROR",0),
("WARN",1)
)
rdd3.saveAsTextFile("joinresults2")
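A pyspark rendition of the same map-side (broadcast) join, assuming rdd2 is the large RDD of (level, timestamp) pairs shown above:

small_table = dict([("ERROR", 0), ("WARN", 1)])
broadcasted = sc.broadcast(small_table)
rdd3 = rdd2.map(lambda x: (x[0], (x[1], broadcasted.value.get(x[0]))))   # no shuffle involved
rdd3.saveAsTextFile("joinresults2")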
orders is 2.6 GB
customers is around 900 kb
val customerDF = spark.read.format("csv").option("header",true).option("inferSchema",true).option("path","customers.csv").load
val orderDF = spark.read.format("csv").option("header",true).option("inferSchema",true).option("path","orders.csv").load
spark.conf.set("spark.sql.autoBroadcastJoinThreshold",-1)
if I do groupBy on a Dataframe then we will get 200 partitions after the shuffling is done.
a 500 mb file will have 4 partitions (128 mb block size).
after groupBy on an rdd the number of partitions remains the same, that means 4 partitions.
2 things..
import org.apache.spark.sql.types._
val customerDF = spark.read.format("csv").option("header",true).option("inferSchema",true).option("path","customers.csv").load
val orderDF = spark.read.format("csv").schema(ordersSchema).option("header",true).option("path","orders.csv").load
joinedDF.take(1000000)
increase the --driver-memory when you are collecting more data on driver machine.
--num-executors
--driver-memory
--executor-memory
--executor-cores
===============================
Spark Optimization Session - 15
===============================
a) client
b) cluster
8 partitions..
a cluster of 4 nodes..
rdd.coalesce(4)
when we use coalesce, the resultant partitions can be of unequal sizes.
when we use repartition, a full shuffle is involved, which is time consuming, but the resultant
partitions will be of roughly equal size.
2. bundle your code as a java jar file and export the jar.
wordcount.jar
spark2-submit \
--class LogLevelGrouping \
--master yarn \
--deploy-mode cluster \
--executor-memory 3G \
--num-executors 4 \
wordcount.jar bigLogNew.txt
since the deploy mode is cluster mode that means our driver is running on one of the executors
residing in the cluster.
spark2-submit \
--class LogLevelGrouping \
--master yarn \
--executor-memory 3G \
--num-executors 4 \
wordcount.jar bigLogNew.txt
Join optimizations
==================
100 node cluster each worker node having 64 GB RAM & 16 CPU Cores.
50 executors with 5 cores each - 250 total cpu cores (at the max 250 tasks)
spark.sql.shuffle.partitions
after the shuffle only at most 100 partitions will have data and the others will be empty.
whenever the cardinality of data is low then some of the partitions will be empty.
100
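A sketch of tuning the shuffle partition count to match the data's cardinality (the value 100 here is just the example from above):

spark.conf.set("spark.sql.shuffle.partitions", 100)
print(spark.conf.get("spark.sql.shuffle.partitions"))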
Skew Partitions
================
whichever task is working on this heavily loaded partition will be very slow.
there should not be partition skew, else the job will be delayed.
orders - order_customer_id
customers - customer_id
bucketed both the tables on the join column and sorted it.
SMB join.
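A hypothetical sketch of preparing both sides for a sort merge bucket join (dataframe names, bucket count and file format are assumptions; the retail database is the one used earlier):

ordersDf.write.format("parquet") \
    .bucketBy(8, "order_customer_id") \
    .sortBy("order_customer_id") \
    .mode("overwrite") \
    .saveAsTable("retail.orders_bucketed")

customersDf.write.format("parquet") \
    .bucketBy(8, "customer_id") \
    .sortBy("customer_id") \
    .mode("overwrite") \
    .saveAsTable("retail.customers_bucketed")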
in mysql we will have a table and we will try to create a dataframe by directly connecting from
that.
mysql-connector-java.jar
spark-shell --driver-class-path /usr/share/java/mysql-connector-java.jar
mysql_props.setProperty("user","sqoopuser")
mysql_props.setProperty("password","NHkkP876rp")
orderDF.show()
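connection_url is never shown in the notes; a hypothetical pyspark version of the same JDBC read (the host and database in the url are placeholders):

connection_url = "jdbc:mysql://<mysql-host>:3306/<database>"
orderDF = spark.read.jdbc(connection_url, "orders",
                          properties={"user": "sqoopuser", "password": "NHkkP876rp"})
orderDF.show()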
orders.csv - 2.6 GB
order_id,order_date,order_customer_id,order_status
1,2013-07-25 00:00:00.0,11599,CLOSED
2,2013-07-25 00:00:00.0,256,PENDING_PAYMENT
3,2013-07-25 00:00:00.0,12111,COMPLETE
4,2013-07-25 00:00:00.0,8827,CLOSED
5,2013-07-25 00:00:00.0,11318,COMPLETE
6,2013-07-25 00:00:00.0,7130,COMPLETE
7,2013-07-25 00:00:00.0,4530,COMPLETE
8,2013-07-25 00:00:00.0,2911,PROCESSING
9,2013-07-25 00:00:00.0,5657,PENDING_PAYMENT
we want to find out the number of orders which are placed by each customer in each month.
orderDF.createOrReplaceTempView("orders")
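A plausible version of the query for "number of orders placed by each customer in each month" (a fuller variant with the month-number cast appears later in these notes):

spark.sql("select order_customer_id, date_format(order_date, 'MMMM') as orderdt, count(1) as cnt from orders group by order_customer_id, orderdt").show()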
2 questions
============
sort aggregate
===============
customer_id:month value
1024:january 1 "1"
1024:january 1 "1"
1024:january 1 "1"
1024:january 1 "1"
1025:january 1 "1"
1025:january 1 "1"
1024:january ,{1,1,1,1,1}
2000 rows
sort is O(nlogn)
1000 * log(1000) ≈ 1000 * 10 = 10000 comparisons
2000 * 11 = 22000 comparisons
1024:january 1 "1"
1024:january 1 "1"
1024:january 1 "1"
1025:january 1 "1"
1025:january 1 "1"
Hash Aggregate
===============
hash table
============
customer_id:month value
1024:january 3 "1"
1025:january 2 "1"
no sorting is required..
additional memory is required to have the hashtable kind of structure.
1000 rows...
question 2: why in the first query it used sort aggregate and why in second query it used hash
aggregate..
customer_id:month value
1024:january 3 1
string is immutable
when we are using hash aggregate we should have mutable types in the values
Catalyst optimizer
Structured API's (DF, DS, Spark SQL) perform better than Raw RDD's
catalyst optimizer will optimize the execution plan for Structured API's
Many rules are already available. Also if we want we can add our own optimization rules.
Students.csv - 60 mb
student_id,exam_center_id,subject,year,quarter,score,grade
1,1,Math,2005,1,41,D
1,1,Spanish,2005,1,51,C
1,1,German,2005,1,39,D
1,1,Physics,2005,1,35,D
1,1,Biology,2005,1,53,C
1,1,Philosophy,2005,1,73,B
1,1,Modern Art,2005,1,32,E
1,1,History,2005,1,43,D
1,1,Geography,2005,1,54,C
val df1 = spark.read.format("csv").option("header",true).option("inferSchema",true).option("path","/Users/trendytech/Desktop/students.csv").load
df1.createOrReplaceTempView("students")
syntax is correct
it will try to resolve the table name, the column names etc.
if the column name or table name is not available then we will get an AnalysisException.
combining of filters..
combining of projections
in physical plan..
physical plan1
===============
sortAggregate
physical plan2
===============
HashAggregate
It will select the physical plan which is the most optimized one with minimum cost.
a*b
if b is 1 then return a
Catalyst optimizer
Physical Plan
df1.createOrReplaceTempView("students")
http://localhost:4040/
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.catalyst.rules.Rule
import org.apache.spark.sql.catalyst.expressions.Multiply
import org.apache.spark.sql.catalyst.expressions.Literal
spark.experimental.extraOptimizations = Seq(MultiplyOptimizationRule)
Problem Statement: We are creating an array as one source and the second source is a file. We are going to do a join on both the sources.
Solution:
Scala Spark Program:
spark2-shell --conf spark.dynamicAllocation.enabled=false --master yarn --num-executors 6 --executor-cores 2 --executor-memory 3G --conf spark.ui.port=4063

PySpark Program:
pyspark --conf spark.dynamicAllocation.enabled=false --master yarn --num-executors 6 --executor-cores 2 --executor-memory 3G --conf spark.ui.port=4063
Problem Statement: We are creating an array as one source and the second source is a file. We are going to do a join on both the sources. In the above program we did a normal join, and in this program we will use a broadcast join.
Solution:
Scala Spark Program:
spark2-shell --conf spark.dynamicAllocation.enabled=false --master yarn --num-executors 6 --executor-cores 2 --executor-memory 3G --conf spark.ui.port=4063
spark.conf.set("spark.sql.autoBroadcastJoinThreshold",-1)
val joinedDF = customerDF.join(orderDF,customerDF("customer_id") === orderDF("order_customer_id"))
joinedDF.write.csv("output1")

PySpark Program:
pyspark --conf spark.dynamicAllocation.enabled=false --master yarn --num-executors 6 --executor-cores 2 --executor-memory 3G --conf spark.ui.port=4063
spark.conf.set("spark.sql.autoBroadcastJoinThreshold",-1)
joinedDF = customerDF.join(orderDF,customerDF["customer_id"] == orderDF["order_customer_id"])
joinedDF.write.csv("/user/itv000173/output1")
Problem Statement: Join using two data frames and providing orders schema
Solution:
Scala Spark Program:
spark2-shell --conf spark.dynamicAllocation.enabled=false --master yarn --num-executors 21
joined.write.csv("output21")

PySpark Program:
pyspark --conf spark.dynamicAllocation.enabled=false --master yarn --num-executors 21
joinedDF.write.csv("/user/itv000173/output21")
Problem Statement: Execute code in production. Create a jar and execute using spark-submit in cluster mode. The program is the same as week 13 except for a few changes mentioned in the video.
Solution:
Scala Spark Program:
spark2-submit \
--class LogLevelGrouping \
--master yarn \
--deploy-mode cluster \
--executor-memory 3G \
--num-executors 4 \
wordcount.jar bigLogNew.txt

PySpark Program:
spark2-submit \
--master yarn \
--deploy-mode cluster \
--executor-memory 3G \
--num-executors 4 \
LogLevelGrouping.py bigLogNew.txt
Problem Statement: Execute code in production. Create a jar and execute using spark-submit in local mode. The program is the same as week 13 except for a few changes mentioned in the video.
Solution:
Scala Spark Program:
spark2-submit \
--class LogLevelGrouping \
--master yarn \
--executor-memory 3G \
--num-executors 4 \
wordcount.jar bigLogNew.txt

PySpark Program:
spark2-submit \
--master yarn \
--executor-memory 3G \
--num-executors 4 \
LogLevelGrouping.py bigLogNew.txt
Scala Spark Program:
orderDF.createOrReplaceTempView("orders")
spark.sql("select * from orders").show
//----------------------------change the cast from order by
spark.sql("select order_customer_id, date_format(order_date, 'MMMM') orderdt, count(1) cnt, first(cast(date_format(order_date,'M') as int)) monthnum from orders group by order_customer_id, orderdt order by monthnum").show

PySpark Program:
orderDF.createOrReplaceTempView("orders")
spark.sql("select * from orders").show()
# ----------------------------change the cast from order by
spark.sql("select order_customer_id, date_format(order_date, 'MMMM') orderdt, count(1) cnt, first(date_format(order_date,'M')) monthnum from orders group by order_customer_id, orderdt order by cast(monthnum as int)").show()
Problem Statement: just add .explain to spark sql from above program
Solution:
Scala Spark Program:
spark2-shell --conf spark.dynamicAllocation.enabled=false --master yarn --num-executors 11 --conf spark.ui.port=4063
val mysql_props = new java.util.Properties
mysql_props.setProperty("user","sqoopuser")
mysql_props.setProperty("password","NHkkP876rp")
val orderDF = spark.read.jdbc(connection_url,"orders",mysql_props)
orderDF.show()

PySpark Program:
pyspark --conf spark.dynamicAllocation.enabled=false --master yarn --num-executors 11 --conf spark.ui.port=4063
orderDF = spark.read \
    .jdbc(connection_url, "orders", properties={"user": "sqoopuser", "password": "NHkkP876rp"})
orderDF.show()
Batch Processing
=================
your processing might take a few minutes, a few hours or a few days.
we will have continuously flowing data and we have to calculate the results instantly.
HDFS + MR + YARN
MR (mapreduce) Processing.
Apache Spark.
Spark Streaming.
if the file is of 500 mb and the default block size in hdfs is 128 mb, then we get 4 blocks.
when there is no concept of a static file then how do you visualize your rdd.
30 water balloons.
some balloons might have more data than other balloons based on flow of water.
so basically spark streaming gives us two levels of constructs:
1. Low Level Constructs (DStreams, built on top of RDDs)
2. Higher Level Constructs (Structured API's - Dataframes, dataset and spark sql)
socket
=======
nc -lk 9998
sc is already available
we have spark context which is available
//lines is a dstream
val lines = ssc.socketTextStream("localhost",9998)
wordCounts.print()
ssc.start()
when calculating word count it was forgetting the state of previous rdds.
then (hello,5)
(hello,2)
1. stateless transformation
is the one which forgets the previous state. we always perform the operation on a single rdd.
2. stateful transformation
when we talk about batch processing. we load the entire file as one single rdd.
batch processing is always stateless. there is no point in talking about stateful
transformations in case of batch processing.
lets consider you have a streaming application which runs for 6 hours.
batch interval size to be 5 minutes - a new rdd will be created every 5 minutes.
during the course of entire streaming application how many rdds will be created?
360 minutes / 5 minutes = 72 rdds
(k,1)
(k,5)
(k,7)
when we talk about stateless we just talk about 1 single rdd. - stateless
considering the entire stream we talked about including all rdds. - stateful
3 things
=========
//lines is a dstream
val lines = ssc.socketTextStream("localhost",9998)
wordCounts.print()
ssc.start()
I want to calculate the frequency of each word across the entire stream...
stateful transformation..
updateStateByKey is a stateful transformation we can think of using.
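A minimal pyspark sketch of updateStateByKey, assuming pairs is the (word, 1) dstream from the word count program:

def update_count(new_values, running_count):
    # new_values = counts that arrived in this batch, running_count = state so far
    return sum(new_values) + (running_count or 0)

ssc.checkpoint(".")   # stateful transformations need a checkpoint directory
running_counts = pairs.updateStateByKey(update_count)
running_counts.pprint()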
(big,1)
(data,1)
(is,1)
(interesting,1)
(big,1)
(data,1)
(is,1)
(fun,1)
(big,1)
(big,1)
(data,1)
(data,1)
(is,1)
(is,1)
(fun,1)
rdd1
rdd2
sliding window -
if we have the batch interval as 2 seconds, that means after every 2 seconds a new
rdd will be created.
after every 2 seconds one oldest rdd will go away and one new rdd will come in.
after every 4 seconds.. 2 oldest rdd's will go away and 2 new rdd will come in.
reduceByKeyAndWindow
hello,1
how,1
are,1
you,1
hello,1
our problem statement is find the frequency of each word in the 10 seconds sliding
window.
reduceByKey - stateless
Week 15 is based on Spark streaming where we need real time stream. We will use
socket and file
Generalized changes that are required in every program
1. To start the shell for PySpark, we write pyspark instead of spark-shell.
2. Remove all val and var keywords as python does not have val and var.
3. Anonymous functions are replaced with lambda in python.
4. Comment is given using # in python instead of // in scala
Note
1. Best practice is to use your own itversity hdfs location in the program for input and
output files. You can also use Linux root as shown in video.
2. There can be many ways to get the output for a particular problem; we are showcasing
one way.
3. Changes are highlighted in yellow.
4. Ncat is a Linux utility. For windows, follow the steps below.
WEEK 15 – Spark Streaming Part 1
Steps for streaming program execution on windows.
1. Download the free Nmap Security Scanner for Linux/Mac/Windows (nmap stable setup) and install it
2. In code give localhost 9998
3. Run the code... It will give error because no port is listening ... That's ok
4. Open cmd – go to Nmap folder
5. ncat -lvp 9998
6. start typing words
7. Cross check in your program
Problem Statement: Write real time stateless word count program in IDE
Solution:
Scala Spark Program:
Create word.scala
import org.apache.spark.SparkContext
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext

object StreamingWordCount extends App {
  val sc = new SparkContext("local[*]","wordcount")
  //creating spark streaming context
  val ssc = new StreamingContext(sc, Seconds(5))
  //lines is a dstream
  val lines = ssc.socketTextStream("localhost",9998)
  //words is a transformed dstream
  val words = lines.flatMap(x => x.split(" "))
  val pairs = words.map(x => (x,1))
  val wordCounts = pairs.reduceByKey((x,y) => x+y)
  wordCounts.print()
  ssc.start()
  ssc.awaitTermination()
}

PySpark Program:
Create word.py
from pyspark import *
from pyspark.streaming import *

sc = SparkContext("local[2]","APP")
sc.setLogLevel("ERROR")
#creating spark streaming context
ssc = StreamingContext(sc, 2)
#lines is a dstream
lines = ssc.socketTextStream("localhost", 9998)
#words is a transformed dstream
words = lines.flatMap(lambda x: x.split())
pairs = words.map(lambda x: (x, 1))
wordCounts = pairs.reduceByKey(lambda x, y: x + y)
wordCounts.pprint()
ssc.start()
ssc.awaitTermination()
Problem Statement: Write a real time stateful word count program using sliding window in IDE
Solution:
Scala Spark Program:
Create word1.scala
import org.apache.spark.SparkContext
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext

object StreamingWordCount extends App {
  val sc = new SparkContext("local[*]","wordcount")
  //creating spark streaming context
  val ssc = new StreamingContext(sc, Seconds(5))
  //lines is a dstream
  val lines = ssc.socketTextStream("localhost",9998)
  ssc.checkpoint(".")
  //words is a transformed dstream
  val wordCounts = lines.flatMap(x => x.split(" "))
    .map(x => (x,1))
    .reduceByKeyAndWindow((x,y)=>x+y,(x,y)=>x-y,Seconds(10),Seconds(2))
  wordCounts.print()
  ssc.start()
  ssc.awaitTermination()
}

PySpark Program:
Create word1.py
from pyspark import *
from pyspark.streaming import *

sc = SparkContext("local[2]","APP")
sc.setLogLevel("ERROR")
#creating spark streaming context
ssc = StreamingContext(sc, 2)
ssc.checkpoint(".")
#lines is a dstream
lines = ssc.socketTextStream("localhost", 9998)
#words is a transformed dstream
wordCounts = lines.flatMap(lambda x: x.split()) \
    .map(lambda x: (x, 1)) \
    .reduceByKeyAndWindow(lambda x, y: int(x) + int(y), lambda x, y: int(x) - int(y), 10, 2)
wordCounts.pprint()
ssc.start()
ssc.awaitTermination()
Problem Statement: Write a real time program to count number of lines in window.
Solution:
Scala Spark Program:
import org.apache.spark.SparkContext
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext

object StreamingWordCount extends App {
  val sc = new SparkContext("local[*]","wordcount")
  //creating spark streaming context
  val ssc = new StreamingContext(sc, Seconds(2))
  //lines is a dstream
  val lines = ssc.socketTextStream("localhost",9998)
  ssc.checkpoint(".")

PySpark Program:
from pyspark import *
from pyspark.streaming import *

sc = SparkContext("local[2]","APP")
sc.setLogLevel("ERROR")
#creating spark streaming context
ssc = StreamingContext(sc, 2)
ssc.checkpoint(".")
#lines is a dstream
lines = ssc.socketTextStream("localhost", 9998)
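The tail of this program is cut off in the notes; a hypothetical completion using countByWindow (a window of 10 seconds sliding every 2 seconds is assumed):

line_counts = lines.countByWindow(10, 2)
line_counts.pprint()
ssc.start()
ssc.awaitTermination()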
Week 16 is based on Spark structured streaming where we need real time stream. We
will use socket and file as source for streaming
Generalized changes that are required in every program
1. Remove all val, var keyword as python does not have val and var types.
2. Anonymous functions are replaced with lambda in python.
3. Comment is given using # in python instead of // in scala
4. To write multi line code in pyspark, end every line with \ except the last line.
Note
1. There can be many ways to get the output for a particular problem; we are showcasing
one way.
2. Changes are highlighted in yellow.
3. Ncat is Linux utility. For windows, follow below steps.
Problem Statement: Write a real time word count program using spark structured stream
Solution:
Scala Spark Program:
import org.apache.spark.sql.SparkSession
// process
val wordsDf = linesDf.selectExpr("explode(split(value,' ')) as word")
val countsDf = wordsDf.groupBy("word").count()

PySpark Program:
from pyspark.sql import SparkSession
# process
wordsDf = linesDf.selectExpr("explode(split(value,' ')) as word")
countsDf = wordsDf.groupBy("word").count()

Add the below two bold lines and run again, you can see the performance gain. No change in pyspark.
Add the below bold line to set the triggering time. Small difference in pyspark.

Scala Spark Program:
val wordCountQuery = countsDf.writeStream
  .format("console")
  .outputMode("complete")
  .option("checkpointLocation","checkpoint-location2")
  .trigger(Trigger.ProcessingTime("5 seconds"))
  .start()

PySpark Program:
wordCountQuery = countsDf.writeStream \
    .format('console') \
    .outputMode('complete') \
    .option("checkpointLocation","checkpoint-location2") \
    .trigger(processingTime='5 seconds') \
    .start()
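linesDf (the socket source) is not shown in these notes; a minimal pyspark sketch of the missing read side, reusing port 9998 from the earlier examples:

spark = SparkSession.builder \
    .appName("StructuredWordCount") \
    .master("local[2]") \
    .getOrCreate()

linesDf = spark.readStream \
    .format("socket") \
    .option("host", "localhost") \
    .option("port", 9998) \
    .load()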
Problem Statement: Read json file and write json file using spark structured streaming
Solution:
Scala Spark Program:
import org.apache.spark.sql.SparkSession
// process
ordersDf.createOrReplaceTempView("orders")
val completeOrders = spark.sql("select * from orders where order_status='COMPLETE'")
wordCountQuery.awaitTermination()

PySpark Program:
from pyspark.sql import SparkSession
# process
ordersDf.createOrReplaceTempView("orders")
completeOrders = spark.sql("select * from orders where order_status='COMPLETE'")
wordCountQuery.awaitTermination()

Add the below bold line to set a micro batch of one file. No change in pyspark.

Scala Spark Program:
val ordersDf = spark.readStream
  .format("json")
  .option("path", "myinputfolder")
  .option("maxFilesPerTrigger",1)
  .load()
Logger.getLogger("org").setLevel(Level.ERROR)

//define own schema instead of inferring it
val orderSchema = StructType(List(
  StructField("order_id", IntegerType),
  StructField("order_date", StringType),
  StructField("order_customer_id", IntegerType),
  StructField("order_status", StringType),
  StructField("amount", IntegerType)
))
ordersQuery.awaitTermination()

PySpark Program:
ordersDf = spark.readStream \
    .format("json") \
    .option("path", "myinputfolder") \
    .option("maxFilesPerTrigger",1) \
    .load()

# define own schema instead of inferring it
orderSchema = StructType([
    StructField("order_id", IntegerType()),
    StructField("order_date", StringType()),
    StructField("order_customer_id", IntegerType()),
    StructField("order_status", StringType()),
    StructField("amount", IntegerType()),
])
ordersQuery.awaitTermination()
Problem Statement: Write a real time join program using spark structured streaming
Solution:
Scala Spark Program:
import org.apache.spark.sql.SparkSession
import org.apache.log4j.Level
import org.apache.log4j.Logger
import java.sql.Timestamp
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.types.StructField
import org.apache.spark.sql.types.TimestampType
import org.apache.spark.sql.types.StringType
import org.apache.spark.sql.functions._
import org.apache.spark.sql.streaming.Trigger

// define own schema instead of inferring it
val impressionSchema = StructType(List(
  StructField("impressionID", StringType),
  StructField("ImpressionTime", TimestampType),
  StructField("CampaignName", StringType)
))

// structure the data based on the schema defined - impressionDf
val valueDF1 = impressionsDf.select(from_json(col("value"), impressionSchema).alias("value"))
val impressionDfNew = valueDF1.select("value.*")

// structure the data based on the schema defined - clickDf
val valueDF2 = clicksDf.select(from_json(col("value"), clickSchema).alias("value"))
val clickDfNew = valueDF2.select("value.*")

// joining both the streaming data frames
val joinedDf = impressionDfNew.join(clickDfNew,joinExpr,joinType)
  .drop(clickDfNew.col("clickID"))
campaignQuery.awaitTermination()

PySpark Program:
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, window
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, TimestampType
from pyspark.sql import functions as F

# define own schema instead of inferring it
impressionSchema = StructType([
    StructField("impressionID", StringType()),
    StructField("ImpressionTime", TimestampType()),
    StructField("CampaignName", StringType()),
])

# structure the data based on the schema defined - impressionDf
valueDF1 = impressionsDf.select(from_json(F.col("value"), impressionSchema).alias("value"))
impressionDfNew = valueDF1.select("value.*")

# structure the data based on the schema defined - clickDf
valueDF2 = clicksDf.select(from_json(F.col("value"), clickSchema).alias("value"))
clickDfNew = valueDF2.select("value.*")

# joining both the streaming data frames
joinedDf = impressionDfNew.join(clickDfNew,joinExpr,joinType) \
    .drop(clickDfNew["clickID"])
campaignQuery.awaitTermination()
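The notes assume several pieces that are not shown (the two socket sources, clickSchema, joinExpr/joinType and the output query). A hypothetical pyspark sketch of those pieces, with the ports and the click schema as assumptions:

clickSchema = StructType([
    StructField("clickID", StringType()),
    StructField("ClickTime", TimestampType()),
])

impressionsDf = spark.readStream.format("socket") \
    .option("host", "localhost").option("port", 9998).load()
clicksDf = spark.readStream.format("socket") \
    .option("host", "localhost").option("port", 9999).load()

joinExpr = impressionDfNew["impressionID"] == clickDfNew["clickID"]
joinType = "inner"

campaignQuery = joinedDf.writeStream \
    .format("console") \
    .outputMode("append") \
    .option("checkpointLocation", "checkpoint-location-join") \
    .start()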