D8 - Lab - Practicals - Day 8

Day 8 – Go to Page 65
Previous Days kept for easy reference back
Day 4
Practicals - RDD (Resilient Distributed Dataset)
spark-shell
scala> sc.setLogLevel("ERROR")
1. Create RDD (Parallelize Method)
- Method - Function defined in class
Method: spark.sparkContext.parallelize - used to create an RDD
scala> val carsArray = Array("BMW", "Bentley", "Mercedes", "Suzuki",

"Honda", "Jaquar", "Fiat", "Audi")
carsArray: Array[String] = Array(BMW, Bentley, Mercedes, Suzuki,

Honda, Jaquar, Fiat, Audi)
scala> val carsRDD = spark.sparkContext.parallelize(carsArray,2)
carsRDD: org.apache.spark.rdd.RDD[String] = ParallelCollectionRDD[0]

at parallelize at <console>:25
scala> carsRDD.collect()
res2: Array[String] = Array(BMW, Bentley, Mercedes, Suzuki, Honda,

Jaquar, Fiat, Audi)
OR
scala> val carsRDD = sc.parallelize(carsArray,2)


Jaquar, Fiat, Audi)
2. Transformations
2.1 distinct
# Create an RDD with some duplicate values - BMW
scala> val carsArray = Array("BMW", "Bentley", "Mercedes", "Suzuki",

"Honda", "Jaquar", "Fiat", "Audi", "BMW")
carsArray: Array[String] = Array(BMW, Bentley, Mercedes, Suzuki,

Honda, Jaquar, Fiat, Audi, BMW)
scala> val carsRDD = sc.parallelize(carsArray,2)

Jaquar, Fiat, Audi, BMW)
# Get distinct values in RDD. i.e. BMW should only come 1 time
scala> val distinctCarsRDD = carsRDD.distinct()
distinctCarsRDD: org.apache.spark.rdd.RDD[String] =
MapPartitionsRDD[5] at distinct at <console>:25
scala> distinctCarsRDD.collect()
res6: Array[String] = Array(BMW, Mercedes, Jaquar, Fiat, Honda,

Bentley, Suzuki, Audi)
2.2. filter
Example 1:

scala> distinctCarsRDD.filter(carName =>

carName.startsWith("B")).collect()
res7: Array[String] = Array(BMW, Bentley)
OR - use shorthand notation
scala> distinctCarsRDD.filter(_.startsWith("B")).collect()
res15: Array[String] = Array(BMW, Bentley)
Example 2:
# defining an RDD with numbers 1 to 10 in 2 partitions
scala> val numbersRDD = sc.parallelize(1 to 10,2)
numbersRDD: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[14]

scala> numbersRDD.collect()
res16: Array[Int] = Array(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)
# filter only even numbers
scala> val evenNumbersRDD = numbersRDD.filter(x => x%2 ==0)
evenNumbersRDD: org.apache.spark.rdd.RDD[Int] = MapPartitionsRDD[15]

at filter at <console>:25
scala> evenNumbersRDD.collect
res17: Array[Int] = Array(2, 4, 6, 8, 10)
scala>
# use shorthand notation
scala> val evenNumbersRDD = numbersRDD.filter(_%2 ==0)

scala> evenNumbersRDD.collect
scala>
# for more complex filters, prefer to define function separately
scala> def evenFilter(x: int) = {
| x%2 == 0
| }
<console>:23: error: not found: type int
def evenFilter(x: int) = {
scala> def evenFilter(x: Int) = {
| x%2 == 0
| }
evenFilter: (x: Int)Boolean
scala> val evenNumbersRDD = numbersRDD.filter(evenFilter)

scala> evenNumbersRDD.collect()
2.3 map

# make a map as carname, true/false
scala> val carsWithBRDD = distinctCarsRDD.map(carName => (carName,

carName.startsWith("B")))
carsWithBRDD: org.apache.spark.rdd.RDD[(String, Boolean)] =

MapPartitionsRDD[19] at map at <console>:25
scala> carsWithBRDD.collect()
res21: Array[(String, Boolean)] = Array((BMW,true), (Mercedes,false),

(Jaquar,false), (Fiat,false), (Honda,false), (Bentley,true),
(Suzuki,false), (Audi,false))
# filter only true
scala> carsWithBRDD.filter(x => x._2 == true).collect()
res22: Array[(String, Boolean)] = Array((BMW,true), (Bentley,true))
Example 2: carname, length of carname. eg: bmw,3

scala> distinctCarsRDD.map(x => (x,x.length)).collect()
res27: Array[(String, Int)] = Array((BMW,3), (Mercedes,8), (Jaquar,6),

(Fiat,4), (Honda,5), (Bentley,7), (Suzuki,6), (Audi,4))
2.4 flatMap
Example 1: Scala
scala> val array1D = Array("1,2,3", "4,5,6", "7,8,9")
array1D: Array[String] = Array(1,2,3, 4,5,6, 7,8,9)

scala> val array2D = array1D.map(x => x.split(","))
array2D: Array[Array[String]] = Array(Array(1, 2, 3), Array(4, 5, 6),

Array(7, 8, 9))
scala> val arrayFLATD = array1D.flatMap(x => x.split(","))
arrayFLATD: Array[String] = Array(1, 2, 3, 4, 5, 6, 7, 8, 9)
Example 2: RDD
scala> val book = Array("The history of textbooks dates back to

ancient civilizations.", "The Greek philosopher Socrates lamented the
loss of knowledge because the media of transmission were changing",
"The next revolution in the field of books came with the 15th-century
invention of printing with changeable type. ","Gutenberg's first and
only large-scale printing effort was the now iconic Gutenberg Bible in
the 1450s — a Latin translation from the Hebrew Old Testament and the
Greek New Testament. ", "While many textbooks were already in use,
compulsory education and the resulting growth of schooling in Europe
led to the printing of many more textbooks for children. ")
book: Array[String] = Array(The history of textbooks dates back to

ancient civilizations., The Greek philosopher Socrates lamented the
loss of knowledge because the media of transmission were changing,
invention of printing with changeable type. ", "Gutenberg's first and
Steps for word count example

Step 1: split words by space (The, history )
Step 2: assign value 1 to every word (The 1, history 1)
Step 3: reduce it (The 5, history 3)
# make an RDD from array in 2 partition
scala> val bookRDD = sc.parallelize(book,2)
bookRDD: org.apache.spark.rdd.RDD[String] = ParallelCollectionRDD[22]

scala> bookRDD.collect()
res28: Array[String] = Array(The history of textbooks dates back to

scala>
# using map - get array of array
scala> bookRDD.map(x => x.split(" ")).collect
res31: Array[Array[String]] = Array(Array(The, history, of, textbooks,

dates, back, to, ancient, civilizations.), Array(The, Greek,
philosopher, Socrates, lamented, the, loss, of, knowledge, because,
the, media, of, transmission, were, changing), Array(The, next,
revolution, in, the, field, of, books, came, with, the, 15th-century,
invention, of, printing, with, changeable, type.), Array(Gutenberg's,
first, and, only, large-scale, printing, effort, was, the, now,
iconic, Gutenberg, Bible, in, the, 1450s, —, a, Latin, translation,
from, the, Hebrew, Old, Testament, and, the, Greek, New, Testament.),
Array(While, many, textbooks, were, already, in, use,, compulsory,
education, and, the, resulting, growth, of, schooling, in, Europe,
led, to, the, printing, of, many, more, textbooks, for, c...
# flatMap - Flattens
scala> bookRDD.flatMap(x => x.split(" ")).collect
res32: Array[String] = Array(The, history, of, textbooks, dates, back,

to, ancient, civilizations., The, Greek, philosopher, Socrates,
lamented, the, loss, of, knowledge, because, the, media, of,
transmission, were, changing, The, next, revolution, in, the, field,
of, books, came, with, the, 15th-century, invention, of, printing,
with, changeable, type., Gutenberg's, first, and, only, large-scale,
printing, effort, was, the, now, iconic, Gutenberg, Bible, in, the,
1450s, —, a, Latin, translation, from, the, Hebrew, Old, Testament,
and, the, Greek, New, Testament., While, many, textbooks, were,
already, in, use,, compulsory, education, and, the, resulting, growth,
of, schooling, in, Europe, led, to, the, printing, of, many, more,
textbooks, for, children.)
Day 5 – RDD Day 2
scala> val flatBookRDD = bookRDD.flatMap(x => x.split(" "))
flatBookRDD: org.apache.spark.rdd.RDD[String] = MapPartitionsRDD[1] at

flatMap at <console>:25
scala> flatBookRDD.collect

2.5 sort
scala> flatBookRDD.sortBy(word => word.length()).collect()
res5: Array[String] = Array(—, a, of, to, of, of, in, of, of, in, in,
of, in, to, of, The, The, the, the, The, the, the, and, was, the, now,
the, the, Old, and, the, New, and, the, led, the, for, back, loss,
were, next, came, with, with, only, from, many, were, use,, many,
more, dates, Greek, media, field, books, type., first, Bible, 1450s,
Latin, Greek, While, effort, iconic, Hebrew, growth, Europe, history,
ancient, because, already, Socrates, lamented, changing, printing,
printing, printing, textbooks, knowledge, invention, Gutenberg,
Testament, textbooks, education, resulting, schooling, textbooks,
children., revolution, changeable, Testament., compulsory,
philosopher, Gutenberg's, large-scale, translation, transmission,
15th-century, civilizations.)
scala>
scala> flatBookRDD.sortBy(word => word.length(),false).collect()
res6: Array[String] = Array(civilizations., transmission, 15th-

century, philosopher, Gutenberg's, large-scale, translation,
revolution, changeable, Testament., compulsory, textbooks, knowledge,
invention, Gutenberg, Testament, textbooks, education, resulting,
schooling, textbooks, children., Socrates, lamented, changing,
printing, printing, printing, history, ancient, because, already,
effort, iconic, Hebrew, growth, Europe, dates, Greek, media, field,
books, type., first, Bible, 1450s, Latin, Greek, While, back, loss,
were, next, came, with, with, only, from, many, were, use,, many,
more, The, The, the, the, The, the, the, and, was, the, now, the, the,
Old, and, the, New, and, the, led, the, for, of, to, of, of, in, of,
of, in, in, of, in, to, of, —, a)
2.6 randomSplit
Data Engineering – Not required
Data Science – Machine Learning – Because you need to break the entire
data set into 70:30 train-test sets
# Splitting as 70-30%. Returns Array of RDD
scala> val trainTestBookRDD = flatBookRDD.randomSplit(Array[Double]

(0.70,0.30))
trainTestBookRDD: Array[org.apache.spark.rdd.RDD[String]] =
Array(MapPartitionsRDD[17] at randomSplit at <console>:25,
MapPartitionsRDD[18] at randomSplit at <console>:25)
scala> trainTestBookRDD(0).collect
res8: Array[String] = Array(history, of, dates, back, to,

civilizations., The, Greek, Socrates, lamented, loss, knowledge, the,
media, transmission, were, changing, next, revolution, in, of, came,
with, the, 15th-century, invention, printing, changeable, type.,
first, large-scale, printing, effort, the, now, iconic, Gutenberg, in,
the, 1450s, —, a, translation, from, the, Hebrew, Old, Testament, and,
Greek, Testament., While, textbooks, already, in, use,, compulsory,
education, the, resulting, growth, of, led, of, many, more, textbooks,
for, children.)
scala> trainTestBookRDD(1).collect
res9: Array[String] = Array(The, textbooks, ancient, philosopher, the,

of, because, of, The, the, field, books, of, with, Gutenberg's, and,
only, was, Bible, Latin, the, New, many, were, and, schooling, in,
Europe, to, the, printing)
3. Actions
An action starts (kicks off) the execution of the transformations defined before it
3.1 collect

3.2 reduce
It takes the next element and performs an operation (e.g. addition) with the
existing result
WAP (write a program) in Spark to find the shortest word in the book


# define the function
scala> def smallestWord(leftWord:String, rightWord:String): String = {

| if (leftWord.length > rightWord.length)
| return rightWord
| else
| return leftWord
| }
smallestWord: (leftWord: String, rightWord: String)String
# call the function in the reduce method
scala> flatBookRDD.reduce(smallestWord)
res13: String = —
OR
# not so good way of doing
scala> flatBookRDD.reduce((x,y) => (if (x.length > y.length)

| y
| else
| x
| )
| )
res16: String = —
3.3 count
Number of elements in the RDD

scala> flatBookRDD.count
res17: Long = 100
3.4 countApprox
# count returned in 25 millisecond timeout. Default confidence

interval is 95%
scala> flatBookRDD.countApprox(25)
res29:
org.apache.spark.partial.PartialResult[org.apache.spark.partial.Bounde
dDouble] = (final: [100.000, 100.000])
scala> flatBookRDD.countApprox(21)
res30:
dDouble] = (partial: [0.000, Infinity])
# with 22 milliseconds timeout and 10% confidence interval the count

is between 49 and 50. Actual was 100
scala> flatBookRDD.countApprox(22, .1)

res37:
dDouble] = (partial: [49.000, 50.000])
3.5 countByValue
WAP (write a program) to find the number of times each word occurs in the textbook
scala> flatBookRDD.countByValue()
res39: scala.collection.Map[String,Long] = Map(Testament. -> 1,
Testament -> 1, for -> 1, Greek -> 2, in -> 4, printing -> 3, Old ->
1, Gutenberg's -> 1, effort -> 1, already -> 1, history -> 1, field ->
1, use, -> 1, Latin -> 1, civilizations. -> 1, revolution -> 1, led ->
1, growth -> 1, came -> 1, type. -> 1, a -> 1, because -> 1, textbooks
-> 3, Europe -> 1, Gutenberg -> 1, to -> 2, iconic -> 1, now -> 1,
large-scale -> 1, Hebrew -> 1, was -> 1, schooling -> 1, The -> 3, — -
> 1, ancient -> 1, education -> 1, 15th-century -> 1, dates -> 1,
While -> 1, lamented -> 1, back -> 1, with -> 2, from -> 1, books ->
1, next -> 1, first -> 1, media -> 1, knowledge -> 1, Bible -> 1, loss
-> 1, changing -> 1, were -> 2, more -> 1, New -> 1, changeable -> 1,
children. -> 1, translation -> 1, tran...
3.6 countByValueApprox
# 30 millisecond timeout and 50% confidence interval
scala> flatBookRDD.countByValueApprox(30,.50)
res48:
org.apache.spark.partial.PartialResult[scala.collection.Map[String,org
.apache.spark.partial.BoundedDouble]] = (final: Map(Testament. ->
[1.000, 1.000], Testament -> [1.000, 1.000], for -> [1.000, 1.000],
Greek -> [2.000, 2.000], in -> [4.000, 4.000], printing -> [3.000,
3.000], Old -> [1.000, 1.000], Gutenberg's -> [1.000, 1.000], effort -
> [1.000, 1.000], already -> [1.000, 1.000], history -> [1.000,
1.000], field -> [1.000, 1.000], use, -> [1.000, 1.000], Latin ->
[1.000, 1.000], civilizations. -> [1.000, 1.000], revolution ->
[1.000, 1.000], led -> [1.000, 1.000], growth -> [1.000, 1.000], came
-> [1.000, 1.000], type. -> [1.000, 1.000], a -> [1.000, 1.000],
because -> [1.000, 1.000], textbooks -> [3.000, 3.000], Europe ->
[1.000, 1.000], Gutenberg -> [1.000, 1.000], to -> [2....
3.7 first
# 1st record/word/string in RDD

scala> flatBookRDD.first
res51: String = The
3.8 max – gives max value

scala> sc.parallelize(1 to 10).max()
res53: Int = 10
3.9 min – gives min value in RDD

scala> sc.parallelize(1 to 10).min
res54: Int = 1
3.10 take, take ordered and top
scala> val sampleRDD = sc.parallelize(List(6,7,5,3,10,25,9,70,35))

sampleRDD: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[26]
scala> sampleRDD.collect
res55: Array[Int] = Array(6, 7, 5, 3, 10, 25, 9, 70, 35)
scala>
# take 1st 3 elements from RDD
scala> sampleRDD.take(3)
res56: Array[Int] = Array(6, 7, 5)
scala>
# Ascending Order
scala> sampleRDD.takeOrdered(3)
scala>
# Descending Order
scala> sampleRDD.top(3)
scala> flatBookRDD.take(3)
res59: Array[String] = Array(The, history, of)
scala> flatBookRDD.top(3)
res60: Array[String] = Array(—, with, with)
scala> flatBookRDD.takeOrdered(3)
res61: Array[String] = Array(1450s, 15th-century, Bible)
3.11 takeSample
scala> flatBookRDD.takeSample(true, 70, 10L)
# true – with Replacement

# 70 – number of Words
# seed – important for reproducibility
res62: Array[String] = Array(of, field, schooling, the, were, The,

While, Old, type., effort, 1450s, the, in, printing, from, media, the,
knowledge, of, large-scale, were, Gutenberg's, of, While, back, of, a,
the, a, and, textbooks, with, Testament., from, the, the, loss,
lamented, Socrates, because, the, was, Gutenberg, growth,
civilizations., New, and, civilizations., Bible, printing, Latin, of,
Testament, printing, resulting, in, The, now, changing, to, a, first,
printing, —, first, books, loss, the, of, 15th-century)
since withReplacement was true, the same element can be picked more than once:
e.g. "a" came 3 times above even though in the entire data "a" is present only 1 time
scala> flatBookRDD.takeSample(false, 70, 10L)

res63: Array[String] = Array(New, the, to, and, from, Greek, loss,
growth, in, use,, in, of, of, and, of, history, printing, in, next,
books, was, led, Testament., textbooks, lamented, only, came, The,
Socrates, Greek, textbooks, iconic, textbooks, with, media, were,
knowledge, ancient, Gutenberg, the, of, to, back, dates, printing,
Europe, —, were, Hebrew, philosopher, 1450s, large-scale, the, 15th-
century, civilizations., Gutenberg's, with, field, changeable,
invention, the, resulting, While, type., for, The, the, changing, in,
now)
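since withReplacement was false, no element is picked more than once, so a word
cannot come more often than in the entire data: e.g. "of" came only 4 times
(it is present 7 times in the entire data)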
# note: below if you don’t specify seed, it will give you random
values
res65: Array[String] = Array(lamented, While, books, growth,
education)

education)

education)
scala> flatBookRDD.takeSample(true, 5)
res68: Array[String] = Array(of, came, invention, Latin, in)
scala> flatBookRDD.takeSample(true, 5)
res69: Array[String] = Array(for, the, —, in, ancient)
3.12 saveAsTextFile
scala> flatBookRDD.saveAsTextFile("/home/hadoopuser/temp201225")
# Just check the output below

hadoopuser@hadoopuser-VirtualBox:~$ hdfs dfs -ls
/home/hadoopuser/temp201225
Found 3 items
-rw-r--r-- 1 hadoopuser supergroup 0 2020-12-25 20:31
/home/hadoopuser/temp201225/_SUCCESS
/home/hadoopuser/temp201225/part-00000
hadoopuser@hadoopuser-VirtualBox:~$ hdfs dfs -cat
The
history
of
textbooks
dates
back
to
ancient
civilizations.
The
Greek
philosopher
Socrates
lamented
the
loss
of
knowledge
because
the
media
of
transmission
were
changing
The
next
revolution
in
the
field
of
books
came
with
the
15th-century
invention
of
printing
with
changeable
type.
Gutenberg's
first
and
only
large-scale
printing
effort
was
the
now
iconic
Gutenberg
Bible
in
the
1450s
—
a
Latin
translation
from
the
Hebrew
Old
Testament
and
the
Greek
New
Testament.
While
many
textbooks
were
already
in
use,
compulsory
education
and
the
resulting
growth
of
schooling
in
Europe
led
to
the
printing
of
many
more
textbooks
for
children.
3.13 cache and persist (Sure interview question)

Problem : in below code, flatMap transformation is executed 2 times, 1
time for take and 1 time for top

flatBookRDD: org.apache.spark.rdd.RDD[String] = MapPartitionsRDD[46]
at flatMap at <console>:25
res79: Array[String] = Array(The, history, of, textbooks, dates)
res80: Array[String] = Array(—, with, with, were, were)
Solution: use cache

flatBookRDD: org.apache.spark.rdd.RDD[String] = MapPartitionsRDD[48]
at flatMap at <console>:25
# cache in a memory.
scala> flatBookRDD.cache()
res81: flatBookRDD.type = MapPartitionsRDD[48] at flatMap at
<console>:25
res82: Array[String] = Array(The, history, of, textbooks, dates)
# will not do flatMap again. Instead will take from memory

res83: Array[String] = Array(—, with, with, were, were)
# stored in memory
scala> flatBookRDD.getStorageLevel
res84: org.apache.spark.storage.StorageLevel = StorageLevel(memory,
deserialized, 1 replicas)
Caution: Memory is limited. You can get a memory exception if you do a lot of caching

What if I still want to do caching?
Answer: use DISK. For DISK you have to use persist with a storage level (e.g. DISK_ONLY)
cache() is only for memory (it always uses the default memory-only storage level)
scala> flatBookRDD.persist()
<console>:25
res86: org.apache.spark.storage.StorageLevel = StorageLevel(memory,
deserialized, 1 replicas)
scala> flatBookRDD.unpersist()
<console>:25
scala>
flatBookRDD.persist(org.apache.spark.storage.StorageLevel.DISK_ONLY)
<console>:25
res102: org.apache.spark.storage.StorageLevel = StorageLevel(disk, 1
replicas)
http://spark.apache.org/docs/latest/rdd-programming-guide.html#rdd-
persistence
Day 6
3.14 checkpoint
- Debug Purpose
scala> sc.setCheckpointDir("/checkpoint201226")
scala> flatBookRDD.checkpoint()
# you can see the checkpoint created

hadoopuser@hadoopuser-VirtualBox:~$ hdfs dfs -ls /checkpoint201226
Found 2 items
drwxr-xr-x - hadoopuser supergroup 0 2020-12-26 19:18
/checkpoint201226/68f348df-4135-41f4-a86a-0bb38930253d
drwxr-xr-x - hadoopuser supergroup 0 2020-12-26 19:16
/checkpoint201226/db47c0f8-5eda-489d-9c77-bc797231f3e8
3.15 pipe – write linux command inside pipe method
scala> flatBookRDD.pipe("wc -l").collect()

res8: Array[String] = Array(25, 75)
3.16 mapPartitions
Map does the transformation element wise
scala> bookRDD.collect
res10: Array[String] = Array(The history of textbooks dates back to

loss of knowledge because the media of transmission were changing, The
next revolution in the field of books came with the 15th-century
invention of printing with changeable type., "Gutenberg's first and
scala> bookRDD.map(x => x.split(" ")).collect

dates, back, to, ancient, civilizations.), Array(The, Greek,
philosopher, Socrates, lamented, the, loss, of, knowledge, because,
the, media, of, transmission, were, changing), Array(The, next,
revolution, in, the, field, of, books, came, with, the, 15th-century,
invention, of, printing, with, changeable, type.), Array(Gutenberg's,
first, and, only, large-scale, printing, effort, was, the, now,
iconic, Gutenberg, Bible, in, the, 1450s, —, a, Latin, translation,
from, the, Hebrew, Old, Testament, and, the, Greek, New, Testament.),
Array(While, many, textbooks, were, already, in, use,, compulsory,
education, and, the, resulting, growth, of, schooling, in, Europe,
led, to, the, printing, of, many, more, textbooks, for, c...
mapPartitions – does the transformation partition-wise
Since mapPartitions operates at the partition level (the function is called once per
partition instead of once per element), it can be comparatively faster than map.
scala> bookRDD.getNumPartitions
res12: Int = 2
scala> bookRDD.mapPartitions(part => Iterator[Int](1)).sum()

res13: Double = 2.0
3.17 mapPartitionsWithIndex
Debug : you want to know which word is present in which partition
scala> def indexFunc(i:Int, wordInIterator: Iterator[String]) =

{ wordInIterator.toList.map(word => s"Partition:$i
=>$word").iterator }
indexFunc: (i: Int, wordInIterator: Iterator[String])Iterator[String]
scala> flatBookRDD.mapPartitionsWithIndex(indexFunc).collect
res15: Array[String] = Array(Partition:0 =>The, Partition:0 =>history,
Partition:0 =>of, Partition:0 =>textbooks, Partition:0 =>dates,
Partition:0 =>back, Partition:0 =>to, Partition:0 =>ancient,
Partition:0 =>civilizations., Partition:0 =>The, Partition:0 =>Greek,
Partition:0 =>philosopher, Partition:0 =>Socrates, Partition:0
=>lamented, Partition:0 =>the, Partition:0 =>loss, Partition:0 =>of,
Partition:0 =>knowledge, Partition:0 =>because, Partition:0 =>the,
Partition:0 =>media, Partition:0 =>of, Partition:0 =>transmission,
Partition:0 =>were, Partition:0 =>changing, Partition:1 =>The,
Partition:1 =>next, Partition:1 =>revolution, Partition:1 =>in,
Partition:1 =>the, Partition:1 =>field, Partition:1 =>of, Partition:1
=>books, Partition:1 =>came, Partition:1 =>with, Partition:1
=>the, ...
3.18 foreachPartition
- It operates on each partition (similar to mapPartitions)

- Difference between mapPartitions and foreachPartition: mapPartitions
will return something for each partition, whereas
foreachPartition will not return anything
# define a function to write contents of each partition into different

files in tmp directory
scala> def forFunc(wordInIterator: Iterator[String]) = {
| import java.io._
| import scala.util.Random
|
| var randomFileName = new Random().nextInt()
|
| var printWriter = new PrintWriter(new File(s"/tmp/randomfile-$
{randomFileName}.txt"))
| while (wordInIterator.hasNext) {
| printWriter.write(wordInIterator.next())
| }
| printWriter.close()
| }
forFunc: (wordInIterator: Iterator[String])Unit
scala> flatBookRDD.foreachPartition(forFunc)
3.19 glom
Puts the contents of each partition of the RDD into an Array (one Array per partition)
scala> flatBookRDD.glom().collect
dates, back, to, ancient, civilizations., The, Greek, philosopher,
Socrates, lamented, the, loss, of, knowledge, because, the, media, of,
transmission, were, changing), Array(The, next, revolution, in, the,
field, of, books, came, with, the, 15th-century, invention, of,
printing, with, changeable, type., Gutenberg's, first, and, only,
large-scale, printing, effort, was, the, now, iconic, Gutenberg,
Bible, in, the, 1450s, —, a, Latin, translation, from, the, Hebrew,
Old, Testament, and, the, Greek, New, Testament., While, many,
textbooks, were, already, in, use,, compulsory, education, and, the,
resulting, growth, of, schooling, in, Europe, led, to, the, printing,
of, many, more, textbooks, for, children.))
4. Key – Value RDD
Dictionary in Python {“name”:”Harish”, “location”:”xyz”}

JSON {“name”:”Harish”, “location”:”xyz”}
Configurations
{“connectionString”:”jdbc::/mysql………”, “userId”:”xyz”,
“password”:”abc”}
1. Can you have multiple Keys of same name? - No

2. Can you have multiple values? – yes “name”:[“harish”,”masand”]
RDD – Yes you can have duplicate Keys
3.1 map to create key-value pair RDD
scala>
scala>
scala> flatBookRDD.map(word => (word, word.toLowerCase)).collect

res24: Array[(String, String)] = Array((The,the), (history,history),
(of,of), (textbooks,textbooks), (dates,dates), (back,back), (to,to),
(ancient,ancient), (civilizations.,civilizations.), (The,the),
(Greek,greek), (philosopher,philosopher), (Socrates,socrates),
(lamented,lamented), (the,the), (loss,loss), (of,of),
(knowledge,knowledge), (because,because), (the,the), (media,media),
(of,of), (transmission,transmission), (were,were),
(changing,changing), (The,the), (next,next), (revolution,revolution),
(in,in), (the,the), (field,field), (of,of), (books,books),
(came,came), (with,with), (the,the), (15th-century,15th-century),
(invention,invention), (of,of), (printing,printing), (with,with),
(changeable,changeable), (type.,type.), (Gutenberg's,gutenberg's),
(first,first), (and,and), (only,...
scala> flatBookRDD.map(word => (word, word.length)).collect

res27: Array[(String, Int)] = Array((The,3), (history,7), (of,2),
(textbooks,9), (dates,5), (back,4), (to,2), (ancient,7),
(civilizations.,14), (The,3), (Greek,5), (philosopher,11),
(Socrates,8), (lamented,8), (the,3), (loss,4), (of,2), (knowledge,9),
(because,7), (the,3), (media,5), (of,2), (transmission,12), (were,4),
(changing,8), (The,3), (next,4), (revolution,10), (in,2), (the,3),
(field,5), (of,2), (books,5), (came,4), (with,4), (the,3), (15th-
century,12), (invention,9), (of,2), (printing,8), (with,4),
(changeable,10), (type.,5), (Gutenberg's,11), (first,5), (and,3),
(only,4), (large-scale,11), (printing,8), (effort,6), (was,3),
(the,3), (now,3), (iconic,6), (Gutenberg,9), (Bible,5), (in,2),
(the,3), (1450s,5), (—,1), (a,1), (Latin,5), (translation,11),
(from,4), (the,3), (Hebrew,...
3.2 key value pair rdd using keyBy
scala> flatBookRDD.keyBy(word => word.length).collect

res28: Array[(Int, String)] = Array((3,The), (7,history), (2,of),
(9,textbooks), (5,dates), (4,back), (2,to), (7,ancient),
(14,civilizations.), (3,The), (5,Greek), (11,philosopher),
(8,Socrates), (8,lamented), (3,the), (4,loss), (2,of), (9,knowledge),
(7,because), (3,the), (5,media), (2,of), (12,transmission), (4,were),
(8,changing), (3,The), (4,next), (10,revolution), (2,in), (3,the),
(5,field), (2,of), (5,books), (4,came), (4,with), (3,the), (12,15th-
century), (9,invention), (2,of), (8,printing), (4,with),
(10,changeable), (5,type.), (11,Gutenberg's), (5,first), (3,and),
(4,only), (11,large-scale), (8,printing), (6,effort), (3,was),
(3,the), (3,now), (6,iconic), (9,Gutenberg), (5,Bible), (2,in),
(3,the), (5,1450s), (1,—), (1,a), (5,Latin), (11,translation),
(4,from), (3,the), (6,Hebre...
3.3 mapValues – modifies the value of a key-value RDD
scala> val keyValueRDD = flatBookRDD.keyBy(word => word.length)

keyValueRDD: org.apache.spark.rdd.RDD[(Int, String)] =
MapPartitionsRDD[17] at keyBy at <console>:25
scala> keyValueRDD.collect
res29: Array[(Int, String)] = Array((3,The), (7,history), (2,of),
(9,textbooks), (5,dates), (4,back), (2,to), (7,ancient),
(14,civilizations.), (3,The), (5,Greek), (11,philosopher),
(8,Socrates), (8,lamented), (3,the), (4,loss), (2,of), (9,knowledge),
(7,because), (3,the), (5,media), (2,of), (12,transmission), (4,were),
(8,changing), (3,The), (4,next), (10,revolution), (2,in), (3,the),
(5,field), (2,of), (5,books), (4,came), (4,with), (3,the), (12,15th-
century), (9,invention), (2,of), (8,printing), (4,with),
(10,changeable), (5,type.), (11,Gutenberg's), (5,first), (3,and),
(4,only), (11,large-scale), (8,printing), (6,effort), (3,was),
(3,the), (3,now), (6,iconic), (9,Gutenberg), (5,Bible), (2,in),
(3,the), (5,1450s), (1,—), (1,a), (5,Latin), (11,translation),
(4,from), (3,the), (6,Hebre...
scala> keyValueRDD.mapValues(word => word.toUpperCase).collect

res30: Array[(Int, String)] = Array((3,THE), (7,HISTORY), (2,OF),
(9,TEXTBOOKS), (5,DATES), (4,BACK), (2,TO), (7,ANCIENT),
(14,CIVILIZATIONS.), (3,THE), (5,GREEK), (11,PHILOSOPHER),
(8,SOCRATES), (8,LAMENTED), (3,THE), (4,LOSS), (2,OF), (9,KNOWLEDGE),
(7,BECAUSE), (3,THE), (5,MEDIA), (2,OF), (12,TRANSMISSION), (4,WERE),
(8,CHANGING), (3,THE), (4,NEXT), (10,REVOLUTION), (2,IN), (3,THE),
(5,FIELD), (2,OF), (5,BOOKS), (4,CAME), (4,WITH), (3,THE), (12,15TH-
CENTURY), (9,INVENTION), (2,OF), (8,PRINTING), (4,WITH),
(10,CHANGEABLE), (5,TYPE.), (11,GUTENBERG'S), (5,FIRST), (3,AND),
(4,ONLY), (11,LARGE-SCALE), (8,PRINTING), (6,EFFORT), (3,WAS),
(3,THE), (3,NOW), (6,ICONIC), (9,GUTENBERG), (5,BIBLE), (2,IN),
(3,THE), (5,1450S), (1,—), (1,A), (5,LATIN), (11,TRANSLATION),
(4,FROM), (3,THE), (6,HEBRE...
3.4 flatMapValues – modifies the value of a key-value RDD and

separates each element. i.e. flattens
scala> keyValueRDD.flatMapValues(word => word.toUpperCase).collect

res32: Array[(Int, Char)] = Array((3,T), (3,H), (3,E), (7,H), (7,I),
(7,S), (7,T), (7,O), (7,R), (7,Y), (2,O), (2,F), (9,T), (9,E), (9,X),
(9,T), (9,B), (9,O), (9,O), (9,K), (9,S), (5,D), (5,A), (5,T), (5,E),
(5,S), (4,B), (4,A), (4,C), (4,K), (2,T), (2,O), (7,A), (7,N), (7,C),
(7,I), (7,E), (7,N), (7,T), (14,C), (14,I), (14,V), (14,I), (14,L),
(14,I), (14,Z), (14,A), (14,T), (14,I), (14,O), (14,N), (14,S),
(14,.), (3,T), (3,H), (3,E), (5,G), (5,R), (5,E), (5,E), (5,K),
(11,P), (11,H), (11,I), (11,L), (11,O), (11,S), (11,O), (11,P),
(11,H), (11,E), (11,R), (8,S), (8,O), (8,C), (8,R), (8,A), (8,T),
(8,E), (8,S), (8,L), (8,A), (8,M), (8,E), (8,N), (8,T), (8,E), (8,D),
(3,T), (3,H), (3,E), (4,L), (4,O), (4,S), (4,S), (2,O), (2,F), (9,K),
(9,N), (9,O), (9,W), (9,L), (9,E), (9,D), (9,G), (9,...
3.5 keys and values – keys returns all the keys, values returns all the values
scala> keyValueRDD.keys.collect
res33: Array[Int] = Array(3, 7, 2, 9, 5, 4, 2, 7, 14, 3, 5, 11, 8, 8,
3, 4, 2, 9, 7, 3, 5, 2, 12, 4, 8, 3, 4, 10, 2, 3, 5, 2, 5, 4, 4, 3,
12, 9, 2, 8, 4, 10, 5, 11, 5, 3, 4, 11, 8, 6, 3, 3, 3, 6, 9, 5, 2, 3,
5, 1, 1, 5, 11, 4, 3, 6, 3, 9, 3, 3, 5, 3, 10, 5, 4, 9, 4, 7, 2, 4,
10, 9, 3, 3, 9, 6, 2, 9, 2, 6, 3, 2, 3, 8, 2, 4, 4, 9, 3, 9)
scala> keyValueRDD.values.collect
Day 7
# preparing data to work - keyValueRDD

Spark-shell
scala> sc.setLogLevel("ERROR")
scala> val book = Array("The history of textbooks dates back to

ancient civilizations.", "The Greek philosopher Socrates lamented the
loss of knowledge because the media of transmission were changing",
invention of printing with changeable type. ","Gutenberg's first and
book: Array[String] = Array(The history of textbooks dates back to
scala> val bookRDD = sc.parallelize(book,2)
bookRDD: org.apache.spark.rdd.RDD[String] = ParallelCollectionRDD[0]

flatBookRDD: org.apache.spark.rdd.RDD[String] = MapPartitionsRDD[1] at
scala> val keyValueRDD = flatBookRDD.map(word => (word,word.length))

keyValueRDD: org.apache.spark.rdd.RDD[(String, Int)] =
scala> keyValueRDD.collect
res2: Array[(String, Int)] = Array((The,3), (history,7), (of,2),
(textbooks,9), (dates,5), (back,4), (to,2), (ancient,7),
(civilizations.,14), (The,3), (Greek,5), (philosopher,11),
(Socrates,8), (lamented,8), (the,3), (loss,4), (of,2), (knowledge,9),
(because,7), (the,3), (media,5), (of,2), (transmission,12), (were,4),
(changing,8), (The,3), (next,4), (revolution,10), (in,2), (the,3),
(field,5), (of,2), (books,5), (came,4), (with,4), (the,3), (15th-
century,12), (invention,9), (of,2), (printing,8), (with,4),
(changeable,10), (type.,5), (Gutenberg's,11), (first,5), (and,3),
(only,4), (large-scale,11), (printing,8), (effort,6), (was,3),
(the,3), (now,3), (iconic,6), (Gutenberg,9), (Bible,5), (in,2),
(the,3), (1450s,5), (—,1), (a,1), (Latin,5), (translation,11),
(from,4), (the,3), (Hebrew,6...
3.6 lookup – to find the value, for the given key
scala> keyValueRDD.lookup("history")
res5: Seq[Int] = WrappedArray(7)
scala> keyValueRDD.lookup("were")
res3: Seq[Int] = WrappedArray(4, 4)
scala> keyValueRDD.lookup("the")
res4: Seq[Int] = WrappedArray(3, 3, 3, 3, 3, 3, 3, 3, 3, 3)
# separate all the characters present in the data and make key-value
pairs such as t,1;h,1;e,1
scala> flatBookRDD collect

warning: there was one feature warning; re-run with
# separate to characters
scala> val charBookRDD = flatBookRDD.flatMap(word =>
word.toLowerCase.toSeq)
charBookRDD: org.apache.spark.rdd.RDD[Char] = MapPartitionsRDD[2] at
scala> charBookRDD.collect
res3: Array[Char] = Array(t, h, e, h, i, s, t, o, r, y, o, f, t, e, x,
t, b, o, o, k, s, d, a, t, e, s, b, a, c, k, t, o, a, n, c, i, e, n,
t, c, i, v, i, l, i, z, a, t, i, o, n, s, ., t, h, e, g, r, e, e, k,
p, h, i, l, o, s, o, p, h, e, r, s, o, c, r, a, t, e, s, l, a, m, e,
n, t, e, d, t, h, e, l, o, s, s, o, f, k, n, o, w, l, e, d, g, e, b,
e, c, a, u, s, e, t, h, e, m, e, d, i, a, o, f, t, r, a, n, s, m, i,
s, s, i, o, n, w, e, r, e, c, h, a, n, g, i, n, g, t, h, e, n, e, x,
t, r, e, v, o, l, u, t, i, o, n, i, n, t, h, e, f, i, e, l, d, o, f,
b, o, o, k, s, c, a, m, e, w, i, t, h, t, h, e, 1, 5, t, h, -, c, e,
n, t, u, r, y, i, n, v, e, n, t, i, o, n, o, f, p, r, i, n, t, i, n,
g, w, i, t, h, c, h, a, n, g, e, a, b, l, e, t, y, p, e, ., g, u, t,
e, n, b, e, r, g, ', s, f, i, r, s, ...
# assign a value 1 to each of these keys

i.e. (t,1) , (h,1) , (e,1)
scala> val kvCharBookRDD = charBookRDD.map(x => (x,1))

kvCharBookRDD: org.apache.spark.rdd.RDD[(Char, Int)] =
scala> kvCharBookRDD.collect
res4: Array[(Char, Int)] = Array((t,1), (h,1), (e,1), (h,1), (i,1),
(s,1), (t,1), (o,1), (r,1), (y,1), (o,1), (f,1), (t,1), (e,1), (x,1),
(t,1), (b,1), (o,1), (o,1), (k,1), (s,1), (d,1), (a,1), (t,1), (e,1),
(s,1), (b,1), (a,1), (c,1), (k,1), (t,1), (o,1), (a,1), (n,1), (c,1),
(i,1), (e,1), (n,1), (t,1), (c,1), (i,1), (v,1), (i,1), (l,1), (i,1),
(z,1), (a,1), (t,1), (i,1), (o,1), (n,1), (s,1), (.,1), (t,1), (h,1),
(e,1), (g,1), (r,1), (e,1), (e,1), (k,1), (p,1), (h,1), (i,1), (l,1),
(o,1), (s,1), (o,1), (p,1), (h,1), (e,1), (r,1), (s,1), (o,1), (c,1),
(r,1), (a,1), (t,1), (e,1), (s,1), (l,1), (a,1), (m,1), (e,1), (n,1),
(t,1), (e,1), (d,1), (t,1), (h,1), (e,1), (l,1), (o,1), (s,1), (s,1),
(o,1), (f,1), (k,1), (n,1), (o,1), (w,1), (l,1), (e,1), (d,1), (g,1),
(e,1), (b,1), (e,1), (c,1), (...
3.7 countByKey
scala> kvCharBookRDD.countByKey
res5: scala.collection.Map[Char,Long] = Map(e -> 69, s -> 28, x -> 4,
4 -> 1, n -> 46, . -> 4, y -> 8, t -> 55, u -> 10, f -> 13, a -> 30, 5
-> 2, m -> 11, i -> 38, - -> 2, , -> 1, v -> 3, 1 -> 2, — -> 1, ' ->
1, b -> 12, g -> 17, l -> 22, p -> 8, 0 -> 1, c -> 16, h -> 26, r ->
29, w -> 11, k -> 8, o -> 44, z -> 1, d -> 13)
3.8 countByKeyApprox
# timeout is 100 milliseconds, and default confidence is 0.95

scala> kvCharBookRDD.countByKeyApprox(100L)
res6:
org.apache.spark.partial.PartialResult[scala.collection.Map[Char,org.a
pache.spark.partial.BoundedDouble]] = (final: Map(e -> [69.000,
69.000], s -> [28.000, 28.000], x -> [4.000, 4.000], 4 -> [1.000,
1.000], n -> [46.000, 46.000], . -> [4.000, 4.000], y -> [8.000,
8.000], t -> [55.000, 55.000], u -> [10.000, 10.000], f -> [13.000,
13.000], a -> [30.000, 30.000], 5 -> [2.000, 2.000], m -> [11.000,
11.000], i -> [38.000, 38.000], - -> [2.000, 2.000], , -> [1.000,
1.000], v -> [3.000, 3.000], 1 -> [2.000, 2.000], — -> [1.000, 1.000],
' -> [1.000, 1.000], b -> [12.000, 12.000], g -> [17.000, 17.000], l -
> [22.000, 22.000], p -> [8.000, 8.000], 0 -> [1.000, 1.000], c ->
[16.000, 16.000], h -> [26.000, 26.000], r -> [29.000, 29.000], w ->
[11.000, 11.000], k -> [8.000, 8.000], o -> [44....
# timeout is 100 milliseconds, and confidence is 0.10

scala> kvCharBookRDD.countByKeyApprox(100L,0.10)
res7:
org.apache.spark.partial.PartialResult[scala.collection.Map[Char,org.a
pache.spark.partial.BoundedDouble]] = (final: Map(e -> [69.000,
69.000], s -> [28.000, 28.000], x -> [4.000, 4.000], 4 -> [1.000,
1.000], n -> [46.000, 46.000], . -> [4.000, 4.000], y -> [8.000,
8.000], t -> [55.000, 55.000], u -> [10.000, 10.000], f -> [13.000,
13.000], a -> [30.000, 30.000], 5 -> [2.000, 2.000], m -> [11.000,
11.000], i -> [38.000, 38.000], - -> [2.000, 2.000], , -> [1.000,
1.000], v -> [3.000, 3.000], 1 -> [2.000, 2.000], — -> [1.000, 1.000],
' -> [1.000, 1.000], b -> [12.000, 12.000], g -> [17.000, 17.000], l -
> [22.000, 22.000], p -> [8.000, 8.000], 0 -> [1.000, 1.000], c ->
[16.000, 16.000], h -> [26.000, 26.000], r -> [29.000, 29.000], w ->
[11.000, 11.000], k -> [8.000, 8.000], o -> [44....
3.9 groupByKey
scala> def additionFunction (x: Int, y: Int) = x + y

additionFunction: (x: Int, y: Int)Int
scala> kvCharBookRDD.groupByKey().map(z =>

(z._1,z._2.reduce(additionFunction))).collect
res13: Array[(Char, Int)] = Array((d,13), (z,1), (4,1), (p,8), (x,4),
(t,55), (b,12), (.,4), (0,1), (h,26), (n,46), (—,1), (f,13), (v,3),
(r,29), (l,22), (,,1), (w,11), (s,28), (e,69), (',1), (5,2), (a,30),
(i,38), (y,8), (k,8), (u,10), (o,44), (-,2), (1,2), (g,17), (m,11),
(c,16))
3.10 reduceByKey
scala> kvCharBookRDD.reduceByKey(additionFunction).collect
(t,55), (.,4), (b,12), (0,1), (h,26), (n,46), (—,1), (f,13), (v,3),
(r,29), (l,22), (,,1), (w,11), (s,28), (e,69), (',1), (5,2), (a,30),
(i,38), (k,8), (y,8), (u,10), (o,44), (-,2), (1,2), (g,17), (m,11),
(c,16))
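reduceByKey gives the same result as groupByKey + reduce, but the work is
distributed differently in Spark. Eg: 1 driver – 10 executors

- groupByKey – every (key, value) pair is shuffled across the network first, and the reduce runs only after the grouping
- reduceByKey – values are first combined locally within each partition on the executors, and only the partial results are shuffled and merged
reduceByKey will therefore be faster.

Also, groupByKey risks an OutOfMemoryError, because all the values of a key must be held in memory together.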
Analogy in SQL
Reduce by -Select name, sum(marks) from table group by name
Group by – select name, marks from table group by name
3.11 aggregate
scala> val numbers = sc.parallelize(1 to 20,4)

numbers: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[18] at
parallelize at <console>:24
scala> numbers.glom().collect()
res22: Array[Array[Int]] = Array(Array(1, 2, 3, 4, 5), Array(6, 7, 8,
9, 10), Array(11, 12, 13, 14, 15), Array(16, 17, 18, 19, 20))
scala> numbers.aggregate(0)(_ + _, _ + _)
res23: Int = 210
Yellow is partition wise addition – 15, 40, 65, 90

Blue is going to add the result of partition wise addition i.e =
15+40+65+90 = 210
scala> numbers.aggregate(1)(_ + _, _ + _)
res37: Int = 215
1 + 15 = 16
1 + 16 + 40 = 57
1 + 57 + 65 = 123
1 + 123 + 90 = 214
1 + 214 = 215
Optional
Scala> val numbers = sc.parallelize(1 to 4,2)

numbers: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[20] at
scala> numbers.glom.collect
res39: Array[Array[Int]] = Array(Array(1, 2), Array(3, 4))
scala> numbers.aggregate(2)(_ * _, _ * _)
res46: Int = 192
partition wise multiplication = 1x2=2 and 3x4=12
2 x 2 = 4
2 x 4 x 12 = 96
2 x 96 = 192
3.12 treeAggregate
Even the second (combine) operation happens at the executor level, depending on the
depth
scala> numbers.treeAggregate(0)(_ + _, _ + _, 3)
res48: Int = 10
3.13 aggregateByKey
(s,1), (t,1), (o,1), (r,1), (y,1), (o,1), (f,1), (t,1), (e,1), (x,1),
(t,1), (b,1), (o,1), (o,1), (k,1), (s,1), (d,1), (a,1), (t,1), (e,1),
(s,1), (b,1), (a,1), (c,1), (k,1), (t,1), (o,1), (a,1), (n,1), (c,1),
(i,1), (e,1), (n,1), (t,1), (c,1), (i,1), (v,1), (i,1), (l,1), (i,1),
(z,1), (a,1), (t,1), (i,1), (o,1), (n,1), (s,1), (.,1), (t,1), (h,1),
(e,1), (g,1), (r,1), (e,1), (e,1), (k,1), (p,1), (h,1), (i,1), (l,1),
(o,1), (s,1), (o,1), (p,1), (h,1), (e,1), (r,1), (s,1), (o,1), (c,1),
(r,1), (a,1), (t,1), (e,1), (s,1), (l,1), (a,1), (m,1), (e,1), (n,1),
(t,1), (e,1), (d,1), (t,1), (h,1), (e,1), (l,1), (o,1), (s,1), (s,1),
(o,1), (f,1), (k,1), (n,1), (o,1), (w,1), (l,1), (e,1), (d,1), (g,1),
(e,1), (b,1), (e,1), (c,1), ...
scala> kvCharBookRDD.aggregateByKey(0)(_ + _, _ + _).collect

(t,55), (.,4), (b,12), (0,1), (h,26), (n,46), (—,1), (f,13), (v,3),
(r,29), (l,22), (,,1), (w,11), (s,28), (e,69), (',1), (5,2), (a,30),
(i,38), (k,8), (y,8), (u,10), (o,44), (-,2), (1,2), (g,17), (m,11),
(c,16))
3.14 cogroup
scala> val kvCharBookRDD2 = charBookRDD.map(x => (x,2))

kvCharBookRDD2: org.apache.spark.rdd.RDD[(Char, Int)] =
(s,1), (t,1), (o,1), (r,1), (y,1), (o,1), (f,1), (t,1), (e,1), (x,1),
(t,1), (b,1), (o,1), (o,1), (k,1), (s,1), (d,1), (a,1), (t,1), (e,1),
(s,1), (b,1), (a,1), (c,1), (k,1), (t,1), (o,1), (a,1), (n,1), (c,1),
(i,1), (e,1), (n,1), (t,1), (c,1), (i,1), (v,1), (i,1), (l,1), (i,1),
(z,1), (a,1), (t,1), (i,1), (o,1), (n,1), (s,1), (.,1), (t,1), (h,1),
(e,1), (g,1), (r,1), (e,1), (e,1), (k,1), (p,1), (h,1), (i,1), (l,1),
(o,1), (s,1), (o,1), (p,1), (h,1), (e,1), (r,1), (s,1), (o,1), (c,1),
(r,1), (a,1), (t,1), (e,1), (s,1), (l,1), (a,1), (m,1), (e,1), (n,1),
(t,1), (e,1), (d,1), (t,1), (h,1), (e,1), (l,1), (o,1), (s,1), (s,1),
(o,1), (f,1), (k,1), (n,1), (o,1), (w,1), (l,1), (e,1), (d,1), (g,1),
(e,1), (b,1), (e,1), (c,1), ...
scala> kvCharBookRDD2.collect
(s,2), (t,2), (o,2), (r,2), (y,2), (o,2), (f,2), (t,2), (e,2), (x,2),
(t,2), (b,2), (o,2), (o,2), (k,2), (s,2), (d,2), (a,2), (t,2), (e,2),
(s,2), (b,2), (a,2), (c,2), (k,2), (t,2), (o,2), (a,2), (n,2), (c,2),
(i,2), (e,2), (n,2), (t,2), (c,2), (i,2), (v,2), (i,2), (l,2), (i,2),
(z,2), (a,2), (t,2), (i,2), (o,2), (n,2), (s,2), (.,2), (t,2), (h,2),
(e,2), (g,2), (r,2), (e,2), (e,2), (k,2), (p,2), (h,2), (i,2), (l,2),
(o,2), (s,2), (o,2), (p,2), (h,2), (e,2), (r,2), (s,2), (o,2), (c,2),
(r,2), (a,2), (t,2), (e,2), (s,2), (l,2), (a,2), (m,2), (e,2), (n,2),
(t,2), (e,2), (d,2), (t,2), (h,2), (e,2), (l,2), (o,2), (s,2), (s,2),
(o,2), (f,2), (k,2), (n,2), (o,2), (w,2), (l,2), (e,2), (d,2), (g,2),
(e,2), (b,2), (e,2), (c,2), ...
scala> kvCharBookRDD.cogroup(kvCharBookRDD2).collect
res56: Array[(Char, (Iterable[Int], Iterable[Int]))] = Array((d,
(CompactBuffer(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1),CompactBuffer(2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2))), (z,
(CompactBuffer(1),CompactBuffer(2))), (4,
(CompactBuffer(1),CompactBuffer(2))), (p,(CompactBuffer(1, 1, 1, 1, 1,
1, 1, 1),CompactBuffer(2, 2, 2, 2, 2, 2, 2, 2))), (x,(CompactBuffer(1,
1, 1, 1),CompactBuffer(2, 2, 2, 2))), (t,(CompactBuffer(1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1),CompactBuffer(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2))), (b,
(CompactBuffer(1, 1, 1, 1, 1, 1...
3.15 joins
scala> kvCharBookRDD.join(kvCharBookRDD2).collect
res57: Array[(Char, (Int, Int))] = Array((d,(1,2)), (d,(1,2)), (d,
(1,2)), (d,(1,2)), (d,(1,2)), (d,(1,2)), (d,(1,2)), (d,(1,2)), (d,
(1,2)), (d,(1,2)), (d,(1,2)), (d,(1,2)), (d,(1,2)), (d,(1,2)), (d,
(1,2)), (d,(1,2)), (d,(1,2)), (d,(1,2)), (d,(1,2)), (d,(1,2)), (d,
(1,2)), (d,(1,2)), (d,(1,2)), (d,(1,2)), (d,(1,2)), (d,(1,2)), (d,
(1,2)), (d,(1,2)), (d,(1,2)), (d,(1,2)), (d,(1,2)), (d,(1,2)), (d,
(1,2)), (d,(1,2)), (d,(1,2)), (d,(1,2)), (d,(1,2)), (d,(1,2)), (d,
(1,2)), (d,(1,2)), (d,(1,2)), (d,(1,2)), (d,(1,2)), (d,(1,2)), (d,
(1,2)), (d,(1,2)), (d,(1,2)), (d,(1,2)), (d,(1,2)), (d,(1,2)), (d,
(1,2)), (d,(1,2)), (d,(1,2)), (d,(1,2)), (d,(1,2)), (d,(1,2)), (d,
(1,2)), (d,(1,2)), (d,(1,2)), (d,(1,2)), (d,(1,2)), (d,(1,2)), (d,
(1,2)), (d,(1,2)), (d,(1,2)), (d,(1,2)), (d,(1,2)), (d,(1,2)), (d,
(1,2)...
Day 8:
3. Advanced RDD Methods
3.1 zips – to make pair RDD
# make 1st RDD

scala> val wordArray = Array ("bmw", "audi", "mercedes",
"suzuki")
wordArray: Array[String] = Array(bmw, audi, mercedes, suzuki)
scala> val wordRDD = sc.parallelize(wordArray,2)

wordRDD: org.apache.spark.rdd.RDD[String] =
ParallelCollectionRDD[0] at parallelize at <console>:26
scala> wordRDD.collect
res2: Array[String] = Array(bmw, audi, mercedes, suzuki)
# make 2nd RDD

scala> val costArray = Array (100000, 200000, 300000, 50000)
costArray: Array[Int] = Array(100000, 200000, 300000, 50000)
scala> val costRDD = sc.parallelize(costArray,2)
costRDD: org.apache.spark.rdd.RDD[Int] =
ParallelCollectionRDD[1] at parallelize at <console>:26
scala> costRDD.collect
res3: Array[Int] = Array(100000, 200000, 300000, 50000)
# zip: Zips this RDD with another one, returning key-value pairs made of the first
element in each RDD, the second element in each RDD, and so on
scala> val zippedRDD = wordRDD.zip(costRDD)

zippedRDD: org.apache.spark.rdd.RDD[(String, Int)] =
ZippedPartitionsRDD2[2] at zip at <console>:27
scala> zippedRDD.collect
res4: Array[(String, Int)] = Array((bmw,100000),
(audi,200000), (mercedes,300000), (suzuki,50000))
1. Both RDDs should have the same number of partitions.

2. Both RDDs should have the same number of elements in each
partition.
3.2 coalesce
- only reduces the number of partitions
- avoids a full reshuffle of the data
3.3 repartition
- reduces or increases the number of partitions
- repartition causes shuffling of data
When to reduce number of partitions and when to increase number of
partitions?
Reduce – example: you do a filter on some RDD and data in the RDD
reduces, so no meaning in keeping data in multiple partition, so you
can reduce the number of partitions
Increase – you want to increase parallelism in the job
val carRDD = RDD(“x1”,”x2”,”x3”,”x4” ……… 100’s of model names , 1

partition)
Make some reporting
X1 reporting, x2 reporting, x3 reporting…. Can be in parallel
repartition
val carRDD = RDD(“x1”,”x2”,”x3”,”x4” ……… 100’s of model names , 10
partition)
- basically increasing the amount of parallelism being performed
Lab
# created a car array

scala> val carArray = Array ("bmw", "audi", "mercedes", "suzuki",
"toyota", "chevorlete", "saab", "honda", "mazda", "toyota")
carArray: Array[String] = Array(bmw, audi, mercedes, suzuki, toyota,
chevorlete, saab, honda, mazda, toyota)
# created a carRDD with 4 partitions

scala> val carRDD = sc.parallelize(carArray,4)
carRDD: org.apache.spark.rdd.RDD[String] = ParallelCollectionRDD[3] at
# Reduce the number of partitions using coalesce and repartition

Note that repartition took more time and did the complete shuffle
scala> carRDD.glom().collect()
res5: Array[Array[String]] = Array(Array(bmw, audi), Array(mercedes,
suzuki, toyota), Array(chevorlete, saab), Array(honda, mazda, toyota))
scala> carRDD.coalesce(3).glom().collect()
suzuki, toyota), Array(chevorlete, saab, honda, mazda, toyota))
scala> carRDD.repartition(3).glom().collect()
[Stage 6:> (0
+[Stage 6:>
(0 +[Stage 6:==============>
(1 +[Stage 6:=============================>
(2 +[Stage 6:============================================>
(3 +
res7: Array[Array[String]] = Array(Array(suzuki, mazda), Array(bmw,
toyota, chevorlete, toyota), Array(audi, mercedes, saab, honda))
# Increase number of partition.

Coalesce does not work as coalesce can only reduce the number of
partitions.
Repartition can increase the number of partitions.
scala> carRDD.coalesce(6).glom().collect()
suzuki, toyota), Array(chevorlete, saab), Array(honda, mazda, toyota))
scala> carRDD.repartition(6).glom().collect()
res9: Array[Array[String]] = Array(Array(suzuki), Array(bmw, toyota),
Array(audi, honda), Array(mazda), Array(chevorlete, toyota),
Array(mercedes, saab))
3.4 custom Partition

- you are able to control which partition gets what data
Data we are using is as below
DEST_COUNTRY_NAME,ORIGIN_COUNTRY_NAME,count
United States,Romania,15
United States,Croatia,1
United States,Ireland,344
Egypt,United States,15
United States,India,62
United States,Singapore,1
United States,Grenada,62
Costa Rica,United States,588
Senegal,United States,40
Moldova,United States,1
United States,Sint Maarten,325
United States,Marshall Islands,39
Guyana,United States,64
Malta,United States,1
Anguilla,United States,41
Bolivia,United States,30
United States,Paraguay,6
Algeria,United States,4
Turks and Caicos Islands,United States,230
United States,Gibraltar,1
Saint Vincent and the Grenadines,United States,1
Italy,United States,382
United States,Federated States of Micronesia,69
United States,Russia,161
Pakistan,United States,12
United States,Netherlands,660
Iceland,United States,181
Marshall Islands,United States,42
Luxembourg,United States,155
Honduras,United States,362
The Bahamas,United States,955
Step 1: Create a csv file

Option A: using vi command
i. hadoopuser@hadoopuser-VirtualBox:~$ vi flightData.csv
ii. copy the data
iii. to save and exit, press Esc and then type :wq!
Option B: If you are not comfortable using vi
i. open the text editor in the ubuntu

ii. copy the data
iii. press save button and save it to path
/home/hadoopuser/flightData.csv
Step 2: Copy the data from local linux machine into the hadoop
directory
To copy the file first create the directory:

hadoopuser@hadoopuser-VirtualBox:~$ hdfs dfs -mkdir /sparkLabDay8
hadoopuser@hadoopuser-VirtualBox:~$ hdfs dfs -copyFromLocal

/home/hadoopuser/flightData.csv /sparkLabDay8/
Step 3: See the data in hadoop directory

/sparkLabDay8/flightData.csv
1. Load the data in a data frame

2. You do all the transformation actions
3. Convert data frame to RDD
4. Do the custom partition on RDD {NOTE: YOU CANNOT DO A CUSTOM
PARTITION IN DATA FRAME}
5. Convert RDD back into data frame
6. You do all the transformation actions according to business
logic
1. Load the data in a data frame

scala> val df =
spark.read.option("header","true").option("inferSchema","true"
).csv("/sparkLabDay8/flightData.csv")
[Stage 11:>
(0 +
df: org.apache.spark.sql.DataFrame = [DEST_COUNTRY_NAME:
string, ORIGIN_COUNTRY_NAME: string ... 1 more field]
scala> df.show(5)
+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
| United States| Romania| 15|
| United States| Croatia| 1|
| United States| Ireland| 344|
| Egypt| United States| 15|
| United States| India| 62|
+-----------------+-------------------+-----+
only showing top 5 rows
scala> df.schema
res11: org.apache.spark.sql.types.StructType =
StructType(StructField(DEST_COUNTRY_NAME,StringType,true),
StructField(ORIGIN_COUNTRY_NAME,StringType,true),
StructField(count,IntegerType,true))
2. You do all the transformation actions

3. Convert Dataframe into RDD
# convert df into rdd with 8 partitions

scala> val rdd = df.repartition(8).rdd
rdd: org.apache.spark.rdd.RDD[org.apache.spark.sql.Row] =
MapPartitionsRDD[38] at rdd at <console>:25
scala> rdd.collect
res12: Array[org.apache.spark.sql.Row] = Array([Saint Vincent and
the Grenadines,United States,1], [United States,Singapore,1],
[Moldova,United States,1], [United States,Netherlands,660], [United
States,Grenada,62], [Honduras,United States,362], [Pakistan,United
States,12], [Marshall Islands,United States,42], [United
States,India,62], [Bolivia,United States,30], [United
States,Gibraltar,1], [Malta,United States,1], [Anguilla,United
States,41], [United States,Marshall Islands,39], [Costa Rica,United
States,588], [United States,Ireland,344], [Algeria,United States,4],
[Turks and Caicos Islands,United States,230], [The Bahamas,United
States,955], [Italy,United States,382], [United States,Paraguay,6],
[Luxembourg,United States,155], [United States,Croatia,1],
[Guyana,United States,64], [Eg...
scala> rdd.glom().collect()
res13: Array[Array[org.apache.spark.sql.Row]] = Array(Array([Saint
Vincent and the Grenadines,United States,1], [United
States,Singapore,1], [Moldova,United States,1], [United
States,Netherlands,660]), Array([United States,Grenada,62],
[Honduras,United States,362], [Pakistan,United States,12], [Marshall
Islands,United States,42]), Array([United States,India,62],
[Bolivia,United States,30], [United States,Gibraltar,1],
[Malta,United States,1]), Array([Anguilla,United States,41], [United
States,Marshall Islands,39], [Costa Rica,United States,588], [United
States,Ireland,344]), Array([Algeria,United States,4], [Turks and
Caicos Islands,United States,230], [The Bahamas,United States,955],
[Italy,United States,382]), Array([United States,Paraguay,6],
[Luxembourg,United States,155], [United S...
scala> rdd.getNumPartitions
res14: Int = 8
Hash Partitioner : Operates on Discrete values: Eg: city names

Range Partitioner : Operates on Continuous values: Eg: Marks,
salary
Scala> import org.apache.spark.HashPartitioner

import org.apache.spark.HashPartitioner
# RDD to Key Value pair RDD

scala> val keyedRDD = rdd.keyBy(r => r(0))
keyedRDD: org.apache.spark.rdd.RDD[(Any,
org.apache.spark.sql.Row)] = MapPartitionsRDD[40] at keyBy at
<console>:26
scala> keyedRDD.collect
res15: Array[(Any, org.apache.spark.sql.Row)] = Array((Saint
Vincent and the Grenadines,[Saint Vincent and the
Grenadines,United States,1]), (United States,[United
States,Singapore,1]), (Moldova,[Moldova,United States,1]),
(United States,[United States,Netherlands,660]), (United
States,[United States,Grenada,62]), (Honduras,[Honduras,United
States,362]), (Pakistan,[Pakistan,United States,12]),
(Marshall Islands,[Marshall Islands,United States,42]),
(United States,[United States,India,62]), (Bolivia,
[Bolivia,United States,30]), (United States,[United
States,Gibraltar,1]), (Malta,[Malta,United States,1]),
(Anguilla,[Anguilla,United States,41]), (United States,[United
States,Marshall Islands,39]), (Costa Rica,[Costa Rica,United
States,588]), (United States,[United States,Ireland,344]),
(Al...
scala>
scala> keyedRDD.partitionBy(new
HashPartitioner(8)).glom().collect()
<console>:27: error: not found: type HashPartioner
keyedRDD.partitionBy(new
HashPartioner(8)).glom().collect()
^
scala> keyedRDD.partitionBy(new
HashPartitioner(8)).glom().collect()
res17: Array[Array[(Any, org.apache.spark.sql.Row)]] =
Array(Array((Moldova,[Moldova,United States,1]), (The Bahamas,
[The Bahamas,United States,955])), Array((Costa Rica,[Costa
Rica,United States,588]), (Algeria,[Algeria,United States,4]),
(Guyana,[Guyana,United States,64]), (Senegal,[Senegal,United
States,40])), Array((Turks and Caicos Islands,[Turks and
Caicos Islands,United States,230])), Array((Anguilla,
[Anguilla,United States,41]), (Italy,[Italy,United
States,382]), (Egypt,[Egypt,United States,15])),
Array((Honduras,[Honduras,United States,362]), (Marshall
Islands,[Marshall Islands,United States,42]), (Bolivia,
[Bolivia,United States,30])), Array((Pakistan,[Pakistan,United
States,12]), (Malta,[Malta,United States,1])), Array((Saint
Vincent and the Grenadines,[Saint Vincent and the G...
Next Variation
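You want all flights for Italy in the 1st partition (partition 0); every other destination is spread randomly over partitions 1 and 2.
Note: the random assignment is for demonstration only. A real partitioner should be deterministic, i.e. return the same partition for the same key every time; otherwise rows with the same key (e.g. United States) land in different partitions.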
scala> import org.apache.spark.Partitioner

import org.apache.spark.Partitioner
scala> class customPart extends Partitioner {

| def numPartitions = 3
|
| def getPartition(key: Any): Int = {
| val flightDestination = key.toString
| if (flightDestination == "Italy") {
| return 0
| } else {
| return new java.util.Random().nextInt(2) + 1
| }
| }
| }
defined class customPart
scala> keyedRDD.partitionBy(new customPart).glom().collect()
res18: Array[Array[(Any, org.apache.spark.sql.Row)]] =

Array(Array((Italy,[Italy,United States,382])), Array((Saint
Vincent and the Grenadines,[Saint Vincent and the
Grenadines,United States,1]), (United States,[United
States,Singapore,1]), (United States,[United
States,Netherlands,660]), (United States,[United
States,Grenada,62]), (Pakistan,[Pakistan,United States,12]),
(Bolivia,[Bolivia,United States,30]), (United States,[United
States,Gibraltar,1]), (Malta,[Malta,United States,1]), (United
States,[United States,Marshall Islands,39]), (Costa Rica,
[Costa Rica,United States,588]), (Algeria,[Algeria,United
States,4]), (Turks and Caicos Islands,[Turks and Caicos
Islands,United States,230]), (The Bahamas,[The Bahamas,United
States,955]), (Luxembourg,[Luxembourg,United States,155]),
(United...
scala>
scala> val a = keyedRDD.partitionBy(new customPart).map(_._1)
a: org.apache.spark.rdd.RDD[Any] = MapPartitionsRDD[46] at map

at <console>:28
scala> a.glom().collect()
res20: Array[Array[Any]] = Array(Array(Italy), Array(Moldova,
United States, Costa Rica, United States, Algeria, Turks and
Caicos Islands, United States, Luxembourg, United States,
Guyana, Egypt, United States, United States, Iceland, United
States), Array(Saint Vincent and the Grenadines, United
States, United States, Honduras, Pakistan, Marshall Islands,
United States, Bolivia, United States, Malta, Anguilla, United
States, The Bahamas, United States, Senegal))

D8 - Lab - Practicals - Day 8

Uploaded by

Copyright:

Available Formats

D8 - Lab - Practicals - Day 8

Uploaded by

Document Information

Original Title

Copyright

Available Formats

Share this document

Share or Embed Document

Sharing Options

Did you find this document useful?

Is this content inappropriate?

Copyright:

Available Formats

D8 - Lab - Practicals - Day 8

Uploaded by

Copyright:

Available Formats

Day 8 – Go to Page 65

Previous Days kept for easy reference back

Practicals - RDD (Resilient Distributed Dataset)

1. Create RDD (Parallelize Method)

- Method - Function defined in class

Method: spark.sparkContext.parallelize - use to create a RDD

scala> val carsArray = Array("BMW", "Bentley", "Mercedes", "Suzuki",

carsArray: Array[String] = Array(BMW, Bentley, Mercedes, Suzuki,

scala> val carsRDD = spark.sparkContext.parallelize(carsArray,2)

carsRDD: org.apache.spark.rdd.RDD[String] = ParallelCollectionRDD[0]

res2: Array[String] = Array(BMW, Bentley, Mercedes, Suzuki, Honda,

scala> val carsRDD = sc.parallelize(carsArray,2)

carsRDD: org.apache.spark.rdd.RDD[String] = ParallelCollectionRDD[1]

res3: Array[String] = Array(BMW, Bentley, Mercedes, Suzuki, Honda,

# Create an RDD with some duplicate values - BMW

scala> val carsArray = Array("BMW", "Bentley", "Mercedes", "Suzuki",

carsArray: Array[String] = Array(BMW, Bentley, Mercedes, Suzuki,

scala> val carsRDD = sc.parallelize(carsArray,2)

carsRDD: org.apache.spark.rdd.RDD[String] = ParallelCollectionRDD[2]

scala> val distinctCarsRDD = carsRDD.distinct()

res6: Array[String] = Array(BMW, Mercedes, Jaquar, Fiat, Honda,

res6: Array[String] = Array(BMW, Mercedes, Jaquar, Fiat, Honda,

scala> distinctCarsRDD.filter(carName =>

res7: Array[String] = Array(BMW, Bentley)

OR - use shorthand notation

res15: Array[String] = Array(BMW, Bentley)

scala> val numbersRDD = sc.parallelize(1 to 10,2)

numbersRDD: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[14]

res16: Array[Int] = Array(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)

# filter only even numbers

scala> val evenNumbersRDD = numbersRDD.filter(x => x%2 ==0)

evenNumbersRDD: org.apache.spark.rdd.RDD[Int] = MapPartitionsRDD[15]

res17: Array[Int] = Array(2, 4, 6, 8, 10)

# use shorthand notation

scala> val evenNumbersRDD = numbersRDD.filter(_%2 ==0)

evenNumbersRDD: org.apache.spark.rdd.RDD[Int] = MapPartitionsRDD[16]

res18: Array[Int] = Array(2, 4, 6, 8, 10)

# for more complex filters, prefer to define function separately

scala> def evenFilter(x: int) = {

<console>:23: error: not found: type int

def evenFilter(x: int) = {

scala> def evenFilter(x: Int) = {

evenFilter: (x: Int)Boolean

scala> val evenNumbersRDD = numbersRDD.filter(evenFilter)

evenNumbersRDD: org.apache.spark.rdd.RDD[Int] = MapPartitionsRDD[17]

res19: Array[Int] = Array(2, 4, 6, 8, 10)

res20: Array[String] = Array(BMW, Mercedes, Jaquar, Fiat, Honda,

# make a map as carname, true/false

scala> val carsWithBRDD = distinctCarsRDD.map(carName => (carName,

carsWithBRDD: org.apache.spark.rdd.RDD[(String, Boolean)] =

res21: Array[(String, Boolean)] = Array((BMW,true), (Mercedes,false),

# filter only true

scala> carsWithBRDD.filter(x => x._2 == true).collect()

res22: Array[(String, Boolean)] = Array((BMW,true), (Bentley,true))

Example 2: carname, length of carname. eg: bmw,3

res25: Array[String] = Array(BMW, Mercedes, Jaquar, Fiat, Honda,

scala> distinctCarsRDD.map(x => (x,x.length)).collect()

res27: Array[(String, Int)] = Array((BMW,3), (Mercedes,8), (Jaquar,6),

scala> val array1D = Array("1,2,3", "4,5,6", "7,8,9")