D8 - Lab - Practicals - Day 8
Day 4
spark-shell
scala> sc.setLogLevel("ERROR")
2. Transformations
2.1 distinct
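The command that created carsRDD is not shown in the transcript; based on the collect output below it was presumably something like:
scala> val carsRDD = sc.parallelize(Seq("BMW", "Bentley", "Mercedes", "Suzuki", "Honda", "Jaquar", "Fiat", "Audi", "BMW"))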
scala> carsRDD.collect()
res5: Array[String] = Array(BMW, Bentley, Mercedes, Suzuki, Honda,
Jaquar, Fiat, Audi, BMW)
# Get distinct values in the RDD, i.e. BMW should appear only once
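The command itself is missing; given the output below it was presumably:
scala> val distinctCarsRDD = carsRDD.distinct()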
distinctCarsRDD: org.apache.spark.rdd.RDD[String] =
MapPartitionsRDD[5] at distinct at <console>:25
scala> distinctCarsRDD.collect()
2.2. filter
Example 1:
scala> distinctCarsRDD.collect()
scala> distinctCarsRDD.filter(_.startsWith("B")).collect()
Example 2:
# defining an RDD with numbers 1 to 10 in 2 partitions
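The parallelize call itself is not in the transcript; it was presumably:
scala> val numbersRDD = sc.parallelize(1 to 10, 2)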
scala> numbersRDD.collect()
scala> val evenNumbersRDD = numbersRDD.filter { x =>
     |   x % 2 == 0
     | }
scala> evenNumbersRDD.collect()
2.3 map
scala> distinctCarsRDD.collect()
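The map example under this heading is missing from the transcript; a minimal sketch (the uppercase transformation is an assumption, not the original example):
scala> distinctCarsRDD.map(car => car.toUpperCase).collect()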
2.4 flatMap
Example 1: Scala
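The plain-Scala example is not in the transcript; a minimal sketch of flatMap on a Scala collection (the sample strings are assumptions):
scala> Seq("hello spark", "flat map").flatMap(sentence => sentence.split(" "))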
Example 2: RDD
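bookRDD is not defined in the transcript; based on its contents (the paragraph on the history of textbooks, held in 2 partitions) it was presumably loaded from a small text file, e.g. (the file name is hypothetical):
scala> val bookRDD = sc.textFile("book.txt", 2)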
scala> bookRDD.collect()
# flatMap - Flattens
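The command that created flatBookRDD is not shown; consistent with the word-level output seen later, it was presumably:
scala> val flatBookRDD = bookRDD.flatMap(line => line.split(" "))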
scala> flatBookRDD.collect
2.5 sort
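The sort command itself is missing; the output below is ordered by word length, so it was presumably:
scala> flatBookRDD.sortBy(word => word.length).collect()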
res5: Array[String] = Array(—, a, of, to, of, of, in, of, of, in, in,
of, in, to, of, The, The, the, the, The, the, the, and, was, the, now,
the, the, Old, and, the, New, and, the, led, the, for, back, loss,
were, next, came, with, with, only, from, many, were, use,, many,
more, dates, Greek, media, field, books, type., first, Bible, 1450s,
Latin, Greek, While, effort, iconic, Hebrew, growth, Europe, history,
ancient, because, already, Socrates, lamented, changing, printing,
printing, printing, textbooks, knowledge, invention, Gutenberg,
Testament, textbooks, education, resulting, schooling, textbooks,
children., revolution, changeable, Testament., compulsory,
philosopher, Gutenberg's, large-scale, translation, transmission,
15th-century, civilizations.)
2.6 randomSplit
Data Engineering – not usually required
Data Science / Machine Learning – required because you need to break the entire
data set into train and test sets, e.g. in a 70:30 ratio
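The randomSplit call is not shown; consistent with the 70:30 split described above, it was presumably:
scala> val trainTestBookRDD = flatBookRDD.randomSplit(Array(0.7, 0.3))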
trainTestBookRDD: Array[org.apache.spark.rdd.RDD[String]] =
Array(MapPartitionsRDD[17] at randomSplit at <console>:25,
MapPartitionsRDD[18] at randomSplit at <console>:25)
scala> trainTestBookRDD(0).collect
scala> trainTestBookRDD(1).collect
3. Actions
3.1 collect
scala> flatBookRDD.collect
3.2 reduce
It takes the next element and applies the operation (e.g. addition) to it
together with the result accumulated so far.
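The definition of smallestWord is not in the transcript; a sketch consistent with the result below (it keeps the shorter of the two words, preferring the left one on ties):
scala> def smallestWord(leftWord: String, rightWord: String): String = {
     |   if (leftWord.length <= rightWord.length) leftWord else rightWord
     | }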
scala> flatBookRDD.reduce(smallestWord)
res13: String = —
OR
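The alternative that followed the OR is missing; an inline anonymous function equivalent to smallestWord would be:
scala> flatBookRDD.reduce((left, right) => if (left.length <= right.length) left else right)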
3.4 countApprox
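# countApprox(timeout, confidence) returns a possibly partial count within the given timeout; here 21 is the timeout in milliseconds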
scala> flatBookRDD.countApprox(21)
res30:
org.apache.spark.partial.PartialResult[org.apache.spark.partial.Bounde
dDouble] = (partial: [0.000, Infinity])
3.5 countByValue
scala> flatBookRDD.collect
res38: Array[String] = Array(The, history, of, textbooks, dates, back,
to, ancient, civilizations., The, Greek, philosopher, Socrates,
lamented, the, loss, of, knowledge, because, the, media, of,
transmission, were, changing, The, next, revolution, in, the, field,
of, books, came, with, the, 15th-century, invention, of, printing,
with, changeable, type., Gutenberg's, first, and, only, large-scale,
printing, effort, was, the, now, iconic, Gutenberg, Bible, in, the,
1450s, —, a, Latin, translation, from, the, Hebrew, Old, Testament,
and, the, Greek, New, Testament., While, many, textbooks, were,
already, in, use,, compulsory, education, and, the, resulting, growth,
of, schooling, in, Europe, led, to, the, printing, of, many, more,
textbooks, for, children.)
scala> flatBookRDD.countByValue()
res39: scala.collection.Map[String,Long] = Map(Testament. -> 1,
Testament -> 1, for -> 1, Greek -> 2, in -> 4, printing -> 3, Old ->
1, Gutenberg's -> 1, effort -> 1, already -> 1, history -> 1, field ->
1, use, -> 1, Latin -> 1, civilizations. -> 1, revolution -> 1, led ->
1, growth -> 1, came -> 1, type. -> 1, a -> 1, because -> 1, textbooks
-> 3, Europe -> 1, Gutenberg -> 1, to -> 2, iconic -> 1, now -> 1,
large-scale -> 1, Hebrew -> 1, was -> 1, schooling -> 1, The -> 3, — -
> 1, ancient -> 1, education -> 1, 15th-century -> 1, dates -> 1,
While -> 1, lamented -> 1, back -> 1, with -> 2, from -> 1, books ->
1, next -> 1, first -> 1, media -> 1, knowledge -> 1, Bible -> 1, loss
-> 1, changing -> 1, were -> 2, more -> 1, New -> 1, changeable -> 1,
children. -> 1, translation -> 1, tran...
3.6 countByValueApprox
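# countByValueApprox(timeout, confidence); here a 30 ms timeout with 50% confidence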
scala> flatBookRDD.countByValueApprox(30,.50)
res48:
org.apache.spark.partial.PartialResult[scala.collection.Map[String,org
.apache.spark.partial.BoundedDouble]] = (final: Map(Testament. ->
[1.000, 1.000], Testament -> [1.000, 1.000], for -> [1.000, 1.000],
Greek -> [2.000, 2.000], in -> [4.000, 4.000], printing -> [3.000,
3.000], Old -> [1.000, 1.000], Gutenberg's -> [1.000, 1.000], effort -
> [1.000, 1.000], already -> [1.000, 1.000], history -> [1.000,
1.000], field -> [1.000, 1.000], use, -> [1.000, 1.000], Latin ->
[1.000, 1.000], civilizations. -> [1.000, 1.000], revolution ->
[1.000, 1.000], led -> [1.000, 1.000], growth -> [1.000, 1.000], came
-> [1.000, 1.000], type. -> [1.000, 1.000], a -> [1.000, 1.000],
because -> [1.000, 1.000], textbooks -> [3.000, 3.000], Europe ->
[1.000, 1.000], Gutenberg -> [1.000, 1.000], to -> [2....
3.7 first
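The creation of sampleRDD is not shown; based on the collect output below it was presumably:
scala> val sampleRDD = sc.parallelize(Seq(6, 7, 5, 3, 10, 25, 9, 70, 35))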
scala> sampleRDD.collect
res55: Array[Int] = Array(6, 7, 5, 3, 10, 25, 9, 70, 35)
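The first() call named by this heading is missing from the transcript; it simply returns the first element of the RDD (6, per the collect output above):
scala> sampleRDD.first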
scala>
# take 1st 3 elements from RDD
scala> sampleRDD.take(3)
res56: Array[Int] = Array(6, 7, 5)
scala>
# Ascending Order
scala> sampleRDD.takeOrdered(3)
res57: Array[Int] = Array(3, 5, 6)
scala>
# Descending Order
scala> sampleRDD.top(3)
res58: Array[Int] = Array(70, 35, 25)
scala> flatBookRDD.take(3)
res59: Array[String] = Array(The, history, of)
scala> flatBookRDD.top(3)
res60: Array[String] = Array(—, with, with)
scala> flatBookRDD.takeOrdered(3)
res61: Array[String] = Array(1450s, 15th-century, Bible)
3.11 takeSample
Since withReplacement was set to true, "of" came 7 times in the sample even
though it is present only 4 times in the entire data.
# note: below, if you don't specify a seed, you get different random values on
each run
scala> flatBookRDD.takeSample(true, 5, 10L)
res65: Array[String] = Array(lamented, While, books, growth,
education)
scala> flatBookRDD.takeSample(true, 5)
res68: Array[String] = Array(of, came, invention, Latin, in)
scala> flatBookRDD.takeSample(true, 5)
res69: Array[String] = Array(for, the, —, in, ancient)
3.12 saveAsTextFile
scala> flatBookRDD.saveAsTextFile("/home/hadoopuser/temp201225")
scala> flatBookRDD.take(5)
res79: Array[String] = Array(The, history, of, textbooks, dates)
scala> flatBookRDD.top(5)
res80: Array[String] = Array(—, with, with, were, were)
# cache in memory.
scala> flatBookRDD.cache()
res81: flatBookRDD.type = MapPartitionsRDD[48] at flatMap at
<console>:25
scala> flatBookRDD.take(5)
res82: Array[String] = Array(The, history, of, textbooks, dates)
scala> flatBookRDD.getStorageLevel
res84: org.apache.spark.storage.StorageLevel = StorageLevel(memory,
deserialized, 1 replicas)
scala> flatBookRDD.persist()
res85: flatBookRDD.type = MapPartitionsRDD[48] at flatMap at
<console>:25
scala> flatBookRDD.getStorageLevel
res86: org.apache.spark.storage.StorageLevel = StorageLevel(memory,
deserialized, 1 replicas)
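# note: persist() with no arguments is the same as cache(), i.e. MEMORY_ONLY; Spark does not allow changing the storage level of an already-persisted RDD, which is why unpersist() is called below before persisting with DISK_ONLY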
scala> flatBookRDD.unpersist()
res100: flatBookRDD.type = MapPartitionsRDD[48] at flatMap at
<console>:25
scala> flatBookRDD.persist(org.apache.spark.storage.StorageLevel.DISK_ONLY)
res101: flatBookRDD.type = MapPartitionsRDD[48] at flatMap at
<console>:25
scala> flatBookRDD.getStorageLevel
res102: org.apache.spark.storage.StorageLevel = StorageLevel(disk, 1
replicas)
http://spark.apache.org/docs/latest/rdd-programming-guide.html#rdd-persistence
Day 6
3.14 checkpoint
- Debug Purpose
scala> sc.setCheckpointDir("/checkpoint201226")
scala> flatBookRDD.checkpoint()
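# note: checkpoint() is lazy; the RDD is actually written to the checkpoint directory only when an action is next run on it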
3.16 mapPartitions
scala> bookRDD.collect
scala> bookRDD.getNumPartitions
res12: Int = 2
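No mapPartitions call appears under this heading; a minimal sketch that emits one value per partition (the function name and body are assumptions):
scala> def countInPartition(iter: Iterator[String]): Iterator[Int] = Iterator(iter.size)
scala> bookRDD.mapPartitions(countInPartition).collect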
3.17 mapPartitionsWithIndex
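indexFunc is not defined in the transcript; a definition consistent with the output below:
scala> def indexFunc(partitionIndex: Int, iter: Iterator[String]): Iterator[String] =
     |   iter.map(value => s"Partition:$partitionIndex =>$value")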
scala> flatBookRDD.mapPartitionsWithIndex(indexFunc).collect
res15: Array[String] = Array(Partition:0 =>The, Partition:0 =>history,
Partition:0 =>of, Partition:0 =>textbooks, Partition:0 =>dates,
Partition:0 =>back, Partition:0 =>to, Partition:0 =>ancient,
Partition:0 =>civilizations., Partition:0 =>The, Partition:0 =>Greek,
Partition:0 =>philosopher, Partition:0 =>Socrates, Partition:0
=>lamented, Partition:0 =>the, Partition:0 =>loss, Partition:0 =>of,
Partition:0 =>knowledge, Partition:0 =>because, Partition:0 =>the,
Partition:0 =>media, Partition:0 =>of, Partition:0 =>transmission,
Partition:0 =>were, Partition:0 =>changing, Partition:1 =>The,
Partition:1 =>next, Partition:1 =>revolution, Partition:1 =>in,
Partition:1 =>the, Partition:1 =>field, Partition:1 =>of, Partition:1
=>books, Partition:1 =>came, Partition:1 =>with, Partition:1
=>the, ...
3.18 foreachPartition
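forFunc is not defined in the transcript; a sketch that prints a summary per partition (its actual body is an assumption):
scala> def forFunc(iter: Iterator[String]): Unit = println(s"elements in this partition: ${iter.size}")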
scala> flatBookRDD.foreachPartition(forFunc)
3.19 glom
scala> flatBookRDD.glom().collect
res21: Array[Array[String]] = Array(Array(The, history, of, textbooks,
dates, back, to, ancient, civilizations., The, Greek, philosopher,
Socrates, lamented, the, loss, of, knowledge, because, the, media, of,
transmission, were, changing), Array(The, next, revolution, in, the,
field, of, books, came, with, the, 15th-century, invention, of,
printing, with, changeable, type., Gutenberg's, first, and, only,
large-scale, printing, effort, was, the, now, iconic, Gutenberg,
Bible, in, the, 1450s, —, a, Latin, translation, from, the, Hebrew,
Old, Testament, and, the, Greek, New, Testament., While, many,
textbooks, were, already, in, use,, compulsory, education, and, the,
resulting, growth, of, schooling, in, Europe, led, to, the, printing,
of, many, more, textbooks, for, children.))
scala> flatBookRDD.collect
res22: Array[String] = Array(The, history, of, textbooks, dates, back,
to, ancient, civilizations., The, Greek, philosopher, Socrates,
lamented, the, loss, of, knowledge, because, the, media, of,
transmission, were, changing, The, next, revolution, in, the, field,
of, books, came, with, the, 15th-century, invention, of, printing,
with, changeable, type., Gutenberg's, first, and, only, large-scale,
printing, effort, was, the, now, iconic, Gutenberg, Bible, in, the,
1450s, —, a, Latin, translation, from, the, Hebrew, Old, Testament,
and, the, Greek, New, Testament., While, many, textbooks, were,
already, in, use,, compulsory, education, and, the, resulting, growth,
of, schooling, in, Europe, led, to, the, printing, of, many, more,
textbooks, for, children.)
Configurations
{"connectionString": "jdbc::/mysql………", "userId": "xyz", "password": "abc"}
scala> flatBookRDD.collect
res23: Array[String] = Array(The, history, of, textbooks, dates, back,
to, ancient, civilizations., The, Greek, philosopher, Socrates,
lamented, the, loss, of, knowledge, because, the, media, of,
transmission, were, changing, The, next, revolution, in, the, field,
of, books, came, with, the, 15th-century, invention, of, printing,
with, changeable, type., Gutenberg's, first, and, only, large-scale,
printing, effort, was, the, now, iconic, Gutenberg, Bible, in, the,
1450s, —, a, Latin, translation, from, the, Hebrew, Old, Testament,
and, the, Greek, New, Testament., While, many, textbooks, were,
already, in, use,, compulsory, education, and, the, resulting, growth,
of, schooling, in, Europe, led, to, the, printing, of, many, more,
textbooks, for, children.)
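The definition of keyValueRDD is not shown; based on the output below (key = word length, value = word), it was presumably:
scala> val keyValueRDD = flatBookRDD.map(word => (word.length, word))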
scala> keyValueRDD.collect
res29: Array[(Int, String)] = Array((3,The), (7,history), (2,of),
(9,textbooks), (5,dates), (4,back), (2,to), (7,ancient),
(14,civilizations.), (3,The), (5,Greek), (11,philosopher),
(8,Socrates), (8,lamented), (3,the), (4,loss), (2,of), (9,knowledge),
(7,because), (3,the), (5,media), (2,of), (12,transmission), (4,were),
(8,changing), (3,The), (4,next), (10,revolution), (2,in), (3,the),
(5,field), (2,of), (5,books), (4,came), (4,with), (3,the), (12,15th-
century), (9,invention), (2,of), (8,printing), (4,with),
(10,changeable), (5,type.), (11,Gutenberg's), (5,first), (3,and),
(4,only), (11,large-scale), (8,printing), (6,effort), (3,was),
(3,the), (3,now), (6,iconic), (9,Gutenberg), (5,Bible), (2,in),
(3,the), (5,1450s), (1,—), (1,a), (5,Latin), (11,translation),
(4,from), (3,the), (6,Hebre...
scala> keyValueRDD.keys.collect
res33: Array[Int] = Array(3, 7, 2, 9, 5, 4, 2, 7, 14, 3, 5, 11, 8, 8,
3, 4, 2, 9, 7, 3, 5, 2, 12, 4, 8, 3, 4, 10, 2, 3, 5, 2, 5, 4, 4, 3,
12, 9, 2, 8, 4, 10, 5, 11, 5, 3, 4, 11, 8, 6, 3, 3, 3, 6, 9, 5, 2, 3,
5, 1, 1, 5, 11, 4, 3, 6, 3, 9, 3, 3, 5, 3, 10, 5, 4, 9, 4, 7, 2, 4,
10, 9, 3, 3, 9, 6, 2, 9, 2, 6, 3, 2, 3, 8, 2, 4, 4, 9, 3, 9)
scala> keyValueRDD.values.collect
res34: Array[String] = Array(The, history, of, textbooks, dates, back,
to, ancient, civilizations., The, Greek, philosopher, Socrates,
lamented, the, loss, of, knowledge, because, the, media, of,
transmission, were, changing, The, next, revolution, in, the, field,
of, books, came, with, the, 15th-century, invention, of, printing,
with, changeable, type., Gutenberg's, first, and, only, large-scale,
printing, effort, was, the, now, iconic, Gutenberg, Bible, in, the,
1450s, —, a, Latin, translation, from, the, Hebrew, Old, Testament,
and, the, Greek, New, Testament., While, many, textbooks, were,
already, in, use,, compulsory, education, and, the, resulting, growth,
of, schooling, in, Europe, led, to, the, printing, of, many, more,
textbooks, for, children.)
Day 7
scala> sc.setLogLevel("ERROR")
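keyValueRDD is redefined for Day 7 but the command is not shown; based on the output below (key = word, value = word length), it was presumably:
scala> val keyValueRDD = flatBookRDD.map(word => (word, word.length))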
scala> keyValueRDD.collect
res2: Array[(String, Int)] = Array((The,3), (history,7), (of,2),
(textbooks,9), (dates,5), (back,4), (to,2), (ancient,7),
(civilizations.,14), (The,3), (Greek,5), (philosopher,11),
(Socrates,8), (lamented,8), (the,3), (loss,4), (of,2), (knowledge,9),
(because,7), (the,3), (media,5), (of,2), (transmission,12), (were,4),
(changing,8), (The,3), (next,4), (revolution,10), (in,2), (the,3),
(field,5), (of,2), (books,5), (came,4), (with,4), (the,3), (15th-
century,12), (invention,9), (of,2), (printing,8), (with,4),
(changeable,10), (type.,5), (Gutenberg's,11), (first,5), (and,3),
(only,4), (large-scale,11), (printing,8), (effort,6), (was,3),
(the,3), (now,3), (iconic,6), (Gutenberg,9), (Bible,5), (in,2),
(the,3), (1450s,5), (—,1), (a,1), (Latin,5), (translation,11),
(from,4), (the,3), (Hebrew,6...
scala> keyValueRDD.lookup("history")
res5: Seq[Int] = WrappedArray(7)
scala> keyValueRDD.lookup("were")
res3: Seq[Int] = WrappedArray(4, 4)
scala> keyValueRDD.lookup("the")
res4: Seq[Int] = WrappedArray(3, 3, 3, 3, 3, 3, 3, 3, 3, 3)
# separate all the characters present in the data and make key-value pairs
such as (t,1), (h,1), (e,1)
# split into individual characters
scala> val charBookRDD = flatBookRDD.flatMap(word =>
word.toLowerCase.toSeq)
charBookRDD: org.apache.spark.rdd.RDD[Char] = MapPartitionsRDD[2] at
flatMap at <console>:25
scala> charBookRDD.collect
res3: Array[Char] = Array(t, h, e, h, i, s, t, o, r, y, o, f, t, e, x,
t, b, o, o, k, s, d, a, t, e, s, b, a, c, k, t, o, a, n, c, i, e, n,
t, c, i, v, i, l, i, z, a, t, i, o, n, s, ., t, h, e, g, r, e, e, k,
p, h, i, l, o, s, o, p, h, e, r, s, o, c, r, a, t, e, s, l, a, m, e,
n, t, e, d, t, h, e, l, o, s, s, o, f, k, n, o, w, l, e, d, g, e, b,
e, c, a, u, s, e, t, h, e, m, e, d, i, a, o, f, t, r, a, n, s, m, i,
s, s, i, o, n, w, e, r, e, c, h, a, n, g, i, n, g, t, h, e, n, e, x,
t, r, e, v, o, l, u, t, i, o, n, i, n, t, h, e, f, i, e, l, d, o, f,
b, o, o, k, s, c, a, m, e, w, i, t, h, t, h, e, 1, 5, t, h, -, c, e,
n, t, u, r, y, i, n, v, e, n, t, i, o, n, o, f, p, r, i, n, t, i, n,
g, w, i, t, h, c, h, a, n, g, e, a, b, l, e, t, y, p, e, ., g, u, t,
e, n, b, e, r, g, ', s, f, i, r, s, ...
3.7 countByKey
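kvCharBookRDD is not defined in the transcript; based on the (character, 1) pairs shown later, it was presumably:
scala> val kvCharBookRDD = charBookRDD.map(letter => (letter, 1))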
scala> kvCharBookRDD.countByKey
res5: scala.collection.Map[Char,Long] = Map(e -> 69, s -> 28, x -> 4,
4 -> 1, n -> 46, . -> 4, y -> 8, t -> 55, u -> 10, f -> 13, a -> 30, 5
-> 2, m -> 11, i -> 38, - -> 2, , -> 1, v -> 3, 1 -> 2, — -> 1, ' ->
1, b -> 12, g -> 17, l -> 22, p -> 8, 0 -> 1, c -> 16, h -> 26, r ->
29, w -> 11, k -> 8, o -> 44, z -> 1, d -> 13)
3.8 countByKeyApprox
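No example is shown for this heading; a minimal sketch (the 1000 ms timeout is an assumption):
scala> kvCharBookRDD.countByKeyApprox(1000)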
3.9 groupByKey
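No example is shown for this heading either; a sketch that reproduces the per-character counts via groupByKey:
scala> kvCharBookRDD.groupByKey().mapValues(values => values.sum).collect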
3.10 reduceByKey
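additionFunction is not defined in the transcript; the result below matches a simple sum of the values, so it was presumably:
scala> def additionFunction(left: Int, right: Int): Int = left + right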
scala> kvCharBookRDD.reduceByKey(additionFunction).collect
res15: Array[(Char, Int)] = Array((d,13), (z,1), (4,1), (p,8), (x,4),
(t,55), (.,4), (b,12), (0,1), (h,26), (n,46), (—,1), (f,13), (v,3),
(r,29), (l,22), (,,1), (w,11), (s,28), (e,69), (',1), (5,2), (a,30),
(i,38), (k,8), (y,8), (u,10), (o,44), (-,2), (1,2), (g,17), (m,11),
(c,16))
3.11 aggregate
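The numbers RDD is not defined in the transcript; the glom output below implies:
scala> val numbers = sc.parallelize(1 to 20, 4)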
scala> numbers.glom().collect()
res22: Array[Array[Int]] = Array(Array(1, 2, 3, 4, 5), Array(6, 7, 8,
9, 10), Array(11, 12, 13, 14, 15), Array(16, 17, 18, 19, 20))
scala> numbers.aggregate(0)(_ + _, _ + _)
res23: Int = 210
scala> numbers.aggregate(1)(_ + _, _ + _)
res37: Int = 215
1 + 15 = 16        (zero value 1 plus partition 1's sum, 1+2+3+4+5)
1 + 16 + 40 = 57   (zero value plus the running result plus partition 2's sum)
1 + 57 + 65 = 123  (plus partition 3's sum)
1 + 123 + 90 = 214 (plus partition 4's sum)
1 + 214 = 215      (zero value applied once more at the final combine)
Optional
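numbers is redefined here; the glom output below implies:
scala> val numbers = sc.parallelize(1 to 4, 2)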
scala> numbers.glom.collect
res39: Array[Array[Int]] = Array(Array(1, 2), Array(3, 4))
scala> numbers.aggregate(2)(_ * _, _ * _)
res46: Int = 192
2 x 2 = 4        (zero value 2 times partition 1's product, 1 x 2)
2 x 4 x 12 = 96  (zero value times the running result times partition 2's product, 3 x 4 = 12)
2 x 96 = 192     (zero value applied once more at the final combine)
3.12 treeAggregate
With treeAggregate, even the second (combine) operation happens at the executor
level, depending on the depth.
scala> numbers.treeAggregate(0)(_ + _, _ + _, 3)
res48: Int = 10
3.13 aggregateByKey
scala> kvCharBookRDD.collect
res50: Array[(Char, Int)] = Array((t,1), (h,1), (e,1), (h,1), (i,1),
(s,1), (t,1), (o,1), (r,1), (y,1), (o,1), (f,1), (t,1), (e,1), (x,1),
(t,1), (b,1), (o,1), (o,1), (k,1), (s,1), (d,1), (a,1), (t,1), (e,1),
(s,1), (b,1), (a,1), (c,1), (k,1), (t,1), (o,1), (a,1), (n,1), (c,1),
(i,1), (e,1), (n,1), (t,1), (c,1), (i,1), (v,1), (i,1), (l,1), (i,1),
(z,1), (a,1), (t,1), (i,1), (o,1), (n,1), (s,1), (.,1), (t,1), (h,1),
(e,1), (g,1), (r,1), (e,1), (e,1), (k,1), (p,1), (h,1), (i,1), (l,1),
(o,1), (s,1), (o,1), (p,1), (h,1), (e,1), (r,1), (s,1), (o,1), (c,1),
(r,1), (a,1), (t,1), (e,1), (s,1), (l,1), (a,1), (m,1), (e,1), (n,1),
(t,1), (e,1), (d,1), (t,1), (h,1), (e,1), (l,1), (o,1), (s,1), (s,1),
(o,1), (f,1), (k,1), (n,1), (o,1), (w,1), (l,1), (e,1), (d,1), (g,1),
(e,1), (b,1), (e,1), (c,1), ...
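The aggregateByKey call itself is missing under this heading; a sketch that reproduces the per-character counts (the zero value and functions are assumptions):
scala> kvCharBookRDD.aggregateByKey(0)(_ + _, _ + _).collect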
3.14 cogroup
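kvCharBookRDD2 is not defined in the transcript; the CompactBuffer of 2s in the output below implies:
scala> val kvCharBookRDD2 = charBookRDD.map(letter => (letter, 2))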
scala> kvCharBookRDD.cogroup(kvCharBookRDD2).collect
res56: Array[(Char, (Iterable[Int], Iterable[Int]))] = Array((d,
(CompactBuffer(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1),CompactBuffer(2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2))), (z,
(CompactBuffer(1),CompactBuffer(2))), (4,
(CompactBuffer(1),CompactBuffer(2))), (p,(CompactBuffer(1, 1, 1, 1, 1,
1, 1, 1),CompactBuffer(2, 2, 2, 2, 2, 2, 2, 2))), (x,(CompactBuffer(1,
1, 1, 1),CompactBuffer(2, 2, 2, 2))), (t,(CompactBuffer(1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1),CompactBuffer(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2))), (b,
(CompactBuffer(1, 1, 1, 1, 1, 1...
3.15 joins
scala> kvCharBookRDD.join(kvCharBookRDD2).collect
res57: Array[(Char, (Int, Int))] = Array((d,(1,2)), (d,(1,2)), (d,
(1,2)), (d,(1,2)), (d,(1,2)), (d,(1,2)), (d,(1,2)), (d,(1,2)), (d,
(1,2)), (d,(1,2)), (d,(1,2)), (d,(1,2)), (d,(1,2)), (d,(1,2)), (d,
(1,2)), (d,(1,2)), (d,(1,2)), (d,(1,2)), (d,(1,2)), (d,(1,2)), (d,
(1,2)), (d,(1,2)), (d,(1,2)), (d,(1,2)), (d,(1,2)), (d,(1,2)), (d,
(1,2)), (d,(1,2)), (d,(1,2)), (d,(1,2)), (d,(1,2)), (d,(1,2)), (d,
(1,2)), (d,(1,2)), (d,(1,2)), (d,(1,2)), (d,(1,2)), (d,(1,2)), (d,
(1,2)), (d,(1,2)), (d,(1,2)), (d,(1,2)), (d,(1,2)), (d,(1,2)), (d,
(1,2)), (d,(1,2)), (d,(1,2)), (d,(1,2)), (d,(1,2)), (d,(1,2)), (d,
(1,2)), (d,(1,2)), (d,(1,2)), (d,(1,2)), (d,(1,2)), (d,(1,2)), (d,
(1,2)), (d,(1,2)), (d,(1,2)), (d,(1,2)), (d,(1,2)), (d,(1,2)), (d,
(1,2)), (d,(1,2)), (d,(1,2)), (d,(1,2)), (d,(1,2)), (d,(1,2)), (d,
(1,2)...
Day 8:
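wordRDD and costRDD are not defined in the transcript; based on the collect outputs below they were presumably:
scala> val wordRDD = sc.parallelize(Seq("bmw", "audi", "mercedes", "suzuki"))
scala> val costRDD = sc.parallelize(Seq(100000, 200000, 300000, 50000))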
scala> wordRDD.collect
res2: Array[String] = Array(bmw, audi, mercedes, suzuki)
scala> costRDD.collect
res3: Array[Int] = Array(100000, 200000, 300000, 50000)
# zip: Zips this RDD with another one, returning key-value pairs whose first
element comes from this RDD and whose second element comes from the other RDD
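The zip call itself is not shown; it was presumably:
scala> val zippedRDD = wordRDD.zip(costRDD)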
scala> zippedRDD.collect
res4: Array[(String, Int)] = Array((bmw,100000),
(audi,200000), (mercedes,300000), (suzuki,50000))
3.2 coalesce
can only reduce the number of partitions
avoids a full reshuffle of the data
3.3 repartition
can reduce or increase the number of partitions
repartition causes a full shuffle of the data
When should you reduce the number of partitions, and when should you increase
them?
Reduce – example: you apply a filter on an RDD and the amount of data in it
shrinks, so there is no point keeping the data spread across many partitions;
you can reduce the number of partitions.
Increase – example: repartition
val carRDD = RDD("x1", "x2", "x3", "x4" ……… 100's of model names, 10 partitions)
Lab
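carRDD is not defined in the transcript; based on the outputs below it holds these ten model names (the parallelize call and the original partition count are assumptions):
scala> val carRDD = sc.parallelize(Seq("bmw", "audi", "mercedes", "suzuki", "toyota", "chevorlete", "saab", "honda", "mazda", "toyota"))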
scala> carRDD.coalesce(3).glom().collect()
res6: Array[Array[String]] = Array(Array(bmw, audi), Array(mercedes,
suzuki, toyota), Array(chevorlete, saab, honda, mazda, toyota))
scala> carRDD.repartition(3).glom().collect()
res7: Array[Array[String]] = Array(Array(suzuki, mazda), Array(bmw,
toyota, chevorlete, toyota), Array(audi, mercedes, saab, honda))
scala> carRDD.repartition(6).glom().collect()
res9: Array[Array[String]] = Array(Array(suzuki), Array(bmw, toyota),
Array(audi, honda), Array(mazda), Array(chevorlete, toyota),
Array(mercedes, saab))
DEST_COUNTRY_NAME,ORIGIN_COUNTRY_NAME,count
United States,Romania,15
United States,Croatia,1
United States,Ireland,344
Egypt,United States,15
United States,India,62
United States,Singapore,1
United States,Grenada,62
Costa Rica,United States,588
Senegal,United States,40
Moldova,United States,1
United States,Sint Maarten,325
United States,Marshall Islands,39
Guyana,United States,64
Malta,United States,1
Anguilla,United States,41
Bolivia,United States,30
United States,Paraguay,6
Algeria,United States,4
Turks and Caicos Islands,United States,230
United States,Gibraltar,1
Saint Vincent and the Grenadines,United States,1
Italy,United States,382
United States,Federated States of Micronesia,69
United States,Russia,161
Pakistan,United States,12
United States,Netherlands,660
Iceland,United States,181
Marshall Islands,United States,42
Luxembourg,United States,155
Honduras,United States,362
The Bahamas,United States,955
i. hadoopuser@hadoopuser-VirtualBox:~$ vi flightData.csv
ii. copy the data
iii. to save and exit, press Esc, then type :wq!
Option B: If you are not comfortable using vi
Step 2: Copy the data from the local Linux machine into the Hadoop (HDFS)
directory
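The actual commands are not shown; typical forms would be the following (the HDFS target path and the read options are assumptions; inferSchema matches the IntegerType count column in the schema below):
hadoopuser@hadoopuser-VirtualBox:~$ hdfs dfs -put flightData.csv /flightData.csv
scala> val df = spark.read.option("header", "true").option("inferSchema", "true").csv("/flightData.csv")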
scala> df.show(5)
+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
| United States| Romania| 15|
| United States| Croatia| 1|
| United States| Ireland| 344|
| Egypt| United States| 15|
| United States| India| 62|
+-----------------+-------------------+-----+
only showing top 5 rows
scala> df.schema
res11: org.apache.spark.sql.types.StructType =
StructType(StructField(DEST_COUNTRY_NAME,StringType,true),
StructField(ORIGIN_COUNTRY_NAME,StringType,true),
StructField(count,IntegerType,true))
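The rdd used below is not defined in the transcript; it was presumably derived from the DataFrame:
scala> val rdd = df.rdd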
scala> rdd.collect
res12: Array[org.apache.spark.sql.Row] = Array([Saint Vincent and
the Grenadines,United States,1], [United States,Singapore,1],
[Moldova,United States,1], [United States,Netherlands,660], [United
States,Grenada,62], [Honduras,United States,362], [Pakistan,United
States,12], [Marshall Islands,United States,42], [United
States,India,62], [Bolivia,United States,30], [United
States,Gibraltar,1], [Malta,United States,1], [Anguilla,United
States,41], [United States,Marshall Islands,39], [Costa Rica,United
States,588], [United States,Ireland,344], [Algeria,United States,4],
[Turks and Caicos Islands,United States,230], [The Bahamas,United
States,955], [Italy,United States,382], [United States,Paraguay,6],
[Luxembourg,United States,155], [United States,Croatia,1],
[Guyana,United States,64], [Eg...
scala> rdd.glom().collect()
res13: Array[Array[org.apache.spark.sql.Row]] = Array(Array([Saint
Vincent and the Grenadines,United States,1], [United
States,Singapore,1], [Moldova,United States,1], [United
States,Netherlands,660]), Array([United States,Grenada,62],
[Honduras,United States,362], [Pakistan,United States,12], [Marshall
Islands,United States,42]), Array([United States,India,62],
[Bolivia,United States,30], [United States,Gibraltar,1],
[Malta,United States,1]), Array([Anguilla,United States,41], [United
States,Marshall Islands,39], [Costa Rica,United States,588], [United
States,Ireland,344]), Array([Algeria,United States,4], [Turks and
Caicos Islands,United States,230], [The Bahamas,United States,955],
[Italy,United States,382]), Array([United States,Paraguay,6],
[Luxembourg,United States,155], [United S...
scala> rdd.getNumPartitions
res14: Int = 8
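keyedRDD is not defined in the transcript; the (Any, Row) type and the keyBy shown in the output below imply something like this (keying on the first column, DEST_COUNTRY_NAME):
scala> val keyedRDD = rdd.keyBy(row => row(0))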
keyedRDD: org.apache.spark.rdd.RDD[(Any,
org.apache.spark.sql.Row)] = MapPartitionsRDD[40] at keyBy at
<console>:26
scala> keyedRDD.collect
res15: Array[(Any, org.apache.spark.sql.Row)] = Array((Saint
Vincent and the Grenadines,[Saint Vincent and the
Grenadines,United States,1]), (United States,[United
States,Singapore,1]), (Moldova,[Moldova,United States,1]),
(United States,[United States,Netherlands,660]), (United
States,[United States,Grenada,62]), (Honduras,[Honduras,United
States,362]), (Pakistan,[Pakistan,United States,12]),
(Marshall Islands,[Marshall Islands,United States,42]),
(United States,[United States,India,62]), (Bolivia,
[Bolivia,United States,30]), (United States,[United
States,Gibraltar,1]), (Malta,[Malta,United States,1]),
(Anguilla,[Anguilla,United States,41]), (United States,[United
States,Marshall Islands,39]), (Costa Rica,[Costa Rica,United
States,588]), (United States,[United States,Ireland,344]),
(Al...
scala>
scala> keyedRDD.partitionBy(new
HashPartitioner(8)).glom().collect()
<console>:27: error: not found: type HashPartioner
keyedRDD.partitionBy(new
HashPartioner(8)).glom().collect()
^
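The attempt above fails only because the class name is misspelled (HashPartioner); the class is org.apache.spark.HashPartitioner and, if it is not already in scope, an import like the following (not shown in the transcript) is needed before the corrected call:
scala> import org.apache.spark.HashPartitioner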
scala> keyedRDD.partitionBy(new
HashPartitioner(8)).glom().collect()
res17: Array[Array[(Any, org.apache.spark.sql.Row)]] =
Array(Array((Moldova,[Moldova,United States,1]), (The Bahamas,
[The Bahamas,United States,955])), Array((Costa Rica,[Costa
Rica,United States,588]), (Algeria,[Algeria,United States,4]),
(Guyana,[Guyana,United States,64]), (Senegal,[Senegal,United
States,40])), Array((Turks and Caicos Islands,[Turks and
Caicos Islands,United States,230])), Array((Anguilla,
[Anguilla,United States,41]), (Italy,[Italy,United
States,382]), (Egypt,[Egypt,United States,15])),
Array((Honduras,[Honduras,United States,362]), (Marshall
Islands,[Marshall Islands,United States,42]), (Bolivia,
[Bolivia,United States,30])), Array((Pakistan,[Pakistan,United
States,12]), (Malta,[Malta,United States,1])), Array((Saint
Vincent and the Grenadines,[Saint Vincent and the G...
Next Variation
scala> a.glom().collect()
res20: Array[Array[Any]] = Array(Array(Italy), Array(Moldova,
United States, Costa Rica, United States, Algeria, Turks and
Caicos Islands, United States, Luxembourg, United States,
Guyana, Egypt, United States, United States, Iceland, United
States), Array(Saint Vincent and the Grenadines, United
States, United States, Honduras, Pakistan, Marshall Islands,
United States, Bolivia, United States, Malta, Anguilla, United
States, The Bahamas, United States, Senegal))