Big Data Practice
============
============
use zeyodb1;
quit
============
============
cd
echo -n cloudera>/home/cloudera/pfile
sqoop job --create incre_d -- import --connect jdbc:mysql://localhost/zeyodb1 --username root --password-file file:///home/cloudera/pfile --table custjob1 --m 1 --target-dir /user/cloudera/jobdir_dt --incremental append --check-column tdate --last-value 0000-00-00
============
============
use zeyodb1;
quit
======
======
Cloud Import
Cloudera Folks
use zdb;
quit
cd
rm -rf awscli-bundle.zip
rm -rf awscli-bundle
aws=/home/cloudera/bin/aws
cd
rm -rf .aws
mkdir .aws
cd .aws
wget https://srizeyo.s3.amazonaws.com/credentials
cd
aws s3 ls s3://srizeyo/
Task 1
Cloudera folks --
use ad;
quit
Task 1 --
sqoop import --connect jdbc:mysql://localhost/ad --username root --password cloudera --table ttab --m 1 --delete-target-dir --target-dir /user/cloudera/pdata --as-parquetfile
Task 2 --
sqoop import --connect jdbc:mysql://localhost/ad --username root --password cloudera --table ttab --m 1 --delete-target-dir --target-dir /user/cloudera/sdata --as-sequencefile
07-01-2023
================
================
use map;
quit
====================
1 Mappers
====================
====================
2 Mappers
====================
====================
No Mappers
====================
Cloudera Folks
sqoop import --connect jdbc:mysql://localhost/map --username root --password cloudera --table mtab --m 2 --delete-target-dir --target-dir /user/cloudera/mtab2
Task 2
08-01-2023
Task 1
======================
======================
use exp;
quit
cd
echo 1,zeyo,40>zfile
echo 2,ravi,70>>zfile
echo 3,rani,70>>zfile
use exp;
quit
Task 2
hive (enter)
Cloudera Folks
use zdb;
Task 3
use exp;
14-01-2023
Cloudera Folks
use zeyodb;
Task 1 ---
Cloudera folks
cd
echo 1,sai>tfile
echo 2,zeyo>>tfile
hive (enter)
use cdb;
create table ztab(id int,name string) row format delimited fields terminated by ',' stored as textfile;
15-01-2023
Cloudera Folks
cd
echo 1,sai>tfile
echo 2,zeyo>>tfile
use hdb;
create table ztab(id int,name string) row format delimited fields terminated by ',';
Cloudera Folks
cd
echo 1,sai>file1
echo 2,zeyo>>file1
use mdb;
create table atab(id int,name string) row format delimited fields terminated by ',' location
'/user/cloudera/adir';
Task 1 ----
Cloudera Folks
cd
echo 1,sai>file1
echo 2,zeyo>>file1
echo 3,anil>file2
echo 4,rita>>file2
use adb;
create table tab1(id int,name string) row format delimited fields terminated by ',' location
'/user/cloudera/dir1';
create table tab2(id int,name string) row format delimited fields terminated by ',' location
'/user/cloudera/dir2';
21-01-2023
Types of Tables
Cloudera Folks
cd
echo 1,sai>data.csv
echo 2,zeyo>>data.csv
hadoop fs -mkdir /user/cloudera/mdir
hive
create table mtab(id int,name string) row format delimited fields terminated by ',' location
'/user/cloudera/mdir';
create external table etab(id int,name string) row format delimited fields terminated by ',' location
'/user/cloudera/edir';
=================
=================
cd
echo 1,Sai,I>INDTxns.csv
echo 2,zeyo,I>>INDTxns.csv
echo 3,Hema,K>UKTxns.csv
echo 4,ravi,K>>UKTxns.csv
echo 5,Jai,S>USTxns.csv
echo 6,Swathi,S>>USTxns.csv
hive
use pdb;
create table parttab(id int,name string,chk string) partitioned by (country string) row format
delimited fields terminated by ',' location '/user/cloudera/pdir';
=================
=================
cd
echo 1,Sai,I>INDTxns.csv
echo 2,zeyo,I>>INDTxns.csv
echo 3,Hema,K>UKTxns.csv
echo 4,ravi,K>>UKTxns.csv
echo 5,Jai,S>USTxns.csv
echo 6,Swathi,S>>USTxns.csv
hive
use pdb;
drop table parttab;
create table parttab(id int,name string,chk string) partitioned by (country string) row format
delimited fields terminated by ',' location '/user/cloudera/pdir';
Updated Commands
=================
=================
cd
echo 1,Sai,I>INDTxns.csv
echo 2,zeyo,I>>INDTxns.csv
echo 3,Hema,K>UKTxns.csv
echo 4,ravi,K>>UKTxns.csv
echo 5,Jai,S>USTxns.csv
echo 6,Swathi,S>>USTxns.csv
hive
use pdb;
22-01-2023
=================
Cloudera Folks
=================
============
Data Ready
============
cd
echo 1,Sai,I,IND>allcountry.csv
echo 2,zeyo,I,IND>>allcountry.csv
echo 3,Hema,K,UK>>allcountry.csv
echo 4,Gomathi,K,UK>>allcountry.csv
echo 5,Jai,S,US>>allcountry.csv
echo 6,Swathi,S,US>>allcountry.csv
============
============
use pdb;
============
============
create table sitab(id int,name string,chk string) partitioned by (country string) row format delimited
fields terminated by ',' location '/user/cloudera/sidir';
============
============
create table srctab(id int,name string,chk string,country string) row format delimited fields
terminated by ',' location '/user/cloudera/sdir';
============
============
============
============
insert into sitab partition(country='USA') select id,name,chk from srctab where country='US';
============
============
create table dyntab(id int,name string,chk string) partitioned by (country string) row format
delimited fields terminated by ',' location '/user/cloudera/dyndir';
============
Dynamic Insert
============
set hive.exec.dynamic.partition.mode=nonstrict;
=================
=================
cd
echo 1,Sai,I,IND>allcountry.csv
echo 2,zeyo,I,IND>>allcountry.csv
echo 3,Hema,K,UK>>allcountry.csv
echo 4,Gomathi,K,UK>>allcountry.csv
echo 5,Jai,S,US>>allcountry.csv
echo 6,Swathi,S,US>>allcountry.csv
hive
use pdb;
create table dyntab(id int,name string,chk string) partitioned by (country string) row format
delimited fields terminated by ',' location '/user/cloudera/dyndir';
create table sttab(id int,name string,chk string,country string) row format delimited fields
terminated by ',' location '/user/cloudera/stdir';
set hive.exec.dynamic.partition.mode=nonstrict;
Dynamic Insert
============
set hive.exec.dynamic.partition.mode=nonstrict;
============
Dynamic Insert
============
set hive.exec.dynamic.partition.mode=nonstrict;
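The insert statement under these Dynamic Insert headings is not captured in the notes. A minimal sketch of a dynamic-partition insert from sttab into dyntab, written here through spark.sql (the embedded statement works the same way in the Hive shell):
spark.sql("use pdb")
spark.sql("set hive.exec.dynamic.partition.mode=nonstrict")
// the country partition value is taken from the last selected column
spark.sql("insert into table dyntab partition(country) select id,name,chk,country from sttab")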
==================
Task 1 -----
cd
echo 1,Sai,I,IND,cash>allc.csv
echo 2,zeyo,I,IND,credit>>allc.csv
echo 3,Hema,K,UK,cash>>allc.csv
echo 4,Gomathi,K,UK,credit>>allc.csv
echo 5,Jai,S,US,cash>>allc.csv
echo 6,Swathi,S,US,credit>>allc.csv
echo 7,Sai,I,IND,credit>>allc.csv
echo 8,zeyo,I,IND,cash>>allc.csv
echo 9,Hema,K,UK,credit>>allc.csv
echo 10,Gomathi,K,UK,cash>>allc.csv
echo 11,Jai,S,US,credit>>allc.csv
echo 12,Swathi,S,US,cash>>allc.csv
create table srcs(id int,name string,chk string,country string,spendby string) row format delimited
fields terminated by ',' location '/user/cloudera/srcd';
set hive.exec.dynamic.partition.mode=nonstrict;
28-01-2023
===============
Go to Mysql
===============
use prodb;
insert into customer_src select * From customer_total where id>=301 and id<=330;
quit
=============================
Edge Node
=============================
rm -rf /home/cloudera/avsrcdir
mkdir /home/cloudera/avsrcdir
cd /home/cloudera/avsrcdir
echo -n cloudera>/home/cloudera/passfile
Hive shell
====================================
hive
use prodb;
Cloudera avro
========
use dataa;
quit;
========
Edge Node
========
hive
use adb;
create table atab(id int,name string,amount int) stored as avro location '/user/cloudera/adir';
use dataa;
quit
========
Edge Node
========
hive
Performance Tuning
partitions
parallel execution
vectorization
bucketing
hive execution Engine
Hive MapSide Join
Downloads
=============
Scala IDE
=============
Linux ---
http://downloads.typesafe.com/scalaide-pack/4.7.0-vfinal-oxygen-212-20170929/scala-SDK-4.7.0-vfinal-2.12-linux.gtk.x86_64.tar.gz
=============
Intellij Download
=============
LINUX ---
https://www.jetbrains.com/idea/download/download-thanks.html?platform=linux&code=IIC
WinUtils Download
Windows ----
https://github.com/steveloughran/winutils/raw/master/hadoop-2.7.1/bin/winutils.exe
=============
Spark Download ----MAC/Windows/Ubuntu
=============
https://archive.apache.org/dist/spark/spark-2.4.7/spark-2.4.7-bin-hadoop2.6.tgz
****For Scala IDE --- just copy the download link and paste it into a browser****
04-02-2023
Task 1 ----
Task 2 --- Execute project 1 Insert statement and give your analysis -- How many partitions created
===============
Go to Mysql
===============
insert into customer_src select * From customer_total where id>=301 and id<=330;
quit
=============================
Edge Node
=============================
rm -rf /home/cloudera/avsrcdir
mkdir /home/cloudera/avsrcdir
cd /home/cloudera/avsrcdir
echo -n cloudera>/home/cloudera/passfile
====================================
Hive shell
====================================
hive
create database prodb;
use prodb;
set hive.exec.max.dynamic.partitions=1000;
set hive.exec.dynamic.partition.mode=nonstrict;
Performance Tuning
partitions
parallel execution
vectorization
bucketing
hive execution Engine
Hive MapSide Join
Other requirements
Hive Restrictions
I found a tool
Extremely simple
Extremely fast
My tool can process --- HDFS, cloud, Windows, Linux, macOS
You do not require an ingestion tool
My tool can bring the data -- process it in SQL -- deliver the data
My tool supports SQL for processing
Machine Learning algorithms
Very, very good streaming data processing
Hadoop is just one of the options
Easily customizable
============================================
One formula to read and write for any source and destination
=============================================
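A minimal Scala sketch of that one formula, assuming a spark session is available (the format names and paths here are just placeholders):
// read: spark.read.format(<source>).load(<path>)
val df = spark.read.format("csv").option("header","true").load("file:///C:/data/usdata.csv")
df.show()
// write: df.write.format(<destination>).save(<path>)
df.write.format("json").mode("overwrite").save("file:///C:/data/out")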
Spark
DATABRICKS
2016 – Spark
===============
Go to Mysql
===============
insert into customer_src select * From customer_total where id>=301 and id<=330;
quit
=============================
Edge Node
=============================
rm -rf /home/cloudera/avsrcdir
mkdir /home/cloudera/avsrcdir
cd /home/cloudera/avsrcdir
echo -n cloudera>/home/cloudera/passfile
====================================
Hive shell
====================================
hive
create database prodb;
use prodb;
set hive.exec.max.dynamic.partitions=1000;
set hive.exec.dynamic.partition.mode=nonstrict;
11-02-2023
object obj {
def main(args:Array[String]):Unit=
{
println("zeyobron")
val d = 10
println(d)
}
}
***************
object obj {
def main(args:Array[String]):Unit=
{
val list = List(1,2,3) // a sample list so the snippet compiles; the original did not define it
println(list)
}
}
12-02-2023
package pack
object obj {
def main(args:Array[String]):Unit=
{
println("===Started===")
val a = 2
println(a+1)
val b = "zeyobron"
println(b + "-analytics")
}
}
*******************
Code
package pack
object obj {
def main(args:Array[String]):Unit=
{
println("===Started===")
val ls = List(1,2,3,4)
println
println("===raw list===")
ls.foreach(println)
val procls = ls.filter( x => x > 2 )
println
println("===proc list===")
procls.foreach(println)
}
}
package pack
object obj {
def main(args:Array[String]):Unit=
{
println("===Started===")
println
val lst = List("zeyo" , "zeyobron" , "sai")
println("====raw list===")
lst.foreach(println)
val procls = lst.map( x => x + ",analytics" )
println
println("==== process list===")
procls.foreach(println)
}
}
println
val lst = List("zeyo" , "zeyobron" , "sai")
println("====raw list===")
lst.foreach(println)
val procls = lst.filter( x => x.contains("zeyo") )
println
println("==== filter list===")
procls.foreach(println)
Task 2 -- (Optional)
18-02-2023
package pack
object obj {
def main(args:Array[String]):Unit={
println("=====started =====")
println
println("====Raw List=====")
val listr = List( "A-B" ,"C-D" ,"E-F")
listr.foreach(println)
println("====Proc List=====")
val flatdata = listr.flatMap( x => x.split("-"))
flatdata.foreach(println)
}
}
package pack
object obj {
def main(args:Array[String]):Unit={
println("=====started =====")
println
println("====Raw List=====")
println
val liststr = List( "Amazon-Jeff-America",
"Microsoft-BillGates-America",
"TCS-TATA-india",
"Reliance-Ambani-india")
liststr.foreach(println)
println
println("===filter list==")
println
val fillist = liststr.filter( x => x.contains("india"))
fillist.foreach(println)
println("=====flat map=====")
val flatdata = fillist.flatMap( x => x.split("-"))
flatdata.foreach(println)
println
println("=====replace map=====")
println
val repdata = flatdata.map( x => x.replace("india","local"))
repdata.foreach(println)
println
println("=====lower map=====")
println
val lowdata = repdata.map( x => x.toLowerCase() )
lowdata.foreach(println)
}
}
Single Line Code
package pack
object obj {
def main(args:Array[String]):Unit={
println("=====started =====")
println
println("====Raw List=====")
println
val liststr = List( "Amazon-Jeff-America",
"Microsoft-BillGates-America",
"TCS-TATA-india",
"Reliance-Ambani-india")
liststr.foreach(println)
println
println("===filter list==")
println
val fillist = liststr.filter( x => x.contains("india"))
.flatMap( x => x.split("-"))
.map( x => x.replace("india","local"))
.map( x => x.toLowerCase() )
fillist.foreach(println)
}
}
val liststr=
List(
"State->TamilNadu~City->Chennai",
"State->Karnataka~City->Bangalore",
"State->Telangana~City->Hyderabad"
)
liststr.foreach(println)
object obj {
def main(args:Array[String]):Unit={
println("=====started =====")
println
println("====Raw List=====")
println
val liststr=
List(
"State->TamilNadu~City->Chennai",
"State->Karnataka~City->Bangalore",
"State->Telangana~City->Hyderabad"
)
liststr.foreach(println)
println
val flatdata = liststr.flatMap( x => x.split("~"))
println("====flat data===")
println
flatdata.foreach(println)
val state = flatdata.filter( x =>x.contains("State"))
println
println("====state data===")
println
state.foreach(println)
val city = flatdata.filter( x =>x.contains("City"))
println
println("====city data===")
println
city.foreach(println)
val finallist = state.map( x => x.replace("State->", ""))
println
println("====finallist data===")
println
finallist.foreach(println)
val finallistcity = city.map( x => x.replace("City->", ""))
println
println("====finallist City data===")
println
finallistcity.foreach(println)
}
}
*********************
*******************
val liststr = List(
"BigData-Spark-Hive",
"Spark-Hadoop-Hive",
"Sqoop-Hive-Spark",
"Sqoop-BD-Hive"
)
********************
19-02-2023
package pack
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
object obj {
def main(args:Array[String]):Unit={
println("===Started==")
val conf = new SparkConf().setMaster("local[*]").setAppName("first")
val sc = new SparkContext(conf)
sc.setLogLevel("ERROR")
val data = sc.textFile("file:///D:/data/st.txt")
println("====raw data====")
data.foreach(println)
val flatdata = data.flatMap( x => x.split("~"))
println
println("====flat data====")
flatdata.foreach(println)
val state = flatdata.filter( x => x.contains("State"))
println
println("====state data====")
state.foreach(println)
val city = flatdata.filter( x => x.contains("City"))
println
println("====city data====")
city.foreach(println)
val finalstate = state.map( x => x.replace("State->" , "" ))
println
println("====finalstate data====")
finalstate.foreach(println)
val finalcity = city.map( x => x.replace("City->" , "" ))
println
println("====finalcity data====")
finalcity.foreach(println)
}
}
*****************
Task 1 ----
Process usdata.csv
Read this File
Iterate each row and filter length>200 -- one line
Flatten the results on comma (,)
Remove hyphens from the flattened data
Concatenate ",zeyo" to every line
Print it
Task 2 ----
Read datatxns.txt
Filter rows with Vaulting
Concat ",zeyo"
print it
**************
20-02-2023
Task Solution
Task 1 ----
Solution
package pack
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
object obj {
def main(args:Array[String]):Unit={
println("===Started==")
val conf = new SparkConf().setMaster("local[*]").setAppName("first")
val sc = new SparkContext(conf)
sc.setLogLevel("ERROR")
val data = sc.textFile("file:///C:/data/usdata.csv")
println("====raw data====")
data.take(5).foreach(println)
val lendata = data.filter( x => x.length() > 200)
println("====length data====")
lendata.foreach(println)
val flatdata = lendata.flatMap( x => x.split(","))
println("====flatdata data====")
flatdata.foreach(println)
println
val replacedata = flatdata.map( x => x.replace("-", ""))
println("====replacedata data====")
replacedata.foreach(println)
println
val concatdata = replacedata.map( x => x+",zeyo")
println("====concatdata data====")
concatdata.foreach(println)
}
}
Task 2 ----
package pack
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
object obj {
def main(args:Array[String]):Unit={
println("===Started==")
val conf = new SparkConf().setMaster("local[*]").setAppName("first")
val sc = new SparkContext(conf)
sc.setLogLevel("ERROR")
val data = sc.textFile("file:///C:/data/datatxns.txt")
println("====raw data====")
data.foreach(println)
val fildata = data.filter( x => x.contains("Vaulting"))
println("====Fil data====")
fildata.foreach(println)
val mapdata = fildata.map( x => x+",zeyo")
println("====mapdata ====")
mapdata.foreach(println)
}
}
****************
25-02-2023
package pack
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql._
import org.apache.spark.sql.types._
object obj {
def main(args:Array[String]):Unit={
println("===started===")
val conf = new SparkConf().setMaster("local[*]").setAppName("first")
.set("spark.driver.allowMultipleContexts", "true")
*********************
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql._
import org.apache.spark.sql.types._
import org.apache.spark.sql.SparkSession
object obj {
// spark must be a SparkSession already in scope; put this import right after building it
import spark.implicits._
}
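What the import buys you, shown in a minimal sketch (assuming spark is the SparkSession built as in the full example below): the implicit encoders that let a local collection become a DataFrame.
import spark.implicits._
// toDF on a local Seq is available only after importing spark.implicits._
val sample = Seq((1,"zeyo"),(2,"sai")).toDF("id","name")
sample.show()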
************************
package pack
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql._
import org.apache.spark.sql.types._
import org.apache.spark.sql.SparkSession
object obj {
def main(args:Array[String]):Unit={
println("===started===")
val conf = new SparkConf().setMaster("local[*]").setAppName("first")
.set("spark.driver.allowMultipleContexts", "true")
val sc = new SparkContext(conf)
sc.setLogLevel("ERROR")
val spark = SparkSession
.builder()
.getOrCreate()
import spark.implicits._
// df is not defined in the original notes; a small placeholder DataFrame so the example runs
val df = Seq((1,"zeyo"),(2,"sai")).toDF("id","name")
df.show
}
}
************************
Task 1 -----
package pack
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql._
import org.apache.spark.sql.types._
import org.apache.spark.sql.SparkSession
object obj {
def main(args:Array[String]):Unit={
println("===started===")
val conf = new SparkConf().setMaster("local[*]").setAppName("first")
.set("spark.driver.allowMultipleContexts", "true")
val sc = new SparkContext(conf)
sc.setLogLevel("ERROR")
val spark = SparkSession
.builder()
.getOrCreate()
import spark.implicits._
val structschema = StructType(Array(
StructField("id",StringType),
StructField("category",StringType),
StructField("product",StringType),
StructField("mode", StringType)
))
val data = sc.textFile("file:///C:/data/datatxns.txt")
data.take(10).foreach(println)
val mapsplit = data.map( x => x.split(","))
println(mapsplit.collect.toList.size)
val rowrdd = mapsplit.map( x => Row(x(0),x(1),x(2),x(3)))
val filterdata = rowrdd.filter (x => x(2).toString().contains("Gymnastics"))
filterdata.foreach(println)
val df = spark.createDataFrame(filterdata, structschema)
df.show
}
}
26-02-2023
package pack
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql._
import org.apache.spark.sql.types._
import org.apache.spark.sql.SparkSession
object obj {
def main(args:Array[String]):Unit={
System.setProperty("hadoop.home.dir", "C:\\hadoop")
println("===started===")
val conf = new SparkConf().setMaster("local[*]").setAppName("first")
.set("spark.driver.allowMultipleContexts", "true")
val sc = new SparkContext(conf)
sc.setLogLevel("ERROR")
val spark = SparkSession.builder.getOrCreate()
import spark.implicits._
val df = spark
.read
.format("csv")
.option("header", true)
.load("file:///C:/data/usdata.csv")
df.show()
df.createOrReplaceTempView("cdf")
val procdf = spark.sql("select * from cdf where state='LA'")
}
}
**************
Click below link to download the Jar
https://repo1.maven.org/maven2/org/apache/spark/spark-avro_2.11/2.4.7/spark-avro_2.11-2.4.7.jar
******************
package pack
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql._
import org.apache.spark.sql.types._
import org.apache.spark.sql.SparkSession
object obj {
def main(args:Array[String]):Unit={
System.setProperty("hadoop.home.dir", "C:\\hadoop")
println("===started===")
val conf = new SparkConf().setMaster("local[*]").setAppName("first")
.set("spark.driver.allowMultipleContexts", "true")
val sc = new SparkContext(conf)
sc.setLogLevel("ERROR")
val spark = SparkSession.builder.getOrCreate()
import spark.implicits._
val csvdf = spark.read.format("csv").option("header",true)
.load("file:///C:/data/usdata.csv")
csvdf.show()
csvdf.createOrReplaceTempView("cdf")
val procdf = spark.sql(" select * from cdf where state='LA' ")
procdf.show()
procdf.write.format("json").mode("overwrite").save("file:///C:/data/pr")
}
}
*********************
Task 1 --- Sql Practise
val df = spark.read.option("header","true").csv("file:///C:/data/df.csv")
val df1 = spark.read.option("header","true").csv("file:///C:/data/df1.csv")
df.show()
df1.show()
df.createOrReplaceTempView("df")
df1.createOrReplaceTempView("df1")
spark.sql("select * from df order by id").show()
spark.sql("select * from df1 order by id").show()
====================================
Select two columns
====================================
spark.sql("select id,tdate from df order by id").show()
====================================
Select column with category filter = Exercise
===================================
spark.sql("select id,tdate,category from df where category='Exercise' order by id").show()
====================================
Multi Column filter
====================================
spark.sql("select id,tdate,category,spendby from df where category='Exercise' and spendby='cash'
").show()
====================================
Multi Value Filter
====================================
spark.sql("select * from df where category in ('Exercise','Gymnastics')").show()
04-03-2023
package pack
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql._
import org.apache.spark.sql.types._
import org.apache.spark.sql.SparkSession
object obj {
def main(args:Array[String]):Unit={
System.setProperty("hadoop.home.dir", "C:\\hadoop")
println("===started===")
val conf = new SparkConf().setMaster("local[*]").setAppName("first")
.set("spark.driver.allowMultipleContexts", "true")
val sc = new SparkContext(conf)
sc.setLogLevel("ERROR")
val spark = SparkSession.builder.getOrCreate()
import spark.implicits._
val sqldf = spark.read
.format("jdbc")
.option("url","jdbc:mysql://zeyodb.cz7qhc39aphp.ap-south-
1.rds.amazonaws.com:3306/zeyodb")
.option("driver","com.mysql.jdbc.Driver")
.option("user","root")
.option("password","Aditya908")
.option("dbtable","cashdata")
.load()
sqldf.show()
}
}
********************
spark-shell --packages mysql:mysql-connector-java:5.1.21
sqldf.show()
**********************
Task 1 -- XML read
book.xml
xml jar
rowtag---book
Task 2 -- Test partitionBy (may not work if you have a write issue) -- that's fine, leave it
transactions.xml
with RowTag -- POSLog
and print the schema ---- df.printSchema() ---- Don't panic
************************
Code
package pack
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql._
import org.apache.spark.sql.types._
import org.apache.spark.sql.SparkSession
object obj {
def main(args:Array[String]):Unit={
System.setProperty("hadoop.home.dir", "C:\\hadoop")
println("===started===")
val conf = new SparkConf().setMaster("local[*]").setAppName("first")
.set("spark.driver.allowMultipleContexts", "true")
val sc = new SparkContext(conf)
sc.setLogLevel("ERROR")
val spark = SparkSession.builder.getOrCreate()
import spark.implicits._
val df = spark.read
.format("xml")
.option("rowtag","book")
.load("file:///C:/data/book.xml")
df.show()
val countryschema = StructType(Array(
StructField("id",StringType,true),
StructField("name",StringType,true),
StructField("check",StringType,true),
StructField("country", StringType, true)
))
val df1 = spark.read
.format("csv")
.schema(countryschema)
.load("file:///C:/data/allcountry1.csv")
df1.show()
df1.write.format("csv").partitionBy("country","check").save("file:///C:/data/rcdata2")
}
}
***************************
05-03-2023
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql._
import org.apache.spark.sql.types._
import org.apache.spark.sql.SparkSession
object obj {
def main(args:Array[String]):Unit={
System.setProperty("hadoop.home.dir", "C:\\hadoop")
println("===started===")
val conf = new SparkConf().setMaster("local[*]").setAppName("first")
.set("spark.driver.allowMultipleContexts", "true")
val sc = new SparkContext(conf)
sc.setLogLevel("ERROR")
val spark = SparkSession
.builder
.config("fs.s3a.access.key","AKIASVGFNNNDOTIRMUEP")
.config("fs.s3a.secret.key","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")
.getOrCreate()
import spark.implicits._
val countryschema = StructType(Array(
StructField("txnno",StringType,true),
StructField("txndate",StringType,true),
StructField("custno",StringType,true),
StructField("amount", StringType, true),
StructField("category",StringType,true),
StructField("product",StringType,true),
StructField("city",StringType,true),
StructField("state",StringType,true),
StructField("spendby", StringType, true)
))
val df = spark.read.format("csv")
.schema(countryschema)
.load("s3a://zeyoathenabucket/txns10k.txt")
df.show()
}
}
**************************************
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql._
import org.apache.spark.sql.types._
import org.apache.spark.sql.SparkSession
object obj {
def main(args:Array[String]):Unit={
System.setProperty("hadoop.home.dir", "C:\\hadoop")
println("===started===")
val conf = new SparkConf().setMaster("local[*]").setAppName("first")
.set("spark.driver.allowMultipleContexts", "true")
val sc = new SparkContext(conf)
sc.setLogLevel("ERROR")
val spark = SparkSession
.builder
.getOrCreate()
import spark.implicits._
val df = spark.read.format("csv").option("header","true").load("file:///C:/
data/usdata.csv")
df.write.format("json").mode("error").save("file:///C:/data/jsondataus")
println("===== data written ======")
}
}
**********************
Task sql
====================================
Like Filter
====================================
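No query was captured under this heading; a sketch following the pattern of the other queries in this set (df is the temp view registered above):
spark.sql("select * from df where product like '%Gymnastics%'").show()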
====================================
Not Filters
====================================
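No query was captured here either; a sketch of negative filters on the same temp view:
spark.sql("select * from df where category != 'Exercise'").show()
spark.sql("select * from df where category not in ('Exercise','Gymnastics')").show()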
====================================
Null Filters
====================================
spark.sql("select * from df where product is null").show()
====================================
Max Function
====================================
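A sketch mirroring the min query shown below:
spark.sql("select max(id) from df").show()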
====================================
Min Funtion
====================================
spark.sql("select min(id) from df ").show()
Task 1 ----
Read dt.txt
select only id,tdate,category from raw dataframe
On top of this dataframe create a tempview
Using spark.sql add an extra column 1 as status
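A hedged sketch of that last step, assuming the temp view created above is named tview (the notes do not give the view name):
// tview is a hypothetical view name; replace it with whatever the tempview was registered as
val statusdf = spark.sql("select id, tdate, category, 1 as status from tview")
statusdf.show()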
Task 3 ----
05-03-2023
https://www.youtube.com/watch?v=LQVDJtfpQU0&list=PLS1QulWo1RIagob5D6kMIAvu7DQC5VTh3
drop a column
Filter category='Gymnastics'
Product is null
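A minimal DataFrame-API sketch of those three operations, assuming df is the dt.txt DataFrame read as in the session below (the dropped column is only an example):
import org.apache.spark.sql.functions.col
val dropped = df.drop("spendby")                          // drop a column
val gymdf = df.filter(col("category") === "Gymnastics")   // filter category='Gymnastics'
val nullproddf = df.filter(col("product").isNull)         // product is null
dropped.show()
gymdf.show()
nullproddf.show()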
11-03-2023
package pack
import org.apache.spark._
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.Row
import org.apache.spark.sql.types._
import org.apache.spark.sql.functions._
object obj {
def main(args:Array[String]):Unit={
System.setProperty("hadoop.home.dir", "C:\\hadoop")
println("================Started1============")
val conf = new SparkConf().setAppName("revision").setMaster("local[*]")
val sc = new SparkContext(conf)
sc.setLogLevel("ERROR")
val spark = SparkSession.builder().getOrCreate()
import spark.implicits._
val df = spark
.read
.format("csv")
.option("header","true")
.load("file:///C:/data/dt.txt")
df.show()
println("==========Filter Gymnastics===========")
val filgym = df.filter( col("category")==="Gymnastics")
filgym.show()
println("==========Filter cat Gymnastics spend cash===========")
val mulcolfilter=df.filter(col("category")==="Gymnastics" &&
col("spendby")==="cash")
mulcolfilter.show()
println("==========ccategory = gymnastics,Exercise===========")
val mulvalfilter=df.filter(col("category") isin ("Gymnastics","Exercise"))
mulvalfilter.show()
println("==========Product gymnastics===========")
val likeop = df.filter(col("product") like "%Gymnastics%")
likeop.show()
println("==========Product is null===========")
println
val nullprod = df.filter(col("product") isNull )
nullprod.show()
println("==========Product is Not null===========")
val nullNOTprod = df.filter(!(col("product") isNull ))
nullNOTprod.show()
}
}
12-03-2023
package pack
import org.apache.spark._
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.Row
import org.apache.spark.sql.types._
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.functions._
import org.apache.spark.sql.functions.udf
object obj {
def main(args:Array[String]):Unit={
System.setProperty("hadoop.home.dir", "C:\\hadoop")
println("================Started1============")
val conf = new SparkConf().setAppName("revision").setMaster("local[*]")
val sc = new SparkContext(conf)
sc.setLogLevel("ERROR")
val spark = SparkSession.builder().getOrCreate()
import spark.implicits._
val df = spark
.read
.format("csv")
.option("header","true")
.load("file:///C:/data/dt.txt")
df.show()
}
}
package pack
import org.apache.spark._
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.Row
import org.apache.spark.sql.types._
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.functions._
import org.apache.spark.sql.functions.udf
object obj {
case class schema(
txnno:String,
txndate:String,
custno:String,
amount:String,
category:String,
product:String,
city:String,
state:String,
spendby:String)
def main(args:Array[String]):Unit={
System.setProperty("hadoop.home.dir", "C:\\hadoop")
val colist = List("txnno",
"txndate",
"custno",
"amount",
"category",
"product",
"city",
"state",
"spendby")
println("================Started1============")
val conf = new SparkConf().setAppName("revision").setMaster("local[*]")
val sc = new SparkContext(conf)
sc.setLogLevel("ERROR")
val spark = SparkSession.builder().getOrCreate()
import spark.implicits._
val data = sc.textFile("file:///C:/data/revdata/file1.txt")
data.take(5).foreach(println)
val gymdata= data.filter( x => x.contains("Gymnastics"))
println
println("===Gymdata===")
println
gymdata.take(5).foreach(println)
val mapsplit = gymdata.map( x => x.split(","))
val schemardd = mapsplit.map(x =>
schema(x(0),x(1),x(2),x(3),x(4),x(5),x(6),x(7),x(8)))
val prodfilter = schemardd.filter( x => x.product.contains("Gymnastics"))
println
println("===Gymdata prod===")
println
prodfilter.take(5).foreach(println)
println
println("===schema rdd to dataframe===")
println
val schemadf = prodfilter.toDF().select(colist.map(col): _*)
schemadf.show(5)
val file2 = sc.textFile("file:///C:/data/revdata/file2.txt")
val mapsplit1=file2.map( x => x.split(","))
val rowrdd = mapsplit1.map( x => Row(x(0),x(1),x(2),x(3),x(4),x(5),x(6),x(7),x(8)))
println
println("===row rdd===")
println
println
rowrdd.take(5).foreach(println)
val structschema = StructType(Array(
StructField("txnno",StringType,true),
StructField("txndate",StringType,true),
StructField("custno",StringType,true),
StructField("amount", StringType, true),
StructField("category", StringType, true),
StructField("product", StringType, true),
StructField("city", StringType, true),
StructField("state", StringType, true),
StructField("spendby", StringType, true)
))
println
println("===row df===")
println
println
val rowdf = spark.createDataFrame(rowrdd, structschema).select(colist.map(col): _*)
rowdf.show(5)
val csvdf = spark.read.format("csv").option("header","true")
.load("file:///C:/data/revdata/file3.txt").select(colist.map(col): _*)
println
println("===csv df===")
println
csvdf.show(5)
/* The following references DataFrames (jsondf, parquetdf, xmldf, uniondf, procdf, tdf) that are never defined above; the stray comment terminator in the notes suggests this tail of the revision was commented out, so it is kept as a comment:
jsondf.show(5)
parquetdf.show(5)
println
println("===xmldf===")
println
xmldf.show(5)
println
println("===uniondf===")
println
uniondf.show(5)
procdf.show(5)
procdf.write.mode("append").partitionBy("category").save("file:///C:/data/finalpdata")
println("===revision complete==")
val df = spark
.read
.format("csv")
.option("header","true")
.load("file:///C:/data/dt.txt")
df.show()
tdf.show()*/
}
}
package pack
import org.apache.spark._
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.Row
import org.apache.spark.sql.types._
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.functions._
import org.apache.spark.sql.functions.udf
object obj {
def main(args:Array[String]):Unit={
System.setProperty("hadoop.home.dir", "C:\\hadoop")
val colist = List("txnno",
"txndate",
"custno",
"amount",
"category",
"product",
"city",
"state",
"spendby")
println("================Started1============")
val conf = new SparkConf().setAppName("revision").setMaster("local[*]")
val sc = new SparkContext(conf)
sc.setLogLevel("ERROR")
val spark = SparkSession.builder().getOrCreate()
import spark.implicits._
/* As above, this block references DataFrames (jsondf, xmldf, uniondf, procdf, tdf) that are never defined in it; kept commented out:
jsondf.show(5)
println
println("===xmldf===")
println
xmldf.show(5)
println
println("===uniondf===")
println
uniondf.show(5)
println
println("===procdf===")
println
procdf.show(5)
procdf.write.mode("append").partitionBy("category").save("file:///C:/data/finalpdata")
println("===revision complete==")
val df = spark
.read
.format("csv")
.option("header","true")
.load("file:///C:/data/dt.txt")
df.show()
tdf.show()*/
}
}
18-03-2023
package pack
import org.apache.spark._
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.Row
import org.apache.spark.sql.types._
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.functions._
import org.apache.spark.sql.functions.udf
object obj {
def main(args:Array[String]):Unit={
System.setProperty("hadoop.home.dir", "C:\\hadoop")
println("================Started1============")
val conf = new SparkConf().setAppName("revision").setMaster("local[*]")
val sc = new SparkContext(conf)
sc.setLogLevel("ERROR")
val spark = SparkSession.builder().getOrCreate()
import spark.implicits._
val df = spark
.read
.format("csv")
.option("header","true")
.load("file:///C:/data/aggdata.csv")
df.show()
val aggdf = df.groupBy("name")
.agg(
sum("amt")
.cast(IntegerType)
.as("total")
)
.orderBy("name")
aggdf.show()
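The same groupBy can carry several aggregations at once; a small sketch over the same aggdata.csv DataFrame (count and max here are additions, not part of the original session):
val multiagg = df.groupBy("name")
.agg(
sum("amt").cast(IntegerType).as("total"),
count("amt").as("txn_count"),
max(col("amt").cast(IntegerType)).as("max_amt")
)
.orderBy("name")
multiagg.show()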
/* Commented out in the notes (df is already defined above and tdf is never defined):
val df = spark
.read
.format("csv")
.option("header","true")
.load("file:///C:/data/dt.txt")
df.show()
tdf.show()*/
}
}
package pack
import org.apache.spark._
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.Row
import org.apache.spark.sql.types._
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.functions._
import org.apache.spark.sql.functions.udf
object obj {
def main(args:Array[String]):Unit={
System.setProperty("hadoop.home.dir", "C:\\hadoop")
println("================Started1============")
val conf = new SparkConf().setAppName("revision").setMaster("local[*]")
val sc = new SparkContext(conf)
sc.setLogLevel("ERROR")
val spark = SparkSession.builder().getOrCreate()
import spark.implicits._
val df1 = spark.read.format("csv")
.option("header","true")
.load("file:///C:/data/join1.csv")
df1.show()
val df2 = spark.read.format("csv")
.option("header","true")
.load("file:///C:/data/join2.csv")
df2.show()
println
println("=====Inner Join=====")
println
val injoindf = df1.join( df2 , Seq("txnno"), "inner")
injoindf.show()
println
println("=====left Join=====")
println
val leftjoin = df1.join( df2 , Seq("txnno"), "left")
leftjoin.show()
println
println("=====right Join=====")
println
val rightjoin = df1.join( df2 , Seq("txnno"), "right")
rightjoin.show()
println
println("=====Full Join=====")
println
val fulljoin = df1.join(df2 , Seq("txnno"),"full")
.orderBy("txnno")
fulljoin.show()
}
}
If the join columns are named differently in the two DataFrames:
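The notes do not show how joindf is built in this case; a minimal sketch, assuming df2 carries the transaction number under a hypothetical column name tno instead of txnno:
// tno is a hypothetical column name used only for illustration
val joindf = df1.join(df2, df1("txnno") === df2("tno"), "inner")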
joindf.show()
19-03-2023
package pack
import org.apache.spark._
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.Row
import org.apache.spark.sql.types._
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.functions._
import org.apache.spark.sql.functions.udf
object obj {
def main(args:Array[String]):Unit={
System.setProperty("hadoop.home.dir", "C:\\hadoop")
println("================Started1============")
val conf = new SparkConf().setAppName("revision").setMaster("local[*]")
val sc = new SparkContext(conf)
sc.setLogLevel("ERROR")
val spark = SparkSession.builder().getOrCreate()
import spark.implicits._
}
}
25-03-2023
package pack
import org.apache.spark._
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.Row
import org.apache.spark.sql.types._
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.functions._
import org.apache.spark.sql.functions.udf
object obj {
def main(args:Array[String]):Unit={
System.setProperty("hadoop.home.dir", "C:\\hadoop")
println("================Started1============")
val conf = new SparkConf().setAppName("revision").setMaster("local[*]")
val sc = new SparkContext(conf)
sc.setLogLevel("ERROR")
val spark = SparkSession.builder().getOrCreate()
import spark.implicits._
val df = spark
.read
.format("json")
.option("multiline","true")
.load("file:///C:/data/jv.json")
df.show()
df.printSchema()
val flattendf = df.select(
"address.permanentAddress",
"address.temporaryAddress",
"org",
"trainer",
"workAddress",
"years"
)
flattendf.show()
flattendf.printSchema()
val flattendf1 = df.withColumn("permanentAddress",
expr("address.permanentAddress"))
.withColumn("temporaryAddress",expr("address.temporaryAddress"))
.drop("address")
flattendf1.show()
flattendf1.printSchema()
}
}
package pack
import org.apache.spark._
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.Row
import org.apache.spark.sql.types._
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.functions._
import org.apache.spark.sql.functions.udf
object obj {
def main(args:Array[String]):Unit={
System.setProperty("hadoop.home.dir", "C:\\hadoop")
println("================Started1============")
val conf = new SparkConf().setAppName("revision").setMaster("local[*]")
val sc = new SparkContext(conf)
sc.setLogLevel("ERROR")
val spark = SparkSession.builder().getOrCreate()
import spark.implicits._
val df = spark
.read
.format("json")
.option("multiline","true")
.load("file:///C:/complexjson/donut1.json")
df.show()
df.printSchema()
val flattendf = df.select(
"id",
"image.height",
"image.url",
"image.width",
"name",
"type"
)
flattendf.show()
flattendf.printSchema()
// finalcomplexdf (rebuilding the nested structure) is not built in this block; the 02-04-2023 session below shows that step
// finalcomplexdf.show()
// finalcomplexdf.printSchema()
}
}
26-03-2023
complexdf.show()
complexdf.printSchema()
package pack
import org.apache.spark._
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.Row
import org.apache.spark.sql.types._
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.functions._
import org.apache.spark.sql.functions.udf
object obj {
def main(args:Array[String]):Unit={
System.setProperty("hadoop.home.dir", "C:\\hadoop")
println("================Started1============")
val conf = new SparkConf().setAppName("revision").setMaster("local[*]")
val sc = new SparkContext(conf)
sc.setLogLevel("ERROR")
val spark = SparkSession.builder().getOrCreate()
import spark.implicits._
val df = spark
.read
.format("json")
.option("multiline","true")
.load("file:///C:/data/complexjson/randomeuser5.json")
df.show()
df.printSchema()
val flat1 = df.withColumn("results", expr("explode(results)"))
flat1.show
flat1.printSchema()
val finalflatten= flat1.select(
"nationality",
"results.user.BSN",
"results.user.cell",
"results.user.dob",
"results.user.email",
"results.user.gender",
"results.user.location.city",
"results.user.location.state",
"results.user.location.street",
"results.user.location.zip",
"results.user.md5",
"results.user.name.first",
"results.user.name.last",
"results.user.name.title",
"results.user.password",
"results.user.phone",
"results.user.picture.large",
"results.user.picture.medium",
"results.user.picture.thumbnail",
"results.user.registered",
"results.user.salt",
"results.user.sha1",
"results.user.sha256",
"results.user.username",
"seed",
"version"
)
finalflatten.show()
finalflatten.printSchema()
}
}
package pack
import org.apache.spark._
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.Row
import org.apache.spark.sql.types._
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.functions._
import org.apache.spark.sql.functions.udf
object obj {
def main(args:Array[String]):Unit={
System.setProperty("hadoop.home.dir", "C:\\hadoop")
println("================Started1============")
val conf = new SparkConf().setAppName("revision").setMaster("local[*]")
val sc = new SparkContext(conf)
sc.setLogLevel("ERROR")
val spark = SparkSession.builder().getOrCreate()
import spark.implicits._
val df = spark
.read
.format("json")
.option("multiline","true")
.load("file:///C:/data/jv.json")
df.show()
df.printSchema()
val flat1 = df.withColumn("Students", expr("explode(Students)"))
flat1.show
flat1.printSchema()
}
}
Task --- URL data Read - dataframe
URL Code
package pack
import org.apache.spark._
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.Row
import org.apache.spark.sql.types._
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.functions._
import org.apache.spark.sql.functions.udf
import scala.io.Source
object obj {
def main(args:Array[String]):Unit={
System.setProperty("hadoop.home.dir", "C:\\hadoop")
println("================Started1============")
val conf = new SparkConf().setAppName("revision").setMaster("local[*]")
val sc = new SparkContext(conf)
sc.setLogLevel("ERROR")
val spark = SparkSession.builder().getOrCreate()
import spark.implicits._
// rdd was not defined at this point in the notes; rebuild it from the API response (same pattern as the 16-04 session further below)
val html = Source.fromURL("https://randomuser.me/api/0.8/?results=500")
val s = html.mkString
val rdd = sc.parallelize(List(s))
val df = spark.read.json(rdd)
df.show()
df.printSchema()
val flat1 = df.withColumn("results", expr("explode(results)"))
flat1.show
flat1.printSchema()
val finalflatten= flat1.select(
"nationality",
"results.user.cell",
"results.user.dob",
"results.user.email",
"results.user.gender",
"results.user.location.city",
"results.user.location.state",
"results.user.location.street",
"results.user.location.zip",
"results.user.md5",
"results.user.name.first",
"results.user.name.last",
"results.user.name.title",
"results.user.password",
"results.user.phone",
"results.user.picture.large",
"results.user.picture.medium",
"results.user.picture.thumbnail",
"results.user.registered",
"results.user.salt",
"results.user.sha1",
"results.user.sha256",
"results.user.username",
"seed",
"version"
)
finalflatten.show()
finalflatten.printSchema()
}
}
01-04-2023
02-04-2023
package pack
import org.apache.spark._
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.Row
import org.apache.spark.sql.types._
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.functions._
import org.apache.spark.sql.functions.udf
import scala.io.Source
import org.apache.spark.sql._
object obj {
def main(args:Array[String]):Unit={
System.setProperty("hadoop.home.dir", "C:\\hadoop")
println("================Started1============")
val conf = new SparkConf().setAppName("revision").setMaster("local[*]")
val sc = new SparkContext(conf)
sc.setLogLevel("ERROR")
val spark = SparkSession.builder().getOrCreate()
import spark.implicits._
val df = spark.read
.format("json")
.option("multiline","true")
.load("file:///C:/complexjson/actorsj1.json")
df.show()
df.printSchema()
val flattendf = df.withColumn("Actors",expr("explode(Actors)"))
flattendf.show()
flattendf.printSchema()
val finalflatten = flattendf.select(
"Actors.fields.*",
"country",
"version"
)
finalflatten.show()
finalflatten.printSchema()
val complexdf= finalflatten.groupBy("country","version")
.agg(
collect_list(
struct(
struct(
col("Birthdate"),
col("`Born At`"),
col("age"),
col("hasChildren"),
col("hasGreyHair"),
col("name"),
col("photo"),
col("weight"),
col("wife")
).as("fields")
)
).as("Actors")
)
.select("Actors","country","version")
complexdf.show()
complexdf.printSchema()
}
}
package pack
import org.apache.spark._
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.Row
import org.apache.spark.sql.types._
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.functions._
import org.apache.spark.sql.functions.udf
import scala.io.Source
import org.apache.spark.sql._
object obj {
def main(args:Array[String]):Unit={
System.setProperty("hadoop.home.dir", "C:\\hadoop")
println("================Started1============")
val conf = new SparkConf().setAppName("revision").setMaster("local[*]")
val sc = new SparkContext(conf)
sc.setLogLevel("ERROR")
val spark = SparkSession.builder().getOrCreate()
import spark.implicits._
val df = spark.read
.format("json")
.option("multiline","true")
.load("file:///C:/data/jv.json")
df.show()
df.printSchema()
val flattendf = df.withColumn("Students", expr("explode(Students)"))
flattendf.show()
flattendf.printSchema()
"Students.user.*",
"org",
"trainer",
"years"
)
flattendf1.show()
flattendf1.printSchema()
val finaldf = flattendf1.withColumn("pets",expr("explode(pets)"))
finaldf.show()
finaldf.printSchema
val complexdf1 = finaldf.groupBy("location", "name" ,"org" , "trainer", "years")
.agg(
collect_list(col("pets")).as("pets")
)
complexdf1.show()
complexdf1.printSchema()
val finalcomplex = complexdf1.groupBy("org","trainer","years")
.agg(
collect_list(
struct(
struct(
col("location"),
col("name"),
col("pets")
).as("user")
)
).as("Students")
)
.select("Students","org","trainer","years")
finalcomplex.show()
finalcomplex.printSchema()
}
}
When you come to the review -- put your current status at the top
Rephrase the sentences from the samples
Bold wherever required
Put only the points you really feel comfortable with
Definitely mention AWS S3 and EMR
Do not mention versions for any of the tools
Put your experience in descending order (most recent experience at the top)
Mention more points on Spark (from the samples, or from your own knowledge)
Better to use the Cambria font
Always mention more points
IT folks
ETL
Testing
Support
BI
Analyst
More than 12 years ----- at the project level -- 4 years relevant
6-10 ----- at the project level --- 3 years relevant
3-6 ----- at the project level --- 3 years relevant
2 years ----- at the project level --- 2 years relevant
1.9 ----- at the project level --- almost 2 years relevant
The remainder is your own experience
Non IT Folks
https://mindmajix.com/etl-testing-sample-resumes
Gap Folks
Cyphel Infosolutions
www.Cyphelinfo.com
Prosaic technologies
www.prosaica.com
Client Name--
Bluedart
bharat matrimony
Hershey's
cars24
Indian Calls
Overseas
Install AWSCLI
Download and install GIT BASH
aws configure
aws s3 ls
aws s3 mb s3://<UNIQUE>zeyo35/
aws s3 ls s3://
echo zeyobron>zeyofile
aws s3 cp zeyofile s3://<UNIQUE>zeyo35/
aws s3 ls s3://<UNIQUE>zeyo35/
aws s3 rm s3://<UNIQUE>zeyo35/zeyofile
aws s3 rb s3://<UNIQUE>zeyo35
Windows CMD
aws s3 mb s3://<UNIQUE>zeyo35/
aws s3 ls s3://
notepad.exe zeyofile ---(type zeyobron after file opens)
aws s3 cp zeyofile s3://<UNIQUE>zeyo35/
aws s3 ls s3://<UNIQUE>zeyo35/
aws s3 rm s3://<UNIQUE>zeyo35/zeyofile
aws s3 rb s3://<UNIQUE>zeyo35
**********************
+919492326052 -- Bhaskar
PF Old Dated
====================================
09-04-2023
======================
Cloudera staging Exports
======================
mysql -uroot -pcloudera
create database if not exists exp;
use exp;
drop table if exists ttab;
drop table if exists st_ttab;
create table ttab(id int,name varchar(100),amount int);
create table st_ttab(id int,name varchar(100),amount int);
quit
cd
echo 1,zeyo,40>zfile
echo 2,ravi,70>>zfile
echo 3,rani,70>>zfile
hadoop fs -mkdir /user/cloudera/exdir
hadoop fs -put zfile /user/cloudera/exdir
sqoop export --connect jdbc:mysql://localhost/exp --username root --password cloudera --table ttab --staging-table st_ttab --m 1 --export-dir /user/cloudera/exdir
======================
Lab staging Exports
======================
cd
echo 1,zeyo,40>zfile
echo 2,ravi,70>>zfile
echo 3,rani,70>>zfile
hadoop fs -mkdir /user/itv005669/exdir
hadoop fs -put zfile /user/itv005669/exdir
sqoop export --connect jdbc:mysql://zeyodb.cveqgaujeiwd.ap-south-1.rds.amazonaws.com/itv005669 --username root --password Aditya908 --table ttab --staging-table st_ttab --m 1 --export-dir /user/itv005669/exdir
=============
AVRO Task Cloudera
=============
=============
AVRO Task Lab
=============
====================
Optional Task
====================
Cloudera
====================
Optional Task
====================
Lab
Task Video
https://youtu.be/E0lhq0W_z7o
Task Solution
======================
Cloudera staging Exports
======================
cd
echo 1,zeyo,40>zfile
echo 2,ravi,70>>zfile
echo 3,rani,70>>zfile
hadoop fs -mkdir /user/cloudera/exdir
hadoop fs -put zfile /user/cloudera/exdir
sqoop export --connect jdbc:mysql://localhost/exp --username root --password cloudera --table ttab --staging-table st_ttab --m 1 --export-dir /user/cloudera/exdir
======================
Lab staging Exports
======================
cd
echo 1,zeyo,40>zfile
echo 2,ravi,70>>zfile
echo 3,rani,70>>zfile
hadoop fs -mkdir /user/itv005669/exdir
hadoop fs -put zfile /user/itv005669/exdir
=============
AVRO Task Cloudera
=============
=============
AVRO Task Lab
=============
====================
Optional Task
====================
Cloudera
====================
Optional Task
====================
Lab
5 Areas --
1 -- We launch our own clusters and develop the code. We either complete the development or terminate the cluster by end of day after copying the required data to S3.
2 -- In our project we automated the whole EMR production deployment with the EMR command runner, consisting of step executions.
5 -- Once we develop the code, we commit it to Git. Our DevOps team developed the CI/CD script: we open Jenkins and build the jar for the project.
cd
mkdir zfl
touch zfl/file1
touch zfl/file2
aws s3 sync zfl/ s3://zeyoss36/sssdir/
rm -rf zfl/*
aws s3 sync s3://zeyoss36/sssdir/ zfl/
15-04-2023
Steps ------>
====================
Code Development
====================
Requirement Gathering
Create my cluster
Complete Code Development
Take the Code
====================
Jar Development/Deployment
====================
Create a project in Eclipse/IntelliJ
Put the Code
Generate the jar
Copy that Jar to aws s3
====================
EMR Deployment
====================
Create a step execution to Run Jar with Spark Application
====================
Export Command Runner Script
====================
16-04-2023
============================
Step 1 -- Open Gitbash/Cmd/Lab
============================
aws configure
Accesskey --- AKIA2TITMOYY4IOOJGBL
Secretkey --- a8ZhgE/za3KZPD+r9Mj+XEx78sq2LNwUtO6gPKxh
RegionName --- ap-south-1
outputformat --- json
aws s3 ls
============================
Step 2 ---- replace URNAME with some Unique name
============================
=============
Step 3 ---- Replace your cluster id with the id below and wait for the terminating state
=============
==================
Validate the data -- you should see your folder name
==================
aws s3 ls s3://azeyodev/dest/availablecustomers/
aws s3 ls s3://azeyodev/dest/notavailablecustomers/
package pack
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql.Row
import org.apache.spark.sql._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import scala.io._
import org.json4s._
import java.io.InputStream
object obj {
def main(args:Array[String]):Unit={
val conf = new SparkConf().setAppName("ES").setMaster("local[*]")
val sc = new SparkContext(conf)
sc.setLogLevel("Error")
val spark = SparkSession.builder()
.getOrCreate()
import spark.implicits._
val configdf = spark
.read
.format("json")
.option("multiline","true")
.load("s3://azeyodev/config.json")
configdf.show()
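// The notes omit how src and notavail are taken out of configdf; a hedged sketch,
// assuming config.json holds string fields literally named "src" and "notavail":
val src = configdf.select("src").first().getString(0)
val notavail = configdf.select("notavail").first().getString(0)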
println(notavail)
val data = spark.read.format("avro")
.load(src)
data.show()
val html =
Source.fromURL("https://randomuser.me/api/0.8/?results=500")
val s = html.mkString
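// The notes stop here; a hedged continuation following the earlier URL-read pattern,
// turning the downloaded JSON string into a DataFrame (what the project does with it next is not captured):
val urlrdd = sc.parallelize(List(s))
val urldf = spark.read.json(urlrdd)
urldf.show()
}
}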