Big Data Practice

The document describes creating a MySQL database and table, inserting sample data, and using Sqoop to import the data into HDFS incrementally. Key steps include: 1. Creating a MySQL database 'zeyodb1' and table 'custjob1' with sample data. 2. Creating a Sqoop job to import from the MySQL table to HDFS directory '/user/cloudera/jobdir_dt' incrementally based on the 'tdate' column. 3. Adding more data to the MySQL table and re-running the Sqoop job to import only the new records.


31/12/2022

Cloudera Incremental Job

============

Create data in mysql

============

mysql -uroot -pcloudera

create database if not exists zeyodb1;

use zeyodb1;

drop table custjob1;

create table custjob1(id int,name varchar(100),tdate date);

insert into custjob1 values(1,'hema',now() - interval 7 day);

insert into custjob1 values(2,'ravi',now() - interval 6 day);

insert into custjob1 values(3,'raj',now() - interval 5 day);

insert into custjob1 values(4,'vishnu',now() - interval 4 day);

select * from custjob1;

quit

============

Create a password file and Sqoop Job

============

cd

echo -n cloudera>/home/cloudera/pfile
sqoop job --create incre_d -- import --connect jdbc:mysql://localhost/zeyodb1 --username root --password-file file:///home/cloudera/pfile --table custjob1 --m 1 --target-dir /user/cloudera/jobdir_dt --incremental append --check-column tdate --last-value 0000-00-00

sqoop job --exec incre_d

hadoop fs -ls /user/cloudera/jobdir_dt

hadoop fs -cat /user/cloudera/jobdir_dt/part-m-00000

============

Add data in mysql and validate

============

mysql -uroot -pcloudera

use zeyodb1;

insert into custjob1 values(5,'riyaz',now() - interval 3 day);

insert into custjob1 values(6,'venu',now() - interval 2 day);

select * from custjob1;

quit

======

Execute the Job again and validate data

======

sqoop job --exec incre_d

hadoop fs -ls /user/cloudera/jobdir_dt

hadoop fs -cat /user/cloudera/jobdir_dt/part-m-00001


01-01-2023

Cloud Import

Cloudera Folks

mysql -uroot -pcloudera

create database if not exists zdb;

use zdb;

drop table cust;

create table cust(id int,name varchar(100));

insert into cust value(1,'zeyo');

insert into cust value(2,'analytics');

select * from cust;

quit

sqoop import -Dfs.s3a.access.key=AKIAS3T7N3CTNLIPQZZM -Dfs.s3a.secret.key=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 -Dfs.s3a.endpoint=s3.ap-south-1.amazonaws.com --connect jdbc:mysql://localhost/zdb --username root --password cloudera --table cust --m 1 --target-dir s3a://srizeyo/saidir

To Validate the data

cd

rm -rf awscli-bundle.zip

rm -rf awscli-bundle

curl https://s3.amazonaws.com/aws-cli/awscli-bundle-1.16.188.zip -o awscli-bundle.zip


unzip awscli-bundle.zip

./awscli-bundle/install -i /home/cloudera/aws -b /home/cloudera/bin/aws

aws=/home/cloudera/bin/aws

cd

rm -rf .aws

mkdir .aws

cd .aws

wget https://srizeyo.s3.amazonaws.com/credentials

cd

aws s3 ls s3://srizeyo/

Task 1

Cloudera folks --

mysql -uroot -pcloudera

create database if not exists ad;

use ad;

drop table ttab;

create table ttab(id int,name varchar(100),amount int);

insert into ttab values(1,'zeyo',40);

insert into ttab values(2,'vasu',50);

insert into ttab values(3,'rani',70);

select * from ttab;

quit
Task 1 --

sqoop import --connect jdbc:mysql://localhost/ad --username root --password cloudera --table ttab
--m 1 --delete-target-dir --target-dir /user/cloudera/pdata --as-parquetfile

hadoop fs -ls /user/cloudera/pdata

hadoop fs -cat /user/cloudera/pdata/*

press Ctrl+C to stop the output (Parquet is a binary format, so the raw bytes are not readable)

Task 2 --

sqoop import --connect jdbc:mysql://localhost/ad --username root --password cloudera --table ttab
--m 1 --delete-target-dir --target-dir /user/cloudera/sdata --as-sequencefile

hadoop fs -ls /user/cloudera/sdata

hadoop fs -cat /user/cloudera/sdata/*

07-01-2023

================

Cloudera Multi mappers

================

mysql -uroot -pcloudera


drop database map;

create database map;

use map;

drop table mtab;

create table mtab(id int,name varchar(100),amount int);

insert into mtab values(1,'zeyo',40);

insert into mtab values(2,'vasu',50);

insert into mtab values(3,'rani',70);

insert into mtab values(4,'raji',40);

insert into mtab values(5,'viru',50);

insert into mtab values(6,'raj',70);

insert into mtab values(7,'vinu',40);

insert into mtab values(8,'ajit',50);

insert into mtab values(9,'raki',70);

insert into mtab values(10,'rinu',40);

insert into mtab values(11,'dini',50);

insert into mtab values(12,'div',70);

select * from mtab;

quit

====================

1 Mappers

====================

sqoop import --connect jdbc:mysql://localhost/map --username root --password cloudera --table mtab --m 1 --delete-target-dir --target-dir /user/cloudera/mtab

hadoop fs -ls /user/cloudera/mtab


hadoop fs -cat /user/cloudera/mtab/part-m-00000

====================

2 Mappers

====================

sqoop import --connect jdbc:mysql://localhost/map --username root --password cloudera --table mtab --m 2 --split-by id --delete-target-dir --target-dir /user/cloudera/mtab2

hadoop fs -ls /user/cloudera/mtab2

hadoop fs -cat /user/cloudera/mtab2/part-m-00000

hadoop fs -cat /user/cloudera/mtab2/part-m-00001

====================

Default Mappers (no --m specified; Sqoop defaults to 4)

====================

sqoop import --connect jdbc:mysql://localhost/map --username root --password cloudera --split-by id --table mtab --delete-target-dir --target-dir /user/cloudera/mtab4

hadoop fs -ls /user/cloudera/mtab4

hadoop fs -cat /user/cloudera/mtab4/part-m-00000

hadoop fs -cat /user/cloudera/mtab4/part-m-00001

hadoop fs -cat /user/cloudera/mtab4/part-m-00002

hadoop fs -cat /user/cloudera/mtab4/part-m-00003

Task 1 --- without Split By (Negative Scenario)

Cloudera Folks
sqoop import --connect jdbc:mysql://localhost/map --username root --password cloudera --table mtab --m 2 --delete-target-dir --target-dir /user/cloudera/mtab2

Task 2

From Sqoop, how can you query the data of an RDBMS?

08-01-2023

Task 1

======================

Cloudera staging Exports

======================

mysql -uroot -pcloudera

create database if not exists exp;

use exp;

create table ttab(id int,name varchar(100),amount int);

create table st_ttab(id int,name varchar(100),amount int);

quit

cd

echo 1,zeyo,40>zfile

echo 2,ravi,70>>zfile

echo 3,rani,70>>zfile

hadoop fs -mkdir /user/cloudera/exdir

hadoop fs -put zfile /user/cloudera/exdir


sqoop export --connect jdbc:mysql://localhost/exp --username root --password cloudera --table ttab
--staging-table st_ttab --m 1 --export-dir /user/cloudera/exdir

mysql -uroot -pcloudera

use exp;

select * from ttab;

select * from st_ttab;

quit

Task 2

hive (enter)

Cloudera Folks

create database zdb;

use zdb;

create table tab(id int);

select * from tab;

Task 3

mysql -uroot -pcloudera


create database if not exists exp;

use exp;

create table prac(id int,name varchar(100),amount int);

insert into prac values(1,'sai',50);

insert into prac values(2,'zeyo',60);

insert into prac values(3,'raj',70);

select * from prac;

select id,name from prac;

select * from prac where id>1;

select * from prac where id>1 and amount>60;

14-01-2023

Cloudera Folks

hive and enter

create database zeyodb;

!hadoop fs -ls /user/hive/warehouse/;

use zeyodb;

create table ztab(id int);

!hadoop fs -ls /user/hive/warehouse/zeyodb.db/;

Task 1 ---
Cloudera folks

hadoop dfsadmin -safemode leave

cd

echo 1,sai>tfile

echo 2,zeyo>>tfile

hive (enter)

create database if not exists cdb;

use cdb;

create table ztab(id int,name string) row format delimited fields terminated by ',' stored as textfile;

load data local inpath '/home/cloudera/tfile' into table ztab;

select * from ztab;

!hadoop fs -ls /user/hive/warehouse/cdb.db/ztab/;

15-01-2023

Cloudera Folks

hadoop dfsadmin -safemode leave

cd

echo 1,sai>tfile

echo 2,zeyo>>tfile

hadoop fs -put tfile /user/cloudera/

hive (type and enter)

create database if not exists hdb;

use hdb;

create table ztab(id int,name string) row format delimited fields terminated by ',';

load data inpath '/user/cloudera/tfile' into table ztab;


select * from ztab;

!hadoop fs -ls /user/cloudera/; ---> you will not see tfile (load data inpath moves the file into the table location)

Cloudera Folks

hadoop dfsadmin -safemode leave

cd

echo 1,sai>file1

echo 2,zeyo>>file1

hadoop fs -rmr /user/cloudera/adir

hadoop fs -mkdir /user/cloudera/adir

hadoop fs -put file1 /user/cloudera/adir/

hive (type and enter)

create database if not exists mdb;

use mdb;

create table atab(id int,name string) row format delimited fields terminated by ',' location
'/user/cloudera/adir';

select * from atab;

describe formatted atab;

Task 1 ----

Cloudera Folks

hadoop dfsadmin -safemode leave

cd

echo 1,sai>file1

echo 2,zeyo>>file1

echo 3,anil>file2
echo 4,rita>>file2

hadoop fs -rmr /user/cloudera/dir1

hadoop fs -mkdir /user/cloudera/dir1

hadoop fs -rmr /user/cloudera/dir2

hadoop fs -mkdir /user/cloudera/dir2

hadoop fs -put file1 /user/cloudera/dir1/

hadoop fs -put file2 /user/cloudera/dir2/

hive (type and enter)

create database if not exists adb;

use adb;

drop table tab1;

drop table tab2;

create table tab1(id int,name string) row format delimited fields terminated by ',' location
'/user/cloudera/dir1';

create table tab2(id int,name string) row format delimited fields terminated by ',' location
'/user/cloudera/dir2';

insert into tab1 select * from tab2;

!hadoop fs -ls /user/cloudera/dir1;

21-01-2023

Types of Tables

Cloudera Folks

cd

echo 1,sai>data.csv

echo 2,zeyo>>data.csv
hadoop fs -mkdir /user/cloudera/mdir

hadoop fs -mkdir /user/cloudera/edir

hadoop fs -put data.csv /user/cloudera/mdir

hadoop fs -put data.csv /user/cloudera/edir

hive

create table mtab(id int,name string) row format delimited fields terminated by ',' location
'/user/cloudera/mdir';

select * from mtab;

create external table etab(id int,name string) row format delimited fields terminated by ',' location
'/user/cloudera/edir';

select * from etab;

!hadoop fs -ls /user/cloudera/;

drop table mtab;

drop table etab;

!hadoop fs -ls /user/cloudera/;

=================

Cloudera Folks -- static load

=================

cd

echo 1,Sai,I>INDTxns.csv
echo 2,zeyo,I>>INDTxns.csv

echo 3,Hema,K>UKTxns.csv

echo 4,ravi,K>>UKTxns.csv

echo 5,Jai,S>USTxns.csv

echo 6,Swathi,S>>USTxns.csv

hive

create database if not exists pdb;

use pdb;

create table parttab(id int,name string,chk string) partitioned by (country string) row format
delimited fields terminated by ',' location '/user/cloudera/pdir';

load data local inpath '/home/cloudera/INDTxns.csv' into table parttab partition(country='INDIA');

load data local inpath '/home/cloudera/USTxns.csv' into table parttab partition(country='USA');

load data local inpath '/home/cloudera/UKTxns.csv' into table parttab partition(country='UK');

select * from parttab;

!hadoop fs -ls /user/cloudera/pdir/;

!hadoop fs -ls /user/cloudera/pdir/country=INDIA;

!hadoop fs -ls /user/cloudera/pdir/country=USA;

!hadoop fs -ls /user/cloudera/pdir/country=UK;


Updated Commands

=================

Cloudera Folks -- static load

=================

cd

echo 1,Sai,I>INDTxns.csv

echo 2,zeyo,I>>INDTxns.csv

echo 3,Hema,K>UKTxns.csv

echo 4,ravi,K>>UKTxns.csv

echo 5,Jai,S>USTxns.csv

echo 6,Swathi,S>>USTxns.csv

hive

create database if not exists pdb;

use pdb;

drop table parttab;

create table parttab(id int,name string,chk string) partitioned by (country string) row format
delimited fields terminated by ',' location '/user/cloudera/pdir';

load data local inpath '/home/cloudera/INDTxns.csv' into table parttab partition(country='INDIA');

load data local inpath '/home/cloudera/USTxns.csv' into table parttab partition(country='USA');

load data local inpath '/home/cloudera/UKTxns.csv' into table parttab partition(country='UK');


select * from parttab;

!hadoop fs -ls /user/cloudera/pdir/;

!hadoop fs -ls /user/cloudera/pdir/country=INDIA;

!hadoop fs -ls /user/cloudera/pdir/country=USA;

!hadoop fs -ls /user/cloudera/pdir/country=UK;

22-01-2023

=================

Cloudera Folks

=================

============

Data Ready

============

cd

echo 1,Sai,I,IND>allcountry.csv
echo 2,zeyo,I,IND>>allcountry.csv

echo 3,Hema,K,UK>>allcountry.csv

echo 4,Gomathi,K,UK>>allcountry.csv

echo 5,Jai,S,US>>allcountry.csv

echo 6,Swathi,S,US>>allcountry.csv

============

hive -- Go inside hive

============

create database if not exists pdb;

use pdb;

============

Static Table creation

============

drop table sitab;

create table sitab(id int,name string,chk string) partitioned by (country string) row format delimited
fields terminated by ',' location '/user/cloudera/sidir';

drop table srctab;

============

Source table creation

============

create table srctab(id int,name string,chk string,country string) row format delimited fields
terminated by ',' location '/user/cloudera/sdir';
============

Load allcountry data to srctab

============

load data local inpath '/home/cloudera/allcountry.csv' into table srctab;

============

static insert for USA Partitions

============

insert into sitab partition(country='USA') select id,name,chk from srctab where country='US';

!hadoop fs -ls /user/cloudera/sidir; ---> you will see country=USA

============

Dynamic Table creation

============

drop table dyntab;

create table dyntab(id int,name string,chk string) partitioned by (country string) row format
delimited fields terminated by ',' location '/user/cloudera/dyndir';

============

Dynamic Insert
============

set hive.exec.dynamic.partition.mode=nonstrict;

insert into dyntab partition(country) select id,name,chk,country from srctab;

!hadoop fs -ls /user/cloudera/dyndir; --- you will see all the country partitions

=================

Cloudera Folks --- Dynamic Partitions

=================

cd

echo 1,Sai,I,IND>allcountry.csv

echo 2,zeyo,I,IND>>allcountry.csv

echo 3,Hema,K,UK>>allcountry.csv

echo 4,Gomathi,K,UK>>allcountry.csv

echo 5,Jai,S,US>>allcountry.csv

echo 6,Swathi,S,US>>allcountry.csv

hive

create database if not exists pdb;

use pdb;

create table dyntab(id int,name string,chk string) partitioned by (country string) row format
delimited fields terminated by ',' location '/user/cloudera/dyndir';
create table sttab(id int,name string,chk string,country string) row format delimited fields
terminated by ',' location '/user/cloudera/stdir';

load data local inpath '/home/cloudera/allcountry.csv' into table sttab;

set hive.exec.dynamic.partition.mode=nonstrict;

insert into dyntab partition(country) select id,name,chk,country from sttab;

Dynamic Insert

============

set hive.exec.dynamic.partition.mode=nonstrict;

insert into dyntab partition(country) select id,name,chk,country from srctab;

!hadoop fs -ls /user/cloudera/dyndir; --- you will see all the country partitions

Task Cloudera --- Sub partitions

==================

Sub partitions-- Cloudera Folks


==================

Task 1 -----

cd

echo 1,Sai,I,IND,cash>allc.csv

echo 2,zeyo,I,IND,credit>>allc.csv

echo 3,Hema,K,UK,cash>>allc.csv

echo 4,Gomathi,K,UK,credit>>allc.csv

echo 5,Jai,S,US,cash>>allc.csv

echo 6,Swathi,S,US,credit>>allc.csv

echo 7,Sai,I,IND,credit>>allc.csv

echo 8,zeyo,I,IND,cash>>allc.csv

echo 9,Hema,K,UK,credit>>allc.csv

echo 10,Gomathi,K,UK,cash>>allc.csv

echo 11,Jai,S,US,credit>>allc.csv

echo 12,Swathi,S,US,cash>>allc.csv

====> Go inside Hive <=======

create table srcs(id int,name string,chk string,country string,spendby string) row format delimited
fields terminated by ',' location '/user/cloudera/srcd';

load data local inpath '/home/cloudera/allc.csv' into table srcs;


create table tars(id int,name string,chk string) partitioned by (country string,spendby string) row
format delimited fields terminated by ',' location '/user/cloudera/tard';

set hive.exec.dynamic.partition.mode=nonstrict;

insert into tars partition (country,spendby) select id,name,chk,country,spendby from srcs;

!hadoop fs -ls /user/cloudera/tard;

28-01-2023

Project solution for Both Cloudera and Lab

Cloudera Folks Project

===============

Go to Mysql

===============

mysql -uroot -pcloudera

Create database if not exists prodb;

use prodb;

select * from customer_total;


create table customer_src(id int(10),username varchar(100),sub_port varchar(100),host
varchar(100),date_time varchar(100),hit_count_val_1 varchar(100),hit_count_val_2
varchar(100),hit_count_val_3 varchar(100),timezone varchar(100),method varchar(100),`procedure`
varchar(100),value varchar(100),sub_product varchar(100),web_info varchar(100),status_code
varchar(100));

insert into customer_src select * From customer_total where id>=301 and id<=330;

quit

=============================

Edge Node

=============================

rm -rf /home/cloudera/avsrcdir

mkdir /home/cloudera/avsrcdir

cd /home/cloudera/avsrcdir

echo -n cloudera>/home/cloudera/passfile

sqoop job --delete inpjob

sqoop job --create inpjob -- import --connect jdbc:mysql://localhost/prodb --username root --password-file file:///home/cloudera/passfile -m 1 --table customer_src --target-dir /user/cloudera/customer_stage_loc --incremental append --check-column id --last-value 0 --as-avrodatafile

sqoop job --list

sqoop job --exec inpjob

hadoop fs -mkdir /user/cloudera/avscdirpro

hadoop fs -put /home/cloudera/avsrcdir/customer_src.avsc /user/cloudera/avscdirpro


====================================

Hive shell

====================================

hive

create database prodb;

use prodb;

create table customer_src ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.avro.AvroSerDe' STORED AS AVRO LOCATION '/user/cloudera/customer_stage_loc' TBLPROPERTIES ('avro.schema.url'='/user/cloudera/avscdirpro/customer_src.avsc');

select * from customer_src; === you will see the data

create external table customer_target_tab partitioned by (current_day string,year string,month string,day string) ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.avro.AvroSerDe' STORED AS AVRO LOCATION '/user/cloudera/customer_target_tab' TBLPROPERTIES ('avro.schema.url'='/user/cloudera/avscdirpro/customer_src.avsc');

select * from customer_target_tab; ==== you will not see the data yet

Updated Commands Task 1

Cloudera avro

========

mysql -uroot -pcloudera


create database dataa;

use dataa;

drop table atab;

create table atab(id int,name varchar(100),amount int);

insert into atab values(1,'rajesh',40);

insert into atab values(2,'vishnu',10);

insert into atab values(3,'rani',60);

select * from atab;

quit;

========

Edge Node

sqoop import --connect jdbc:mysql://localhost/dataa --username root --password cloudera --table atab --m 1 --delete-target-dir --target-dir /user/cloudera/adir --as-avrodatafile

========

hive

create database if not exists adb;

use adb;

drop table atab;

create table atab(id int,name string,amount int) stored as avro location '/user/cloudera/adir';

select * from atab;


====================

mysql -uroot -pcloudera

use dataa;

alter table atab drop column name;

insert into atab values(4,90);

insert into atab values(5,20);

select * from atab;

quit

========

Edge Node

sqoop import --connect jdbc:mysql://localhost/dataa --username root --password cloudera --table atab --m 1 --target-dir /user/cloudera/adir --as-avrodatafile --incremental append --check-column id --last-value 3

========

hive

select * from adb.atab;

Performance Tuning

partitions
parallel execution
vectorization
bucketing
hive execution Engine
Hive MapSide Join
Downloads

=============
Scala IDE
=============

Windows --- http://downloads.typesafe.com/scalaide-pack/4.7.0-vfinal-oxygen-212-20170929/scala-SDK-4.7.0-vfinal-2.12-win32.win32.x86_64.zip

Mac --- http://downloads.typesafe.com/scalaide-pack/4.7.0-vfinal-oxygen-212-20170929/scala-SDK-4.7.0-vfinal-2.12-macosx.cocoa.x86_64.zip

Linux --- http://downloads.typesafe.com/scalaide-pack/4.7.0-vfinal-oxygen-212-20170929/scala-SDK-4.7.0-vfinal-2.12-linux.gtk.x86_64.tar.gz

=============
Intellij Download
=============

Windows --- https://www.jetbrains.com/idea/download/download-thanks.html?platform=windows&code=IIC

Mac --- https://www.jetbrains.com/idea/download/download-thanks.html?platform=mac&code=IIC

Linux --- https://www.jetbrains.com/idea/download/download-thanks.html?platform=linux&code=IIC

WinUtils Download

Windows ----

https://github.com/steveloughran/winutils/raw/master/hadoop-2.7.1/bin/winutils.exe

=============
Spark Download ----MAC/Windows/Ubuntu
=============

https://archive.apache.org/dist/spark/spark-2.4.7/spark-2.4.7-bin-hadoop2.6.tgz

For Scala IDE --- just copy the download link above and paste it in the browser
04-02-2023

Task 1 ----

What is Map Side Join

Task 2 --- Execute project 1 Insert statement and give your analysis -- How many partitions created

Cloudera Folks Project

===============
Go to Mysql
===============

mysql -uroot -pcloudera

Create database if not exists prodb;


use prodb;

select * from customer_total;

create table customer_src(id int(10),username varchar(100),sub_port varchar(100),host


varchar(100),date_time varchar(100),hit_count_val_1 varchar(100),hit_count_val_2
varchar(100),hit_count_val_3 varchar(100),timezone varchar(100),method varchar(100),`procedure`
varchar(100),value varchar(100),sub_product varchar(100),web_info varchar(100),status_code
varchar(100));

insert into customer_src select * From customer_total where id>=301 and id<=330;

quit
=============================
Edge Node
=============================
rm -rf /home/cloudera/avsrcdir
mkdir /home/cloudera/avsrcdir
cd /home/cloudera/avsrcdir
echo -n cloudera>/home/cloudera/passfile

sqoop job --delete inpjob


sqoop job --create inpjob -- import --connect jdbc:mysql://localhost/prodb --username root --password-file file:///home/cloudera/passfile -m 1 --table customer_src --target-dir /user/cloudera/customer_stage_loc --incremental append --check-column id --last-value 0 --as-avrodatafile
sqoop job --list
sqoop job --exec inpjob

hadoop fs -mkdir /user/cloudera/avscdirpro


hadoop fs -put /home/cloudera/avsrcdir/customer_src.avsc /user/cloudera/avscdirpro

====================================
Hive shell
====================================

hive
create database prodb;
use prodb;

create table customer_src ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.avro.AvroSerDe' STORED AS AVRO LOCATION '/user/cloudera/customer_stage_loc' TBLPROPERTIES ('avro.schema.url'='/user/cloudera/avscdirpro/customer_src.avsc');

select * from customer_src; === you will see the data

create external table customer_target_tab partitioned by (current_day string,year string,month string,day string) ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.avro.AvroSerDe' STORED AS AVRO LOCATION '/user/cloudera/customer_target_tab' TBLPROPERTIES ('avro.schema.url'='/user/cloudera/avscdirpro/customer_src.avsc');

select * from customer_target_tab; ==== you will not see the data yet

set hive.exec.max.dynamic.partitions=1000;
set hive.exec.dynamic.partition.mode=nonstrict;

insert into projdb.customer_target_tab partition (current_day,year,month,day)
select id,username,sub_port,host,date_time,hit_count_val_1,hit_count_val_2,hit_count_val_3,timezone,method,procedure,value,sub_product,web_info,status_code,
current_date,
year(from_unixtime(unix_timestamp(date_time,'dd/MMM/yyyy:HH:mm:ss Z'),'yyyy-MM-dd')),
MONTH(from_unixtime(unix_timestamp(date_time,'dd/MMM/yyyy:HH:mm:ss Z'),'yyyy-MM-dd')),
DAY(from_unixtime(unix_timestamp(date_time,'dd/MMM/yyyy:HH:mm:ss Z'),'yyyy-MM-dd'))
from customer_src where not(upper(web_info) like '%JAKARTA%');

!hadoop fs -ls /user/cloudera/customer_target_tab;


!hadoop fs -ls /user/cloudera/customer_target_tab/current_day=2022-07-29;
!hadoop fs -ls /user/cloudera/customer_target_tab/current_day=2022-07-29/year=2011;

Performance Tuning

partitions
parallel execution
vectorization
bucketing
hive execution Engine
Hive MapSide Join

2006 --- Hadoop Launched -- HDFS, MR

2008 --- Hive came into Picture

2010 -- Hive successful

2010-2011 --- The data world needed something else

Hadoop/Hive had so many restrictions

We used to have restricted thinking

Other requirements

Simpler and faster tools

Machine learning support
Data streaming support
A highly customizable engine
Need a tool that can do the entire Extract - Transform - Load

Hive Restrictions

I will process data in HDFS / the cloud

Hive is fast --- 10 min for 50 TB of data --- but can you complete it in 1 min?
Hive has no machine learning support
Hive has no streaming support
Not much customization
Hive is environment dependent ----- it does not run standalone on Windows / Linux / macOS
Not much NoSQL integration support
Bring the data to HDFS first -- then I will process it

2010 -- UC Berkeley --- presentation --- Matei Zaharia

I found a tool
Extremely simple
Extremely fast
My tool can process data on HDFS, cloud, Windows, Linux, macOS
You do not require a separate ingestion tool
My tool can bring the data -- process it in SQL -- deliver the data
My tool supports SQL for processing
Machine learning algorithms
Very, very good streaming data processing
Hadoop is just one of the options
Easily customizable
============================================
One formula to read and write for any source and destination

=============================================

Spark

Lightning-fast cluster computing

2014 --- Spark became a top-level project under the Apache Software Foundation

Zaharia -- launched his own organization to provide a licensed, commercial Spark platform

DATABRICKS

2016 – Spark

Emergency Project Update -- Change in Database name -- Cloudera Folks

insert into prodb.customer_target_tab partition (current_day,year,month,day)
select id,username,sub_port,host,date_time,hit_count_val_1,hit_count_val_2,hit_count_val_3,timezone,method,procedure,value,sub_product,web_info,status_code,
current_date,
year(from_unixtime(unix_timestamp(date_time,'dd/MMM/yyyy:HH:mm:ss Z'),'yyyy-MM-dd')),
MONTH(from_unixtime(unix_timestamp(date_time,'dd/MMM/yyyy:HH:mm:ss Z'),'yyyy-MM-dd')),
DAY(from_unixtime(unix_timestamp(date_time,'dd/MMM/yyyy:HH:mm:ss Z'),'yyyy-MM-dd'))
from customer_src where not(upper(web_info) like '%JAKARTA%');

Emergency Project Update Corrected the Code

Cloudera Folks Project

===============
Go to Mysql
===============

mysql -uroot -pcloudera

Create database if not exists prodb;


use prodb;
select * from customer_total;

create table customer_src(id int(10),username varchar(100),sub_port varchar(100),host


varchar(100),date_time varchar(100),hit_count_val_1 varchar(100),hit_count_val_2
varchar(100),hit_count_val_3 varchar(100),timezone varchar(100),method varchar(100),`procedure`
varchar(100),value varchar(100),sub_product varchar(100),web_info varchar(100),status_code
varchar(100));

insert into customer_src select * From customer_total where id>=301 and id<=330;

quit
=============================
Edge Node
=============================
rm -rf /home/cloudera/avsrcdir
mkdir /home/cloudera/avsrcdir
cd /home/cloudera/avsrcdir
echo -n cloudera>/home/cloudera/passfile

sqoop job --delete inpjob


sqoop job --create inpjob -- import --connect jdbc:mysql://localhost/prodb --username root --password-file file:///home/cloudera/passfile -m 1 --table customer_src --target-dir /user/cloudera/customer_stage_loc --incremental append --check-column id --last-value 0 --as-avrodatafile
sqoop job --list
sqoop job --exec inpjob

hadoop fs -mkdir /user/cloudera/avscdirpro


hadoop fs -put /home/cloudera/avsrcdir/customer_src.avsc /user/cloudera/avscdirpro

====================================
Hive shell
====================================

hive
create database prodb;
use prodb;

create table customer_src ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.avro.AvroSerDe' STORED AS AVRO LOCATION '/user/cloudera/customer_stage_loc' TBLPROPERTIES ('avro.schema.url'='/user/cloudera/avscdirpro/customer_src.avsc');

select * from customer_src; === you will see the data

create external table customer_target_tab partitioned by (current_day string,year string,month string,day string) ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.avro.AvroSerDe' STORED AS AVRO LOCATION '/user/cloudera/customer_target_tab' TBLPROPERTIES ('avro.schema.url'='/user/cloudera/avscdirpro/customer_src.avsc');

select * from customer_target_tab; ==== you will not see the data yet

set hive.exec.max.dynamic.partitions=1000;
set hive.exec.dynamic.partition.mode=nonstrict;

insert into prodb.customer_target_tab partition (current_day,year,month,day)
select id,username,sub_port,host,date_time,hit_count_val_1,hit_count_val_2,hit_count_val_3,timezone,method,procedure,value,sub_product,web_info,status_code,
current_date,
year(from_unixtime(unix_timestamp(date_time,'dd/MMM/yyyy:HH:mm:ss Z'),'yyyy-MM-dd')),
MONTH(from_unixtime(unix_timestamp(date_time,'dd/MMM/yyyy:HH:mm:ss Z'),'yyyy-MM-dd')),
DAY(from_unixtime(unix_timestamp(date_time,'dd/MMM/yyyy:HH:mm:ss Z'),'yyyy-MM-dd'))
from customer_src where not(upper(web_info) like '%JAKARTA%');

!hadoop fs -ls /user/cloudera/customer_target_tab;


!hadoop fs -ls /user/cloudera/customer_target_tab/current_day=2023-02-04;

11-02-2023

object obj {
def main(args:Array[String]):Unit=
{
println("zeyobron")
val d = 10
println(d)
}
}

***************
object obj {

def main(args:Array[String]):Unit=
{
val list = List(1,2,3,4)

println(list)

}
}
12-02-2023
package pack

object obj {
def main(args:Array[String]):Unit=
{
println("===Started===")
val a = 2
println(a+1)
val b = "zeyobron"
println(b + "-analytics")

}
}

*******************
Code

package pack

object obj {
def main(args:Array[String]):Unit=
{
println("===Started===")
val ls = List(1,2,3,4)
println
println("===raw list===")
ls.foreach(println)
val procls = ls.filter( x => x > 2 )
println
println("===proc list===")
procls.foreach(println)
}
}

package pack

object obj {
def main(args:Array[String]):Unit=
{
println("===Started===")
println
val lst = List("zeyo" , "zeyobron" , "sai")
println("====raw list===")
lst.foreach(println)
val procls = lst.map( x => x + ",analytics" )
println
println("==== process list===")
procls.foreach(println)
}
}

Task 1 ---- zeyo filter

println
val lst = List("zeyo" , "zeyobron" , "sai")
println("====raw list===")
lst.foreach(println)
val procls = lst.filter( x => x.contains("zeyo") )
println
println("==== filter list===")
procls.foreach(println)

Task 2 -- (Optional)

Iterate each element Replace zeyo with tera

Do(map) the replace
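
A minimal sketch of one way to do Task 2, reusing the sample list from Task 1 (assumed here) and the same map pattern shown above:

// same sample list as Task 1 (assumed); replace "zeyo" with "tera" in every element
val lst = List("zeyo" , "zeyobron" , "sai")
println("====raw list===")
lst.foreach(println)
val terals = lst.map( x => x.replace("zeyo","tera") )
println
println("==== replaced list===")
terals.foreach(println)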

18-02-2023

package pack

object obj {
def main(args:Array[String]):Unit={
println("=====started =====")
println
println("====Raw List=====")
val listr = List( "A-B" ,"C-D" ,"E-F")
listr.foreach(println)
println("====Proc List=====")
val flatdata = listr.flatMap( x => x.split("-"))
flatdata.foreach(println)

}
}

val liststr = List( "Amazon-Jeff-America",


"Microsoft-BillGates-America",
"TCS-TATA-india",
"Reliance-Ambani-india")
1 ---- Filter elements contains india
2 ---- Flatten with delimiter " - "
3 ---- replace "india" with "local"
4 ---- Convert all the string to lower case

package pack

object obj {
def main(args:Array[String]):Unit={
println("=====started =====")
println
println("====Raw List=====")
println
val liststr = List( "Amazon-Jeff-America",
"Microsoft-BillGates-America",
"TCS-TATA-india",
"Reliance-Ambani-india")

liststr.foreach(println)
println
println("===filter list==")
println
val fillist = liststr.filter( x => x.contains("india"))
fillist.foreach(println)
println("=====flat map=====")
val flatdata = fillist.flatMap( x => x.split("-"))
flatdata.foreach(println)
println
println("=====replace map=====")
println
val repdata = flatdata.map( x => x.replace("india","local"))
repdata.foreach(println)
println
println("=====lower map=====")
println
val lowdata = repdata.map( x => x.toLowerCase() )
lowdata.foreach(println)

}
}
Single Line Code

package pack
object obj {
def main(args:Array[String]):Unit={
println("=====started =====")
println
println("====Raw List=====")
println
val liststr = List( "Amazon-Jeff-America",
"Microsoft-BillGates-America",
"TCS-TATA-india",
"Reliance-Ambani-india")

liststr.foreach(println)
println
println("===filter list==")
println
val fillist = liststr.filter( x => x.contains("india"))
.flatMap( x => x.split("-"))
.map( x => x.replace("india","local"))
.map( x => x.toLowerCase() )
fillist.foreach(println)
}
}

val liststr=
List(
"State->TamilNadu~City->Chennai",
"State->Karnataka~City->Bangalore",
"State->Telangana~City->Hyderabad"
)

liststr.foreach(println)

object obj {
def main(args:Array[String]):Unit={
println("=====started =====")
println
println("====Raw List=====")
println
val liststr=
List(
"State->TamilNadu~City->Chennai",
"State->Karnataka~City->Bangalore",
"State->Telangana~City->Hyderabad"
)
liststr.foreach(println)
println
val flatdata = liststr.flatMap( x => x.split("~"))
println("====flat data===")
println
flatdata.foreach(println)
val state = flatdata.filter( x =>x.contains("State"))
println
println("====state data===")
println
state.foreach(println)
val city = flatdata.filter( x =>x.contains("City"))
println
println("====city data===")
println
city.foreach(println)
val finallist = state.map( x => x.replace("State->", ""))
println
println("====finallist data===")
println
finallist.foreach(println)
val finallistcity = city.map( x => x.replace("City->", ""))
println
println("====finallist City data===")
println
finallistcity.foreach(println)
}
}

*********************

val liststr = List( "Amazon-Jeff-America",


"Microsoft-BillGates-canada",
"TCS-TATA-india",
"Reliance-Ambani-india")
liststr.foreach(println)
println
println("===filter list==")
println
val fillist = liststr.filter( x => x.contains("india") || x.contains("canada"))
fillist.foreach(println)
println
val notlist = liststr.filter( x => !(x.contains("America")))
notlist.foreach(println)
Task 2 ---- (Optional)

*******************
val liststr = List(
"BigData-Spark-Hive",
"Spark-Hadoop-Hive",
"Sqoop-Hive-Spark",
"Sqoop-BD-Hive"
)

********************
19-02-2023

Code -- Ensure the right Path

package pack

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext

object obj {
def main(args:Array[String]):Unit={
println("===Started==")
val conf = new SparkConf().setMaster("local[*]").setAppName("first")
val sc = new SparkContext(conf)
sc.setLogLevel("ERROR")
val data = sc.textFile("file:///D:/data/st.txt")
println("====raw data====")
data.foreach(println)
val flatdata = data.flatMap( x => x.split("~"))
println
println("====flat data====")
flatdata.foreach(println)
val state = flatdata.filter( x => x.contains("State"))
println
println("====state data====")
state.foreach(println)
val city = flatdata.filter( x => x.contains("City"))
println
println("====city data====")
city.foreach(println)
val finalstate = state.map( x => x.replace("State->" , "" ))
println
println("====finalstate data====")
finalstate.foreach(println)
val finalcity = city.map( x => x.replace("City->" , "" ))
println
println("====finalcity data====")
finalcity.foreach(println)
}
}

*****************
Task 1 ----

Process usdata.csv
Read this File
Iterate each row filter length>200 -- One Line
Flatten the results with comma (,)
Remove hyphens from the flattened data
Concatenate ",zeyo" to every line
Print it

Task 2 ----

Read datatxns.txt
Filter rows with Vaulting
Concat ",zeyo"
print it

**************
20-01-2023
Task Solution

Task 1 ----

Solution

package pack

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext

object obj {
def main(args:Array[String]):Unit={
println("===Started==")
val conf = new SparkConf().setMaster("local[*]").setAppName("first")
val sc = new SparkContext(conf)
sc.setLogLevel("ERROR")
val data = sc.textFile("file:///C:/data/usdata.csv")
println("====raw data====")
data.take(5).foreach(println)
val lendata = data.filter( x => x.length() > 200)
println("====length data====")
lendata.foreach(println)
val flatdata = lendata.flatMap( x => x.split(","))
println("====flatdata data====")
flatdata.foreach(println)
println
val replacedata = flatdata.map( x => x.replace("-", ""))
println("====replacedata data====")
replacedata.foreach(println)
println
val concatdata = replacedata.map( x => x+",zeyo")
println("====concatdata data====")
concatdata.foreach(println)
}
}

Task 2 ----

package pack
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
object obj {
def main(args:Array[String]):Unit={
println("===Started==")
val conf = new SparkConf().setMaster("local[*]").setAppName("first")
val sc = new SparkContext(conf)
sc.setLogLevel("ERROR")
val data = sc.textFile("file:///C:/data/datatxns.txt")
println("====raw data====")
data.foreach(println)
val fildata = data.filter( x => x.contains("Vaulting"))
println("====Fil data====")
fildata.foreach(println)
val mapdata = fildata.map( x => x+",zeyo")
println("====mapdata ====")
mapdata.foreach(println)
}
}

****************
25-02-2023

package pack

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql._
import org.apache.spark.sql.types._

object obj {

case class schema(id:String,category:String,product:String,mode:String)

def main(args:Array[String]):Unit={
println("===started===")
val conf = new SparkConf().setMaster("local[*]").setAppName("first")
.set("spark.driver.allowMultipleContexts", "true")

val sc = new SparkContext(conf)


sc.setLogLevel("ERROR")
val data = sc.textFile("file:///C:/data44/datatxns.txt")
data.take(10).foreach(println)
val mapsplit = data.map( x => x.split(","))
val schemardd = mapsplit.map( x => schema(x(0),x(1),x(2),x(3)))
val filterrdd= schemardd.filter( x => x.product.contains("Gymnastics"))
println
println
filterrdd.foreach(println)
}
}

*********************

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql._
import org.apache.spark.sql.types._
import org.apache.spark.sql.SparkSession

object obj {

case class schema(id:String,category:String,product:String,mode:String)


def main(args:Array[String]):Unit={
println("===started===")
val conf = new SparkConf().setMaster("local[*]").setAppName("first")
.set("spark.driver.allowMultipleContexts", "true")
val sc = new SparkContext(conf)
sc.setLogLevel("ERROR")
val spark = SparkSession .builder().getOrCreate()

import spark.implicits._

val data = sc.textFile("file:///C:/data/datatxns.txt")


data.take(10).foreach(println)
val mapsplit = data.map( x => x.split(","))
val schemardd = mapsplit.map( x => schema(x(0),x(1),x(2),x(3)))
val filterrdd= schemardd.filter( x => x.product.contains("Gymnastics"))
println
println
filterrdd.foreach(println)
val df = filterrdd.toDF()
df.show
}

}
************************
package pack

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql._
import org.apache.spark.sql.types._
import org.apache.spark.sql.SparkSession

object obj {

def main(args:Array[String]):Unit={
println("===started===")
val conf = new SparkConf().setMaster("local[*]").setAppName("first")
.set("spark.driver.allowMultipleContexts", "true")
val sc = new SparkContext(conf)
sc.setLogLevel("ERROR")
val spark = SparkSession
.builder()
.getOrCreate()
import spark.implicits._

val structschema = StructType(Array(
StructField("id",StringType),
StructField("category",StringType),
StructField("product",StringType),
StructField("mode", StringType)
))
val data = sc.textFile("file:///C:/data/datatxns.txt")
data.take(10).foreach(println)
val mapsplit = data.map( x => x.split(","))
println(mapsplit.collect.toList.size)
val rowrdd = mapsplit.map( x => Row(x(0),x(1),x(2),x(3)))
val filterdata = rowrdd.filter (x => x(2).toString().contains("Gymnastics"))
filterdata.foreach(println)
val df = spark.createDataFrame(filterdata, structschema)

df.show
}
}
************************

Task 1 -----

package pack

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql._
import org.apache.spark.sql.types._
import org.apache.spark.sql.SparkSession

object obj {
def main(args:Array[String]):Unit={
println("===started===")
val conf = new SparkConf().setMaster("local[*]").setAppName("first")
.set("spark.driver.allowMultipleContexts", "true")
val sc = new SparkContext(conf)
sc.setLogLevel("ERROR")
val spark = SparkSession
.builder()
.getOrCreate()
import spark.implicits._
val structschema = StructType(Array(
StructField("id",StringType),
StructField("category",StringType),
StructField("product",StringType),
StructField("mode", StringType)
))
val data = sc.textFile("file:///C:/data/datatxns.txt")
data.take(10).foreach(println)
val mapsplit = data.map( x => x.split(","))
println(mapsplit.collect.toList.size)
val rowrdd = mapsplit.map( x => Row(x(0),x(1),x(2),x(3)))
val filterdata = rowrdd.filter (x => x(2).toString().contains("Gymnastics"))
filterdata.foreach(println)
val df = spark.createDataFrame(filterdata, structschema)
df.show
}
}
26-02-2023

package pack

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql._
import org.apache.spark.sql.types._
import org.apache.spark.sql.SparkSession
object obj {
def main(args:Array[String]):Unit={
System.setProperty("hadoop.home.dir", "C:\\hadoop")
println("===started===")
val conf = new SparkConf().setMaster("local[*]").setAppName("first")
.set("spark.driver.allowMultipleContexts", "true")
val sc = new SparkContext(conf)
sc.setLogLevel("ERROR")
val spark = SparkSession.builder.getOrCreate()
import spark.implicits._
val df = spark
.read
.format("csv")
.option("header", true)
.load("file:///C:/data/usdata.csv")
df.show()
df.createOrReplaceTempView("cdf")
val procdf = spark.sql("select * from cdf where state='LA'")
}
}

**************
Click below link to download the Jar

https://repo1.maven.org/maven2/org/apache/spark/spark-avro_2.11/2.4.7/spark-avro_2.11-2.4.7.jar

Add this jar to the Eclipse/Intellij

Download part.avro and place it in the data folder

val avrodf = spark.read.format("avro").load("file:///C:/data/part.avro")


avrodf.show()
https://zeyoathenabucket.s3.amazonaws.com/part.avro?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIASVGFNNNDPGMWOKGP%2F20230226%2Fap-south-1%2Fs3%2Faws4_request&X-Amz-Date=20230226T123952Z&X-Amz-Expires=3600&X-Amz-SignedHeaders=host&X-Amz-Signature=8dc9bc05c8729f4883e2d03cb3741822e681dc81587ec07ef8406437f10733ab

******************
package pack

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql._
import org.apache.spark.sql.types._
import org.apache.spark.sql.SparkSession
object obj {
def main(args:Array[String]):Unit={
System.setProperty("hadoop.home.dir", "C:\\hadoop")
println("===started===")
val conf = new SparkConf().setMaster("local[*]").setAppName("first")
.set("spark.driver.allowMultipleContexts", "true")
val sc = new SparkContext(conf)
sc.setLogLevel("ERROR")
val spark = SparkSession.builder.getOrCreate()
import spark.implicits._
val csvdf = spark.read.format("csv").option("header",true)
.load("file:///C:/data/usdata.csv")
csvdf.show()
csvdf.createOrReplaceTempView("cdf")
val procdf = spark.sql(" select * from cdf where state='LA' ")
procdf.show()
procdf.write.format("json").mode("overwrite").save("file:///C:/data/pr")
}
}

*********************
Task 1 --- Sql Practise

val df = spark.read.option("header","true").csv("file:///C:/data/df.csv")
val df1 = spark.read.option("header","true").csv("file:///C:/data/df1.csv")
df.show()
df1.show()
df.createOrReplaceTempView("df")
df1.createOrReplaceTempView("df1")
spark.sql("select * from df order by id").show()
spark.sql("select * from df1 order by id").show()
====================================
Select two columns
====================================
spark.sql("select id,tdate from df order by id").show()
====================================
Select column with category filter = Exercise
===================================
spark.sql("select id,tdate,category from df where category='Exercise' order by id").show()

====================================
Multi Column filter
====================================
spark.sql("select id,tdate,category,spendby from df where category='Exercise' and spendby='cash'
").show()

====================================
Multi Value Filter
====================================
spark.sql("select * from df where category in ('Exercise','Gymnastics')").show()

04-03-2023

Spark AWS RDS integration

package pack

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql._
import org.apache.spark.sql.types._
import org.apache.spark.sql.SparkSession
object obj {
def main(args:Array[String]):Unit={

System.setProperty("hadoop.home.dir", "C:\\hadoop")
println("===started===")
val conf = new SparkConf().setMaster("local[*]").setAppName("first")
.set("spark.driver.allowMultipleContexts", "true")
val sc = new SparkContext(conf)
sc.setLogLevel("ERROR")
val spark = SparkSession.builder.getOrCreate()
import spark.implicits._
val sqldf = spark.read
.format("jdbc")
.option("url","jdbc:mysql://zeyodb.cz7qhc39aphp.ap-south-
1.rds.amazonaws.com:3306/zeyodb")
.option("driver","com.mysql.jdbc.Driver")
.option("user","root")
.option("password","Aditya908")
.option("dbtable","cashdata")
.load()
sqldf.show()
}
}

********************
spark-shell --packages mysql:mysql-connector-java:5.1.21

val sqldf = spark.read.format("jdbc").option("url","jdbc:mysql://zeyodb.cz7qhc39aphp.ap-south-


1.rds.amazonaws.com:3306/
zeyodb").option("driver","com.mysql.jdbc.Driver").option("user","root").option("password","Aditya
908").option("dbtable","cashdata").load()

sqldf.show()

**********************
Task 1 -- XML read

book.xml
xml jar
rowtag---book

Task 2 -- Test partitionBy (may not work if you have a write issue) -- that's fine, leave it

Task 3 -- Optional ---

transactions.xml
with RowTag -- POSLog
and print the schema ---- df.printSchema() ---- don't panic
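
A minimal sketch for Task 3, following the same spark-xml read pattern used for book.xml; the transactions.xml path below is an assumption:

// read transactions.xml with POSLog as the row tag and print the inferred schema
val posdf = spark.read
.format("xml")
.option("rowtag","POSLog")
.load("file:///C:/data/transactions.xml")
posdf.printSchema()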

************************
Code

package pack
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql._
import org.apache.spark.sql.types._
import org.apache.spark.sql.SparkSession
object obj {
def main(args:Array[String]):Unit={
System.setProperty("hadoop.home.dir", "C:\\hadoop")
println("===started===")
val conf = new SparkConf().setMaster("local[*]").setAppName("first")
.set("spark.driver.allowMultipleContexts", "true")
val sc = new SparkContext(conf)
sc.setLogLevel("ERROR")
val spark = SparkSession.builder.getOrCreate()
import spark.implicits._
val df = spark.read
.format("xml")
.option("rowtag","book")
.load("file:///C:/data/book.xml")
df.show()
val countryschema = StructType(Array(
StructField("id",StringType,true),
StructField("name",StringType,true),
StructField("check",StringType,true),
StructField("country", StringType, true)
))
val df1 = spark.read
.format("csv")
.schema(countryschema)
.load("file:///C:/data/allcountry1.csv")
df1.show()
df1.write.format("csv").partitionBy("country","check").save("file:///C:/data/rcdata2")
}
}

***************************
05-03-2023

val countryschema = StructType(Array(
StructField("txnno",StringType,true),
StructField("txndate",StringType,true),
StructField("custno",StringType,true),
StructField("amount", StringType, true),
StructField("category",StringType,true),
StructField("product",StringType,true),
StructField("city",StringType,true),
StructField("state",StringType,true),
StructField("spendby", StringType, true)
))
val df = spark.read.format("csv")
.schema(countryschema)
.load("s3a://zeyoathenabucket/txns10k.txt")
df.show()
}
}
*******************

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql._
import org.apache.spark.sql.types._
import org.apache.spark.sql.SparkSession
object obj {
def main(args:Array[String]):Unit={

System.setProperty("hadoop.home.dir", "C:\\hadoop")
println("===started===")
val conf = new SparkConf().setMaster("local[*]").setAppName("first")
.set("spark.driver.allowMultipleContexts", "true")
val sc = new SparkContext(conf)
sc.setLogLevel("ERROR")
val spark = SparkSession
.builder
.config("fs.s3a.access.key","AKIASVGFNNNDOTIRMUEP")

.config("fs.s3a.secret.key","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")
.getOrCreate()
import spark.implicits._
val countryschema = StructType(Array(
StructField("txnno",StringType,true),
StructField("txndate",StringType,true),
StructField("custno",StringType,true),
StructField("amount", StringType, true),
StructField("category",StringType,true),
StructField("product",StringType,true),
StructField("city",StringType,true),
StructField("state",StringType,true),
StructField("spendby", StringType, true)
))
val df = spark.read.format("csv")
.schema(countryschema)
.load("s3a://zeyoathenabucket/txns10k.txt")
df.show()
}

}
**************************************
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql._
import org.apache.spark.sql.types._
import org.apache.spark.sql.SparkSession

object obj {
def main(args:Array[String]):Unit={

System.setProperty("hadoop.home.dir", "C:\\hadoop")
println("===started===")
val conf = new SparkConf().setMaster("local[*]").setAppName("first")
.set("spark.driver.allowMultipleContexts", "true")
val sc = new SparkContext(conf)
sc.setLogLevel("ERROR")
val spark = SparkSession
.builder
.getOrCreate()
import spark.implicits._
val df = spark.read.format("csv").option("header","true").load("file:///C:/
data/usdata.csv")
df.write.format("json").mode("error").save("file:///C:/data/jsondataus")
println("===== data written ======")

}
}

**********************

Task sql

====================================
Like Filter
====================================

spark.sql("select * from df where product like ('%Gymnastics%')").show()

====================================
Not Filters
====================================

spark.sql("select * from df where category != 'Exercise'").show()


====================================
Not In Filters
====================================

spark.sql("select * from df where category not in ('Exercise','Gymnastics')").show()

====================================
Null Filters
====================================
spark.sql("select * from df where product is null").show()

====================================
Max Function
====================================

spark.sql("select max(id) from df ").show()

====================================
Min Funtion
====================================
spark.sql("select min(id) from df ").show()

Task 1 ----

Read dt.txt
select only id,tdate,category from raw dataframe
On top of this dataframe create a tempview
Using spark.sql add an extra column 1 as status
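
A minimal sketch of Task 1, assuming the same dt.txt file and SparkSession setup used in the earlier examples:

// read dt.txt, keep three columns, register a temp view, and add a literal status column via spark.sql
val df = spark.read.format("csv").option("header","true").load("file:///C:/data/dt.txt")
val seldf = df.select("id","tdate","category")
seldf.createOrReplaceTempView("seldf")
val statusdf = spark.sql("select id,tdate,category,1 as status from seldf")
statusdf.show()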

Task 2 --- Sql Practise

Complete till Min and MAX

Task 3 ----

Complete the 'Scala Tutorials for Beginners' playlist (link below)

05-03-2023
https://www.youtube.com/watch?v=LQVDJtfpQU0&list=PLS1QulWo1RIagob5D6kMIAvu7DQC5VTh3

Selecting two columns

drop a column

Filter category='Gymnastics'

Filter category='Gymnastics' and spendby='cash' (Multi Column filter and )

Filter category='Gymnastics' or spendby='cash' (Multi Column filter or )

Filter category='Gymnastics' or 'Exercise'

Filter Product contains 'Gymnastics'

Product is null

Product is Not null
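
Most of these are worked out in the dated solution below; for the ones not covered there (selecting two columns, dropping a column, and the or filter), a short sketch on the same df, assuming the col function is already imported:

// select two columns
val twocols = df.select("id","tdate")
twocols.show()
// drop a column
val dropdf = df.drop("product")
dropdf.show()
// multi-column filter with or
val orfil = df.filter(col("category")==="Gymnastics" || col("spendby")==="cash")
orfil.show()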

11-03-2023

package pack

import org.apache.spark._
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.Row
import org.apache.spark.sql.types._
import org.apache.spark.sql.functions._

object obj {

def main(args:Array[String]):Unit={

System.setProperty("hadoop.home.dir", "C:\\hadoop")
println("================Started1============")
val conf = new SparkConf().setAppName("revision").setMaster("local[*]")
val sc = new SparkContext(conf)
sc.setLogLevel("ERROR")
val spark = SparkSession.builder().getOrCreate()
import spark.implicits._
val df = spark
.read
.format("csv")
.option("header","true")
.load("file:///C:/data/dt.txt")

df.show()
println("==========Filter Gymnastics===========")
val filgym = df.filter( col("category")==="Gymnastics")
filgym.show()
println("==========Filter cat Gymnastics spend cash===========")
val mulcolfilter=df.filter(col("category")==="Gymnastics" &&
col("spendby")==="cash")
mulcolfilter.show()
println("==========ccategory = gymnastics,Exercise===========")
val mulvalfilter=df.filter(col("category") isin ("Gymnastics","Exercise"))
mulvalfilter.show()
println("==========Product gymnastics===========")
val likeop = df.filter(col("product") like "%Gymnastics%")
likeop.show()
println("==========Product is null===========")
println
val nullprod = df.filter(col("product") isNull )
nullprod.show()
println("==========Product is Not null===========")
val nullNOTprod = df.filter(!(col("product") isNull ))
nullNOTprod.show()
}
}

12-03-2023

package pack

import org.apache.spark._
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.Row
import org.apache.spark.sql.types._
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.functions._
import org.apache.spark.sql.functions.udf

object obj {

def main(args:Array[String]):Unit={

System.setProperty("hadoop.home.dir", "C:\\hadoop")
println("================Started1============")
val conf = new SparkConf().setAppName("revision").setMaster("local[*]")
val sc = new SparkContext(conf)
sc.setLogLevel("ERROR")
val spark = SparkSession.builder().getOrCreate()
import spark.implicits._
val df = spark
.read
.format("csv")
.option("header","true")
.load("file:///C:/data/dt.txt")

df.show()

val tdf = df.selectExpr(
"id",
"tdate",
"split(tdate,'-')[2] as year",
"amount",
"category",
"product",
"spendby",
"case when spendby='cash' then 1 else 0 end as status"
)
tdf.show()
}
}

package pack

import org.apache.spark._
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.Row
import org.apache.spark.sql.types._
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.functions._
import org.apache.spark.sql.functions.udf

object obj {
case class schema(
txnno:String,
txndate:String,
custno:String,
amount:String,
category:String,
product:String,
city:String,
state:String,
spendby:String)
def main(args:Array[String]):Unit={
System.setProperty("hadoop.home.dir", "C:\\hadoop")
val colist = List("txnno",
"txndate",
"custno",
"amount",
"category",
"product",
"city",
"state",
"spendby")
println("================Started1============")
val conf = new SparkConf().setAppName("revision").setMaster("local[*]")
val sc = new SparkContext(conf)
sc.setLogLevel("ERROR")
val spark = SparkSession.builder().getOrCreate()
import spark.implicits._
val data = sc.textFile("file:///C:/data/revdata/file1.txt")
data.take(5).foreach(println)
val gymdata= data.filter( x => x.contains("Gymnastics"))
println
println("===Gymdata===")
println
gymdata.take(5).foreach(println)
val mapsplit = gymdata.map( x => x.split(","))
val schemardd = mapsplit.map(x =>
schema(x(0),x(1),x(2),x(3),x(4),x(5),x(6),x(7),x(8)))
val prodfilter = schemardd.filter( x => x.product.contains("Gymnastics"))
println
println("===Gymdata prod===")
println
prodfilter.take(5).foreach(println)
println
println("===schema rdd to dataframe===")
println
val schemadf = prodfilter.toDF().select(colist.map(col): _*)
schemadf.show(5)
val file2 = sc.textFile("file:///C:/data/revdata/file2.txt")
val mapsplit1=file2.map( x => x.split(","))
val rowrdd = mapsplit1.map( x => Row(x(0),x(1),x(2),x(3),x(4),x(5),x(6),x(7),x(8)))
println
println("===row rdd===")
println
println
rowrdd.take(5).foreach(println)
val structschema = StructType(Array(
StructField("txnno",StringType,true),
StructField("txndate",StringType,true),
StructField("custno",StringType,true),
StructField("amount", StringType, true),
StructField("category", StringType, true),
StructField("product", StringType, true),
StructField("city", StringType, true),
StructField("state", StringType, true),
StructField("spendby", StringType, true)
))
println
println("===row df===")
println
println
val rowdf = spark.createDataFrame(rowrdd, structschema).select(colist.map(col): _*)
rowdf.show(5)
val csvdf = spark.read.format("csv").option("header","true")
.load("file:///C:/data/revdata/file3.txt").select(colist.map(col): _*)

println
println("===csv df===")
println

csvdf.show(5)

val jsondf = spark.read.format("json").load("file:///C:/data/revdata/file4.json")


.select(colist.map(col): _*)
println
println("===json df===")
println

jsondf.show(5)

val parquetdf = spark.read.load("file:///C:/data/revdata/file5.parquet")


.select(colist.map(col): _*)
println
println("===parquet df===")
println

parquetdf.show(5)

val xmldf = spark.read.format("xml").option("rowtag","txndata")


.load("file:///C:/data/revdata/file6")
.select(colist.map(col): _*)

println
println("===xmldf===")
println

xmldf.show(5)

val uniondf = schemadf


.union(rowdf)
.union(csvdf)
.union(jsondf)
.union(parquetdf)
.union(xmldf)

println
println("===uniondf===")
println

uniondf.show(5)

val procdf = uniondf.withColumn("txndate",expr("split(txndate,'-')[2]"))


.withColumnRenamed("txndate", "year")
.withColumn("status",expr("case when spendby='cash' then 1 else 0 end"))
.filter(col("txnno")>50000)
println
println("===procdf===")
println

procdf.show(5)

procdf.write.mode("append").partitionBy("category").save("file:///C:/data/finalpdata")

println("===revision complete==")

/*val conf = new SparkConf().setAppName("revision").setMaster("local[]")


val sc = new SparkContext(conf)
sc.setLogLevel("ERROR")
val spark = SparkSession.builder().getOrCreate()
import spark.implicits._

val df = spark
.read
.format("csv")
.option("header","true")
.load("file:///C:/data/dt.txt")

df.show()

val tdf = df.withColumn( "year" , expr("split(tdate,'-')[2]") )


.withColumnRenamed("tdate", "year")
.withColumn("category", expr("substring(category,1,4)"))
.withColumn("status", expr("case when spendby='cash' then 1 else 0
end"))

tdf.show()*/

}
}
package pack

import org.apache.spark._
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.Row
import org.apache.spark.sql.types._
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.functions._
import org.apache.spark.sql.functions.udf

object obj {

case class schema(


txnno:String,
txndate:String,
custno:String,
amount:String,
category:String,
product:String,
city:String,
state:String,
spendby:String)

def main(args:Array[String]):Unit={
System.setProperty("hadoop.home.dir", "C:\\hadoop")
val colist = List("txnno",
"txndate",
"custno",
"amount",
"category",
"product",
"city",
"state",
"spendby")

println("================Started1============")
val conf = new SparkConf().setAppName("revision").setMaster("local[*]")
val sc = new SparkContext(conf)
sc.setLogLevel("ERROR")
val spark = SparkSession.builder().getOrCreate()
import spark.implicits._

val data = sc.textFile("file:///C:/data/revdata/file1.txt")


data.take(5).foreach(println)
val gymdata= data.filter( x => x.contains("Gymnastics"))
println
println("===Gymdata===")
println
gymdata.take(5).foreach(println)
val mapsplit = gymdata.map( x => x.split(","))
val schemardd = mapsplit.map(x =>
schema(x(0),x(1),x(2),x(3),x(4),x(5),x(6),x(7),x(8)))

val prodfilter = schemardd.filter( x => x.product.contains("Gymnastics"))


println
println("===Gymdata prod===")
println
prodfilter.take(5).foreach(println)
println
println("===schema rdd to dataframe===")
println
val schemadf = prodfilter.toDF().select(colist.map(col): _*)
schemadf.show(5)

val file2 = sc.textFile("file:///C:/data/revdata/file2.txt")


val mapsplit1=file2.map( x => x.split(","))
val rowrdd = mapsplit1.map( x => Row(x(0),x(1),x(2),x(3),x(4),x(5),x(6),x(7),x(8)))
println
println("===row rdd===")
println
println
rowrdd.take(5).foreach(println)
val structschema = StructType(Array(
StructField("txnno",StringType,true),
StructField("txndate",StringType,true),
StructField("custno",StringType,true),
StructField("amount", StringType, true),
StructField("category", StringType, true),
StructField("product", StringType, true),
StructField("city", StringType, true),
StructField("state", StringType, true),
StructField("spendby", StringType, true)
))
println
println("===row df===")
println
println
val rowdf = spark.createDataFrame(rowrdd, structschema).select(colist.map(col): _*)
rowdf.show(5)
val csvdf = spark.read.format("csv").option("header","true")
.load("file:///C:/data/revdata/file3.txt").select(colist.map(col): _*)
println
println("===csv df===")
println
csvdf.show(5)
val jsondf = spark.read.format("json").load("file:///C:/data/revdata/file4.json")
.select(colist.map(col): _*)
println
println("===json df===")
println

jsondf.show(5)

val parquetdf = spark.read.load("file:///C:/data/revdata/file5.parquet")


.select(colist.map(col): _*)
println
println("===parquet df===")
println
parquetdf.show(5)
val xmldf = spark.read.format("xml").option("rowtag","txndata")
.load("file:///C:/data/revdata/file6")
.select(colist.map(col): _*)

println
println("===xmldf===")
println

xmldf.show(5)

val uniondf = schemadf


.union(rowdf)
.union(csvdf)
.union(jsondf)
.union(parquetdf)
.union(xmldf)

println
println("===uniondf===")
println

uniondf.show(5)

val procdf = uniondf.withColumn("txndate",expr("split(txndate,'-')[2]"))


.withColumnRenamed("txndate", "year")
.withColumn("status",expr("case when spendby='cash' then 1 else 0 end"))
.filter(col("txnno")>50000)

println
println("===procdf===")
println

procdf.show(5)
procdf.write.mode("append").partitionBy("category").save("file:///C:/data/finalpdata")

println("===revision complete==")

/*val conf = new SparkConf().setAppName("revision").setMaster("local[]")


val sc = new SparkContext(conf)
sc.setLogLevel("ERROR")
val spark = SparkSession.builder().getOrCreate()
import spark.implicits._

val df = spark
.read
.format("csv")
.option("header","true")
.load("file:///C:/data/dt.txt")

df.show()

val tdf = df.withColumn( "year" , expr("split(tdate,'-')[2]") )


.withColumnRenamed("tdate", "year")
.withColumn("category", expr("substring(category,1,4)"))
.withColumn("status", expr("case when spendby='cash' then 1 else 0
end"))

tdf.show()*/
}
}

18-03-2023

package pack

import org.apache.spark._
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.Row
import org.apache.spark.sql.types._
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.functions._
import org.apache.spark.sql.functions.udf

object obj {
def main(args:Array[String]):Unit={
System.setProperty("hadoop.home.dir", "C:\\hadoop")
println("================Started1============")
val conf = new SparkConf().setAppName("revision").setMaster("local[*]")
val sc = new SparkContext(conf)
sc.setLogLevel("ERROR")
val spark = SparkSession.builder().getOrCreate()
import spark.implicits._
val df = spark
.read
.format("csv")
.option("header","true")
.load("file:///C:/data/aggdata.csv")

df.show()
val aggdf = df.groupBy("name")
.agg(
sum("amt")
.cast(IntegerType)
.as("total")
)

.orderBy("name")

aggdf.show()

/*val conf = new SparkConf().setAppName("revision").setMaster("local[]")


val sc = new SparkContext(conf)
sc.setLogLevel("ERROR")
val spark = SparkSession.builder().getOrCreate()
import spark.implicits._

val df = spark
.read
.format("csv")
.option("header","true")
.load("file:///C:/data/dt.txt")

df.show()

val tdf = df.withColumn( "year" , expr("split(tdate,'-')[2]") )


.withColumnRenamed("tdate", "year")
.withColumn("category", expr("substring(category,1,4)"))
.withColumn("status", expr("case when spendby='cash' then 1 else 0 end"))

tdf.show()*/
}
}

package pack
import org.apache.spark._
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.Row
import org.apache.spark.sql.types._
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.functions._
import org.apache.spark.sql.functions.udf

object obj {
def main(args:Array[String]):Unit={
System.setProperty("hadoop.home.dir", "C:\\hadoop")
println("================Started1============")
val conf = new SparkConf().setAppName("revision").setMaster("local[*]")
val sc = new SparkContext(conf)
sc.setLogLevel("ERROR")
val spark = SparkSession.builder().getOrCreate()
import spark.implicits._
val df1 = spark.read.format("csv")
.option("header","true")
.load("file:///C:/data/join1.csv")
df1.show()
val df2 = spark.read.format("csv")
.option("header","true")
.load("file:///C:/data/join2.csv")
df2.show()
println
println("=====Inner Join=====")
println
val injoindf = df1.join( df2 , Seq("txnno"), "inner")
injoindf.show()
println
println("=====left Join=====")
println
val leftjoin = df1.join( df2 , Seq("txnno"), "left")
leftjoin.show()
println
println("=====right Join=====")
println
val rightjoin = df1.join( df2 , Seq("txnno"), "right")
rightjoin.show()
println
println("=====Full Join=====")
println
val fulljoin = df1.join(df2 , Seq("txnno"),"full")
.orderBy("txnno")
fulljoin.show()
}
}
If the join columns have different names

val joindf = df1.join(df2, df1("txnno") === df2("tno"), "inner")
.drop("tno")

joindf.show()
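A self-contained sketch of the same idea on two tiny in-memory dataframes (hypothetical key columns txnno and tno, with import spark.implicits._ assumed to be in scope); dropping the duplicate key keeps a single txnno column in the result:

// two small dataframes whose join keys have different names
val leftdf = Seq(("1","Gymnastics"),("2","Exercise")).toDF("txnno","category")
val rightdf = Seq(("1","cash"),("3","credit")).toDF("tno","spendby")

// join on differently named columns, then drop the duplicate key
val joined = leftdf.join(rightdf, leftdf("txnno") === rightdf("tno"), "inner")
.drop("tno")

joined.show()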

19-03-2023

package pack

import org.apache.spark._
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.Row
import org.apache.spark.sql.types._
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.functions._
import org.apache.spark.sql.functions.udf

object obj {
def main(args:Array[String]):Unit={
System.setProperty("hadoop.home.dir", "C:\\hadoop")
println("================Started1============")
val conf = new SparkConf().setAppName("revision").setMaster("local[*]")
val sc = new SparkContext(conf)
sc.setLogLevel("ERROR")
val spark = SparkSession.builder().getOrCreate()
import spark.implicits._

val df1 = spark.read.format("csv")


.option("header","true")
.load("file:///C:/data/join1.csv")
df1.show()
val df2 = spark.read.format("csv")
.option("header","true")
.load("file:///C:/data/join2.csv")
df2.show()
println
println("=====Inner Join=====")
println
val injoindf = df1.join( df2 , Seq("txnno"), "inner")
injoindf.show()
println
println("=====left Join=====")
println
val leftjoin = df1.join( df2 , Seq("txnno"), "left")
leftjoin.show()
println
println("=====right Join=====")
println
val rightjoin = df1.join( df2 , Seq("txnno"), "right")
rightjoin.show()
println
println("=====Full Join=====")
println
val fulljoin = df1.join(df2 , Seq("txnno"),"full")
.orderBy("txnno")
fulljoin.show()
println
println("====Left anti=====")
println
val antijoin = df1.join(df2,Seq("txnno"),"left_anti")
antijoin.show()
println
println("====cross anti=====")
println
val crossjoin = df1.crossJoin(df2)
crossjoin.show()
}
}

25-03-2023

package pack

import org.apache.spark._
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.Row
import org.apache.spark.sql.types._
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.functions._
import org.apache.spark.sql.functions.udf

object obj {
def main(args:Array[String]):Unit={
System.setProperty("hadoop.home.dir", "C:\\hadoop")
println("================Started1============")
val conf = new SparkConf().setAppName("revision").setMaster("local[*]")
val sc = new SparkContext(conf)
sc.setLogLevel("ERROR")
val spark = SparkSession.builder().getOrCreate()
import spark.implicits._
val df = spark
.read
.format("json")
.option("multiline","true")
.load("file:///C:/data/jv.json")
df.show()
df.printSchema()
val flattendf = df.select(
"address.permanentAddress",
"address.temporaryAddress",
"org",
"trainer",
"workAddress",
"years"

)
flattendf.show()
flattendf.printSchema()
val flattendf1 = df.withColumn("permanentAddress",
expr("address.permanentAddress"))
.withColumn("temporaryAddress",expr("address.temporaryAddress"))
.drop("address")
flattendf1.show()
flattendf1.printSchema()
}
}

package pack

import org.apache.spark._
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.Row
import org.apache.spark.sql.types._
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.functions._
import org.apache.spark.sql.functions.udf
object obj {
def main(args:Array[String]):Unit={
System.setProperty("hadoop.home.dir", "C:\\hadoop")
println("================Started1============")
val conf = new SparkConf().setAppName("revision").setMaster("local[*]")
val sc = new SparkContext(conf)
sc.setLogLevel("ERROR")
val spark = SparkSession.builder().getOrCreate()
import spark.implicits._
val df = spark
.read
.format("json")
.option("multiline","true")
.load("file:///C:/complexjson/donut1.json")
df.show()
df.printSchema()
val flattendf = df.select(

"id",
"image.height",
"image.url",
"image.width",
"name",
"type"
)
flattendf.show()
flattendf.printSchema()

val procdf = flattendf.withColumn("store", expr("'KSBakers'"))


procdf.show()
procdf.printSchema()
val finalcomplexdf = procdf.select(
col("id"),
col("type"),
col("name"),
struct(
col("store"),
col("url"),
col("width"),
col("height")
).as("allfields")

finalcomplexdf.show()
finalcomplexdf.printSchema()
}
}

26-03-2023

val complexdf = flattendf.withColumn("location",
expr("struct(permanentAddress,temporaryAddress,workAddress)"))
.drop("permanentAddress","temporaryAddress","workAddress")

complexdf.show()
complexdf.printSchema()
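A runnable miniature of the same re-nesting on an in-memory dataframe (hypothetical columns name, permanentAddress and temporaryAddress; import spark.implicits._ assumed to be in scope):

// build a small flat dataframe, then fold two columns back into a struct
val flat = Seq(("ravi","hyd","blr"),("raj","pune","chn"))
.toDF("name","permanentAddress","temporaryAddress")

val nested = flat
.withColumn("location", expr("struct(permanentAddress,temporaryAddress)"))
.drop("permanentAddress","temporaryAddress")

nested.show()
nested.printSchema() // location becomes struct<permanentAddress:string,temporaryAddress:string>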
package pack

import org.apache.spark._
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.Row
import org.apache.spark.sql.types._
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.functions._
import org.apache.spark.sql.functions.udf

object obj {
def main(args:Array[String]):Unit={
System.setProperty("hadoop.home.dir", "C:\\hadoop")
println("================Started1============")
val conf = new SparkConf().setAppName("revision").setMaster("local[*]")
val sc = new SparkContext(conf)
sc.setLogLevel("ERROR")
val spark = SparkSession.builder().getOrCreate()
import spark.implicits._

val df = spark
.read
.format("json")
.option("multiline","true")
.load("file:///C:/data/complexjson/randomeuser5.json")
df.show()
df.printSchema()
val flat1 = df.withColumn("results", expr("explode(results)"))
flat1.show
flat1.printSchema()
val finalflatten= flat1.select(

"nationality",
"results.user.BSN",
"results.user.cell",
"results.user.dob",
"results.user.email",
"results.user.gender",
"results.user.location.city",
"results.user.location.state",
"results.user.location.street",
"results.user.location.zip",
"results.user.md5",
"results.user.name.first",
"results.user.name.last",
"results.user.name.title",
"results.user.password",
"results.user.phone",
"results.user.picture.large",
"results.user.picture.medium",
"results.user.picture.thumbnail",
"results.user.registered",
"results.user.salt",
"results.user.sha1",
"results.user.sha256",
"results.user.username",
"seed",
"version"
)
finalflatten.show()
finalflatten.printSchema()
}
}

package pack

import org.apache.spark._
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.Row
import org.apache.spark.sql.types._
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.functions._
import org.apache.spark.sql.functions.udf
object obj {

def main(args:Array[String]):Unit={
System.setProperty("hadoop.home.dir", "C:\\hadoop")
println("================Started1============")
val conf = new SparkConf().setAppName("revision").setMaster("local[*]")
val sc = new SparkContext(conf)
sc.setLogLevel("ERROR")
val spark = SparkSession.builder().getOrCreate()
import spark.implicits._

val df = spark
.read
.format("json")
.option("multiline","true")
.load("file:///C:/data/jv.json")
df.show()
df.printSchema()
val flat1 = df.withColumn("Students", expr("explode(Students)"))
flat1.show
flat1.printSchema()

}
}
Task --- URL data Read - dataframe

URL Code

package pack

import org.apache.spark._
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.Row
import org.apache.spark.sql.types._
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.functions._
import org.apache.spark.sql.functions.udf
import scala.io.Source

object obj {
def main(args:Array[String]):Unit={
System.setProperty("hadoop.home.dir", "C:\\hadoop")
println("================Started1============")
val conf = new SparkConf().setAppName("revision").setMaster("local[*]")
val sc = new SparkContext(conf)
sc.setLogLevel("ERROR")
val spark = SparkSession.builder().getOrCreate()
import spark.implicits._

// ==== SCALA URL Consumption

val html = Source.fromURL("https://randomuser.me/api/0.8/?results=10")


val urldata = html.mkString
// ==== String - RDD

val rdd = sc.parallelize(List(urldata))

// ==== JSON RDD - df

val df = spark.read.json(rdd)
df.show()
df.printSchema()
val flat1 = df.withColumn("results", expr("explode(results)"))
flat1.show
flat1.printSchema()
val finalflatten= flat1.select(

"nationality",
"results.user.cell",
"results.user.dob",
"results.user.email",
"results.user.gender",
"results.user.location.city",
"results.user.location.state",
"results.user.location.street",
"results.user.location.zip",
"results.user.md5",
"results.user.name.first",
"results.user.name.last",
"results.user.name.title",
"results.user.password",
"results.user.phone",
"results.user.picture.large",
"results.user.picture.medium",
"results.user.picture.thumbnail",
"results.user.registered",
"results.user.salt",
"results.user.sha1",
"results.user.sha256",
"results.user.username",
"seed",
"version"
)
finalflatten.show()
finalflatten.printSchema()
}
}

01-04-2023

1) Create an Eclipse/IntelliJ project / Lab
2) Read projectsample.avro as avro and show it (add the spark-avro jar)
3) Read url data and show it
https://randomuser.me/api/0.8/?results=500
4) Flatten the complex url dataframe completely
5) Find a way to remove numerals from username in the flattened dataframe
6) Do a left join

avrodataframe --- numerals-removed flattened dataframe
on the username column

7) From the joined dataframe, create two more dataframes (see the sketch after this list)

availablecustomer -- joinedf.filter("nationality is not null")
unavailablecustomer -- joinedf.filter("nationality is null")
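A minimal sketch for steps 5 to 7, assuming import org.apache.spark.sql.functions._ is in scope and that the avro dataframe is called avrodf and the flattened url dataframe is called flatdf, both having a username column (these names are placeholders; the full solution appears in the 16-04-2023 code later in these notes):

// step 5: strip the digits from username
val cleaned = flatdf.withColumn("username", regexp_replace(col("username"), "[0-9]", ""))

// step 6: left join the avro dataframe with the cleaned url dataframe on username
val joinedf = avrodf.join(cleaned, Seq("username"), "left")

// step 7: split the joined dataframe on nationality
val availablecustomer = joinedf.filter("nationality is not null")
val unavailablecustomer = joinedf.filter("nationality is null")

availablecustomer.show()
unavailablecustomer.show()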

02-04-2023
package pack

import org.apache.spark._
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.Row
import org.apache.spark.sql.types._
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.functions._
import org.apache.spark.sql.functions.udf
import scala.io.Source
import org.apache.spark.sql._

object obj {
def main(args:Array[String]):Unit={
System.setProperty("hadoop.home.dir", "C:\\hadoop")
println("================Started1============")
val conf = new SparkConf().setAppName("revision").setMaster("local[*]")
val sc = new SparkContext(conf)
sc.setLogLevel("ERROR")
val spark = SparkSession.builder().getOrCreate()
import spark.implicits._
val df = spark.read
.format("json")
.option("multiline","true")
.load("file:///C:/complexjson/actorsj1.json")
df.show()
df.printSchema()
val flattendf = df.withColumn("Actors",expr("explode(Actors)"))
flattendf.show()
flattendf.printSchema()
val finalflatten = flattendf.select(
"Actors.fields.*",
"country",
"version"
)
finalflatten.show()
finalflatten.printSchema()
val complexdf= finalflatten.groupBy("country","version")
.agg(
collect_list(
struct(
struct(
col("Birthdate"),
col("`Born At`"),
col("age"),
col("hasChildren"),
col("hasGreyHair"),
col("name"),
col("photo"),
col("weight"),
col("wife")
).as("fields")
)
).as("Actors")

)
.select("Actors","country","version")

complexdf.show()
complexdf.printSchema()

}
}

package pack

import org.apache.spark._
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.Row
import org.apache.spark.sql.types._
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.functions._
import org.apache.spark.sql.functions.udf
import scala.io.Source
import org.apache.spark.sql._

object obj {

def main(args:Array[String]):Unit={

System.setProperty("hadoop.home.dir", "C:\\hadoop")
println("================Started1============")
val conf = new SparkConf().setAppName("revision").setMaster("local[*]")
val sc = new SparkContext(conf)
sc.setLogLevel("ERROR")
val spark = SparkSession.builder().getOrCreate()
import spark.implicits._
val df = spark.read
.format("json")
.option("multiline","true")
.load("file:///C:/data/jv.json")
df.show()
df.printSchema()
val flattendf = df.withColumn("Students", expr("explode(Students)"))
flattendf.show()
flattendf.printSchema()

val flattendf1 = flattendf.select(

"Students.user.*",
"org",
"trainer",
"years"
)
flattendf1.show()
flattendf1.printSchema()
val finaldf = flattendf1.withColumn("pets",expr("explode(pets)"))
finaldf.show()
finaldf.printSchema
val complexdf1 = finaldf.groupBy("location", "name" ,"org" , "trainer", "years")
.agg(
collect_list(col("pets")).as("pets")
)
complexdf1.show()
complexdf1.printSchema()
val finalcomplex = complexdf1.groupBy("org","trainer","years")
.agg(

collect_list(
struct(
struct(
col("location"),
col("name"),
col("pets")
).as("user")
)
).as("Students")
)
.select("Students","org","trainer","years")
finalcomplex.show()
finalcomplex.printSchema()

}
}

Profile Building ---- Today


Profile Review ---- Tomorrow
Upload the resume only once
Profile Building Tips

When you come to the review -- put your current status at the top
Rephrase the sentences from the samples
Bold whenever required
Put only the points which you really feel comfortable with
Definitely mention AWS S3 and EMR
Do not mention versions for any of the tools
Put your experience in descending order (recent experience at the top)
Mention more points on Spark (from the samples, or from your knowledge)
Better to use the Cambria font
Always mention more points

IT folks

ETL
Testing
Supports
BI
Analyst
More than 12 Years ----- In the Project Level -- 4 Years relevant
6-10 ----- in the Project Level --- 3 Years Relevant
3-6 ----- in the Project Level --- 3 Years Relevant
2 Years ----- in the Project Level --- 2 Years Relevant
1.9 ----- in the Project Level --- Almost 2 Relevant
Remaining is your own experience

Non IT Folks

Don't show any non-IT experience


Do not change your Company Name

More than 12 Years ----- In the Project Level -- 4 Years relevant


6-10 ----- in the Project Level --- 3 Years Relevant
3-6 ----- in the Project Level --- 3 Years Relevant
2 Years ----- in the Project Level --- 2 Years Relevant
1.9 ----- in the Project Level --- Almost 2 Relevant

Remaining Exp ---- ETL Testing

https://mindmajix.com/etl-testing-sample-resumes

Gap Folks

We are going to create an experience which does not exist


I will give a company to put in the resume. Try for jobs, but do not continue to use the same
company name for more than 2 months

Cyphel Infosolutions
www.Cyphelinfo.com

Prosaic technologies
www.prosaica.com

Client Name--

Bluedart
bharat matrimony
Hershey's
cars24

For documentation --- +91 7795663545 - Karthik

Indian Calls

Change Email id and Phone Number


New Customer
Profile Building in Naukri -- LinkedIn

Naukri --- Youtube Video -- Indian


LinkedIn ---- Remove the skill set description - have your company names

Overseas

US Folks --- Will be your Employers tomorrow Evening


08-04-2023

Install AWSCLI
Download and install GIT BASH

aws configure

Access Key- AKIA2TITMOYYWSY74MSN


Secret Key- v3LKZcj6a5jqSW/uYgBJ2ycgLSaohfDX+E9kZwsO
Region - ap-south-1
outputformat- json

aws s3 ls

Git bash /Lab folks / Cloudera

aws s3 mb s3://<UNIQUE>zeyo35/
aws s3 ls s3://
echo zeyobron>zeyofile
aws s3 cp zeyofile s3://<UNIQUE>zeyo35/
aws s3 ls s3://<UNIQUE>zeyo35/
aws s3 rm s3://<UNIQUE>zeyo35/zeyofile
aws s3 rb s3://<UNIQUE>zeyo35

Windows CMD

aws s3 mb s3://<UNIQUE>zeyo35/
aws s3 ls s3://
notepad.exe zeyofile ---(type zeyobron after file opens)
aws s3 cp zeyofile s3://<UNIQUE>zeyo35/
aws s3 ls s3://<UNIQUE>zeyo35/
aws s3 rm s3://<UNIQUE>zeyo35/zeyofile
aws s3 rb s3://<UNIQUE>zeyo35

**********************
+919492326052 -- Bhaskar

Genuine PF UAN Companies

Form16 Orginal (Check in Website)


Live Companies (Physical Locations Existing)

PF Old Dated
====================================

1. NEWFOUND INFO TECH PRIVATE LIMITED


www.infotechedu.com
ESTD:2010
Location:Bangalore

2. LONGTAIL WEB SERVICES PRIVATE LIMITED


www.webshareservices.net
ESTD:2010
Location:Bangalore

3. TECHNOLOGIC SOLUTIONS PRIVATE LIMITED - Pune Based


www.technosysec.com
ESTD:2001
Location:Pune

4. MASTERKUBE SOFTWARE SOLUTIONS AND SERVICES PRIVATE - Chennai Based


www.webmasterdev.com
ESTD:2010
Location:Chennai

5. Antrix Technology India Private Limited (29-10-2010)


antenix.com
Bangalore Based
DOI: 29/10/2010

6. MICROHAZE TECHNOLOGIES PRIVATE LIMITED


microhaze.com
Bangalore Based
DOI: 09/10/2018

7. VWEB Hr Solutions Private Limited


www.vwebhrsolutions.in
DOI: 18/09/2017
Pune Based

09-04-2023

Staging Task Cloudera/Lab

======================
Cloudera staging Exports
======================
mysql -uroot -pcloudera
create database if not exists exp;
use exp;
drop table if exists ttab;
drop table if exists st_ttab;
create table ttab(id int,name varchar(100),amount int);
create table st_ttab(id int,name varchar(100),amount int);
quit

cd
echo 1,zeyo,40>zfile
echo 2,ravi,70>>zfile
echo 3,rani,70>>zfile
hadoop fs -mkdir /user/cloudera/exdir
hadoop fs -put zfile /user/cloudera/exdir

sqoop export --connect jdbc:mysql://localhost/exp --username root --password cloudera --table ttab
--staging-table st_ttab --m 1 --export-dir /user/cloudera/exdir

mysql -uroot -pcloudera


use exp;
select * from ttab;
select * from st_ttab;
quit

======================
Lab staging Exports
======================

mysql --host=zeyodb.cveqgaujeiwd.ap-south-1.rds.amazonaws.com --user=root --password=Aditya908

create database if not exists itv005669;


use itv005669;
drop table if exists ttab;
drop table if exists st_ttab;
create table ttab(id int,name varchar(100),amount int);
create table st_ttab(id int,name varchar(100),amount int);
quit

cd
echo 1,zeyo,40>zfile
echo 2,ravi,70>>zfile
echo 3,rani,70>>zfile
hadoop fs -mkdir /user/itv005669/exdir
hadoop fs -put zfile /user/itv005669/exdir
sqoop export --connect
jdbc:mysql://zeyodb.cveqgaujeiwd.ap-south-1.rds.amazonaws.com/itv005669 --username root --
password Aditya908 --table ttab --staging-table st_ttab --m 1 --export-dir /user/itv005669/exdir

mysql --host=zeyodb.cveqgaujeiwd.ap-south-1.rds.amazonaws.com --user=root --password=Aditya908
use itv005669;
select * from ttab;
select * from st_ttab;
quit

=============
AVRO Task Cloudera
=============

mysql -uroot -pcloudera

drop database if exists map;


create database if not exists map;
use map;
drop table if exists mtab;
create table mtab(id int,name varchar(100),amount int);
insert into mtab values(1,'zeyo',40);
insert into mtab values(2,'vasu',50);
insert into mtab values(3,'rani',70);
select * from mtab;
quit

sqoop import --connect jdbc:mysql://localhost/map --username root --password cloudera --table mtab --m 1 --delete-target-dir --target-dir /user/cloudera/adir --as-avrodatafile

hadoop fs -ls /user/cloudera/adir


hadoop fs -cat /user/cloudera/adir/*

click ctrl+c in keyboard

=============
AVRO Task Lab
=============

mysql --host=zeyodb.cveqgaujeiwd.ap-south-1.rds.amazonaws.com --user=root --password=Aditya908
create database if not exists itv005669;
use itv005669;
drop table mtab;
create table mtab(id int,name varchar(100),amount int);
insert into mtab values(1,'zeyo',40);
insert into mtab values(2,'vasu',50);
insert into mtab values(3,'rani',70);
select * from mtab;
quit

sqoop import -Dmapreduce.job.user.classpath.first=true --connect jdbc:mysql://zeyodb.cveqgaujeiwd.ap-south-1.rds.amazonaws.com/itv005669 --username root --password Aditya908 --table mtab --m 1 --delete-target-dir --target-dir /user/itv005669/adir --as-avrodatafile

hadoop fs -ls /user/itv005669/adir


hadoop fs -cat /user/itv005669/adir/*

click ctrl+c in keyboard

====================
Optional Task
====================

Cloudera

type hive and go inside it

drop table if exists avtab;


create table avtab(id int,name string) stored as avro location '/user/cloudera/adir';
select * from avtab;

====================
Optional Task
====================

Lab

type hive and go inside it


set hive.metastore.warehouse.dir=/user/itv005669/warehouse;
drop table if exists avtab2;
create table avtab2(id int,name string) stored as avro location '/user/itv005669/adir';
select * from avtab2;

Task Video
https://youtu.be/E0lhq0W_z7o

Task Solution

Lab Folks There Is an issue in command Updated Below

Staging Task Cloudera/Lab

======================
Cloudera staging Exports
======================

mysql -uroot -pcloudera


create database if not exists exp;
use exp;
drop table if exists ttab;
drop table if exists st_ttab;
create table ttab(id int,name varchar(100),amount int);
create table st_ttab(id int,name varchar(100),amount int);
quit

cd
echo 1,zeyo,40>zfile
echo 2,ravi,70>>zfile
echo 3,rani,70>>zfile
hadoop fs -mkdir /user/cloudera/exdir
hadoop fs -put zfile /user/cloudera/exdir

sqoop export --connect jdbc:mysql://localhost/exp --username root --password cloudera --table ttab
--staging-table st_ttab --m 1 --export-dir /user/cloudera/exdir

mysql -uroot -pcloudera


use exp;
select * from ttab;
select * from st_ttab;
quit

======================
Lab staging Exports
======================

mysql --host=zeyodb.cveqgaujeiwd.ap-south-1.rds.amazonaws.com --user=root --password=Aditya908

create database if not exists itv005669;


use itv005669;
drop table if exists ttab;
drop table if exists st_ttab;
create table ttab(id int,name varchar(100),amount int);
create table st_ttab(id int,name varchar(100),amount int);
quit

cd
echo 1,zeyo,40>zfile
echo 2,ravi,70>>zfile
echo 3,rani,70>>zfile
hadoop fs -mkdir /user/itv005669/exdir
hadoop fs -put zfile /user/itv005669/exdir

sqoop export --connect jdbc:mysql://zeyodb.cveqgaujeiwd.ap-south-1.rds.amazonaws.com/itv005669 --username root --password Aditya908 --table ttab --staging-table st_ttab --m 1 --export-dir /user/itv005669/exdir

mysql --host=zeyodb.cveqgaujeiwd.ap-south-1.rds.amazonaws.com --user=root --password=Aditya908
use itv005669;
select * from ttab;
select * from st_ttab;
quit

=============
AVRO Task Cloudera
=============

mysql -uroot -pcloudera

drop database if exists map;


create database if not exists map;
use map;
drop table if exists mtab;
create table mtab(id int,name varchar(100),amount int);
insert into mtab values(1,'zeyo',40);
insert into mtab values(2,'vasu',50);
insert into mtab values(3,'rani',70);
select * from mtab;
quit

sqoop import --connect jdbc:mysql://localhost/map --username root --password cloudera --table mtab --m 1 --delete-target-dir --target-dir /user/cloudera/adir --as-avrodatafile
hadoop fs -ls /user/cloudera/adir
hadoop fs -cat /user/cloudera/adir/*

click ctrl+c in keyboard

=============
AVRO Task Lab
=============

mysql --host=zeyodb.cveqgaujeiwd.ap-south-1.rds.amazonaws.com --user=root --password=Aditya908
create database if not exists itv005669;
use itv005669;
drop table mtab;
create table mtab(id int,name varchar(100),amount int);
insert into mtab values(1,'zeyo',40);
insert into mtab values(2,'vasu',50);
insert into mtab values(3,'rani',70);
select * from mtab;
quit

sqoop import -Dmapreduce.job.user.classpath.first=true --connect jdbc:mysql://zeyodb.cveqgaujeiwd.ap-south-1.rds.amazonaws.com/itv005669 --username root --password Aditya908 --table mtab --m 1 --delete-target-dir --target-dir /user/itv005669/adir --as-avrodatafile

hadoop fs -ls /user/itv005669/adir


hadoop fs -cat /user/itv005669/adir/*

click ctrl+c in keyboard

====================
Optional Task
====================

Cloudera

type hive and go inside it

drop table if exists avtab;


create table avtab(id int,name string) stored as avro location '/user/cloudera/adir';
select * from avtab;

====================
Optional Task
====================

Lab

type hive and go inside it


set hive.metastore.warehouse.dir=/user/itv005669/warehouse;
drop table if exists avtab2;
create table avtab2(id int,name string) stored as avro location '/user/itv005669/adir';
select * from avtab2;

Important points to remember

5 Areas --

1 -- We launch our own clusters and develop the code. We complete the development, or else we
terminate by end of day after copying the required data to S3.

2 -- In our project we automated the whole EMR production deployment with the EMR command
runner, which consists of step executions.

3 -- We run a 25-node r-series cluster in our project for production deployment.

4 -- We schedule the EMR command-runner script through an EC2 scheduler.


We configured NiFi as an orchestrator and scheduler (ExecuteProcess processor).

5 -- Once we develop the code, we commit it to Git. We have a DevOps team that developed the CI/CD
script; we open Jenkins and build the jar for the project.

Sync and download task (only one)

cd
mkdir zfl
touch zfl/file1
touch zfl/file2
aws s3 sync zfl/ s3://zeyoss36/sssdir/
rm -rf zfl/*
aws s3 sync s3://zeyoss36/sssdir/ zfl/

15-04-2023
Steps ------>

====================
Code Development
====================
Requirement Gathering
Create my cluster
Complete Code Development
Take the Code

====================
Jar Development/Deployment
====================
Create a project in Eclipse/IntelliJ
Put the Code
Generate the jar
Copy that Jar to aws s3

====================
EMR Deployment
====================
Create a step execution to Run Jar with Spark Application

====================
Export Command Runner Script
====================

aws emr create-cluster --applications Name=Hadoop Name=Spark --ec2-attributes


'{"InstanceProfile":"EMR_EC2_DefaultRole","SubnetId":"subnet-
073c594dc96a7ab69","EmrManagedSlaveSecurityGroup":"sg-
05fcf01ba44e60b62","EmrManagedMasterSecurityGroup":"sg-052cd51216c51246f"}' --release-label
emr-5.36.0 --log-uri 's3n://aws-logs-728574490161-ap-south-1/elasticmapreduce/' --steps '[{"Args":
["spark-submit","--deploy-mode","client","--master","yarn","--deploy-mode","client","--
packages","org.apache.spark:spark-avro_2.11:2.4.7","--class","pack.obj","s3://azeyodev/
SparkProjectDep-0.0.1-
SNAPSHOT.jar"],"Type":"CUSTOM_JAR","ActionOnFailure":"CONTINUE","Jar":"command-
runner.jar","Properties":"","Name":"Spark application"}]' --instance-groups
'[{"InstanceCount":1,"EbsConfiguration":{"EbsBlockDeviceConfigs":[{"VolumeSpecification":
{"SizeInGB":32,"VolumeType":"gp2"},"VolumesPerInstance":2}]},"InstanceGroupType":"MASTER","In
stanceType":"m5.xlarge","Name":"Master Instance Group"}]' --configurations
'[{"Classification":"spark","Properties":{}}]' --auto-terminate --service-role EMR_DefaultRole --name
'Rajinikanth' --scale-down-behavior TERMINATE_AT_TASK_COMPLETION --region ap-south-1

16-04-2023

============================
Step 1 -- Open Gitbash/Cmd/Lab
============================

aws configure
Accesskey --- AKIA2TITMOYY4IOOJGBL
Secretkey --- a8ZhgE/za3KZPD+r9Mj+XEx78sq2LNwUtO6gPKxh
RegionName --- ap-south-1
outputformat --- json
aws s3 ls

============================
Step 2 ---- replace URNAME with some Unique name
============================

aws emr create-cluster --applications Name=Hadoop Name=Spark --ec2-attributes


'{"InstanceProfile":"EMR_EC2_DefaultRole","SubnetId":"subnet-
093afefd7e0f2f7db","EmrManagedSlaveSecurityGroup":"sg-
05fcf01ba44e60b62","EmrManagedMasterSecurityGroup":"sg-052cd51216c51246f"}' --release-label
emr-5.36.0 --log-uri 's3n://aws-logs-728574490161-ap-south-1/elasticmapreduce/' --steps '[{"Args":
["spark-submit","--deploy-mode","client","--master","yarn","--packages","org.apache.spark:spark-
avro_2.11:2.4.7","--class","pack.obj","s3://azeyodev/SparkProjectDep-0.0.1-SNAPSHOT.jar","s3://
azeyodev/
config.json","<URNAME>dir"],"Type":"CUSTOM_JAR","ActionOnFailure":"CONTINUE","Jar":"comma
nd-runner.jar","Properties":"","Name":"Spark application"}]' --instance-groups
'[{"InstanceCount":1,"EbsConfiguration":{"EbsBlockDeviceConfigs":[{"VolumeSpecification":
{"SizeInGB":32,"VolumeType":"gp2"},"VolumesPerInstance":2}]},"InstanceGroupType":"MASTER","In
stanceType":"r3.xlarge","Name":"Master Instance Group"}]' --configurations
'[{"Classification":"spark","Properties":{}}]' --auto-terminate --service-role EMR_DefaultRole --name
'<URNAME>Deployment' --scale-down-behavior TERMINATE_AT_TASK_COMPLETION --region ap-
south-1

=============
Step 3 ---- Replace your cluster id with below id wait for terminating state
=============

=== a cluster id gets generated

aws emr describe-cluster --cluster-id j-1K48XXXXXXHCB | grep 'State'

==================
Validate the Data you should see your folder name
==================

aws s3 ls s3://azeyodev/dest/availablecustomers/
aws s3 ls s3://azeyodev/dest/notavailablecustomers/

Develop the Code


Code --- Jar
Jar --- Step Execution
Step Execution -- Command Runner
Schedule the command Runner

package pack

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql.Row
import org.apache.spark.sql._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import scala.io._
import org.json4s._
import java.io.InputStream

object obj {
def main(args:Array[String]):Unit={
val conf = new SparkConf().setAppName("ES").setMaster("local[*]")
val sc = new SparkContext(conf)
sc.setLogLevel("Error")
val spark = SparkSession.builder()
.getOrCreate()
import spark.implicits._
val configdf = spark
.read
.format("json")
.option("multiline","true")
.load("s3://azeyodev/config.json")

configdf.show()

val devdf = configdf.select("env.dev.*")


devdf.show()
val src = devdf.select("src").rdd.map( x =>
x.mkString("")).collect().mkString("")
println(src)
val avail = devdf.select("avail").rdd.map( x =>
x.mkString("")).collect().mkString("")
println(avail)

val notavail = devdf.select("notavail").rdd.map( x => x.mkString("")).collect().mkString("")

println(notavail)
val data = spark.read.format("avro")
.load(src)

data.show()
val html =
Source.fromURL("https://randomuser.me/api/0.8/?results=500")
val s = html.mkString

val urldf = spark.read.json(sc.parallelize(List(s)))


urldf.show()
val flatdf =
urldf.withColumn("results",explode(col("results"))).select("nationality","seed","version",
"results.user.username","results.user.cell","results.user.dob","results.user.email",
"results.user.gender","results.user.location.city","results.user.location.state",
"results.user.location.street","results.user.location.zip","results.user.md5",
"results.user.name.first","results.user.name.last","results.user.name.title",
"results.user.password","results.user.phone","results.user.picture.large","results.user.pictur
e.medium","results.user.picture.thumbnail","results.user.registered","results.user.salt","results.user
.sha1","results.user.sha256")
flatdf.show()
val rm=flatdf.withColumn("username",regexp_replace(col("username"), "([0-9])", ""))
rm.show()
val joindf = data.join(broadcast(rm),Seq("username"),"left")
joindf.show()
val dfnull = joindf.filter(col("nationality").isNull)
val dfnotnull=joindf.filter(col("nationality").isNotNull)
dfnotnull.show()
dfnull.show()
val replacenull= dfnull.na.fill("Not Available").na.fill(0)
replacenull.show()
val noavail = replacenull.withColumn("current_date", current_date)
val avail1 = dfnotnull.withColumn("current_date", current_date)
avail1.write.format("parquet").mode("overwrite").save(avail+s"/${args(1)}")
noavail.write.format("parquet").mode("overwrite").save(notavail+s"/${args(1)}")
}
}

We will create our own EMR cluster


We open spark-shell and develop our code
We maintain config files separately
We do not hard-code any paths
Once the development is completed, we terminate the cluster
We generate the jar along with the config file
We create a step execution to run this jar along with the config file
Once the spark-submit is done, we generate the command-runner script
We schedule the command-runner script
aws configure
Accesskey --- AKIA2TITMOYY4IOOJGBL
Secretkey --- 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
RegionName --- ap-south-1
outputformat --- json

============================
Step 1 -- Open Gitbash/Cmd/Lab
============================

aws configure
Accesskey --- AKIA2TITMOYY4IOOJGBL
Secretkey --- a8ZhgE/za3KZPD+r9Mj+XEx78sq2LNwUtO6gPKxh
RegionName --- ap-south-1
outputformat --- json
aws s3 ls

============================
Step 2 ---- replace URNAME with some Unique name
============================

aws emr create-cluster --applications Name=Hadoop Name=Spark --ec2-attributes


'{"InstanceProfile":"EMR_EC2_DefaultRole","SubnetId":"subnet-
093afefd7e0f2f7db","EmrManagedSlaveSecurityGroup":"sg-
05fcf01ba44e60b62","EmrManagedMasterSecurityGroup":"sg-052cd51216c51246f"}' --release-label
emr-5.36.0 --log-uri 's3n://aws-logs-728574490161-ap-south-1/elasticmapreduce/' --steps '[{"Args":
["spark-submit","--deploy-mode","client","--master","yarn","--packages","org.apache.spark:spark-
avro_2.11:2.4.7","--class","pack.obj","s3://azeyodev/SparkProjectDep-0.0.1-SNAPSHOT.jar","s3://
azeyodev/
config.json","<URNAME>dir"],"Type":"CUSTOM_JAR","ActionOnFailure":"CONTINUE","Jar":"comma
nd-runner.jar","Properties":"","Name":"Spark application"}]' --instance-groups
'[{"InstanceCount":1,"EbsConfiguration":{"EbsBlockDeviceConfigs":[{"VolumeSpecification":
{"SizeInGB":32,"VolumeType":"gp2"},"VolumesPerInstance":2}]},"InstanceGroupType":"MASTER","In
stanceType":"r3.xlarge","Name":"Master Instance Group"}]' --configurations
'[{"Classification":"spark","Properties":{}}]' --auto-terminate --service-role EMR_DefaultRole --name
'<URNAME>Deployment' --scale-down-behavior TERMINATE_AT_TASK_COMPLETION --region ap-
south-1

=============
Step 3 ---- Replace your cluster id with below id wait for terminating state
=============

=== a cluster id gets generated

aws emr describe-cluster --cluster-id j-1K48XXXXXXHCB | grep 'State'

==================
Validate the Data you should see your folder name
==================

aws s3 ls s3://azeyodev/dest/availablecustomers/
aws s3 ls s3://azeyodev/dest/notavailablecustomers/
