# All Codes
# ------------------------------------------------------------------
# R basics: variables, vectors, vector math, indexing, sorting and
# building a data frame.
# The statements are in teaching order so the script runs from top to
# bottom with every object defined before it is used.
# ------------------------------------------------------------------

# "#" is used for commenting

# Variables in R - There are two ways to assign a value to
# a variable
# Option 1: By using = sign
a = 10
# Option 2: By using <-
a <- 10

# Rules to assign a variable name - I can not start the
# variable name with special characters

# Kind of values you can assign to a variable
# I can assign - Number, String
# Anything written inside " " will be taken as a string
a = "10" # This is the string "10"
b = 20 # This is the number 20

# Vectors - 1D arrangement of data which should contain
# data of same data type. I can build a data frame from
# vectors
# I will create a numerical vector
age = 38 # Single element vector
age = c(38,35,40,42)

# If you want to assign a sequence of numbers to a variable
z = 10:50
# If you want to assign a sequence of numbers with specific interval
z = seq(10,50, by = 5)
z = seq(10,50,5) # same result, `by` passed positionally

# How to create a vector of strings
name = c("Amit", 'Sunil', 'Raj', "Mohan")

# Few key commands on vector
# length() will give the number of elements in a vector
length(name)
# nchar is used to find the number of characters in each element
nchar(age)

# Math operators on vectors
# If the vector length is same, the math operation is on index values
a = c(10,20,30,40)
b = c(50,60,70,80)
a+b

# If the vector length is different - recycling process
# (the shorter vector is reused from its start; R also warns here
# because 5 is not a multiple of 3)
a = c(10,20,30,40,20)
b = c(50,60, 70)
a+b

# Accessing the elements of a vector using square bracket indexing
age = c(38,35,40,42)
name = c("Amit", 'Sunil', 'Raj', "Mohan")
# If you want to access the single element of the vector
age[2]
# You can access the elements based on a sequence of index values
age[2:4]
# You want to access specific index values
age[c(2,4)]
# If you want to access elements by excluding a few elements
age = c(38,35,40,42)
age[c(-1,-4)] # It will exclude 1st element and 4th element

# Create another vector as location
location = c('Mumbai', 'Bangalore', 'Kochi', 'Delhi')

# Sorting the vector - sort() command
# By default the arrangement is low-high
# NOTE: `age` is overwritten with its sorted copy, so from here on it no
# longer lines up element-by-element with `name` and `location`.
age = sort(age)
sort(location)
# If you want to arrange it from highest to lowest
sort(age, decreasing = TRUE)
sort(location, decreasing = TRUE)

################### Data frame ###########################
# Lets convert the name, age and location vector to a data frame
# The command is data.frame(v1, v2,v3....)
df = data.frame(age, name, location)

# Accessing the values from a data frame by using square bracket
# I want to access the 2nd row 3rd column value
df[2,3]
# I want to access the 2nd and 4th row and 3rd column
df[c(2,4), 3]
# I want to access all rows from 2nd row onwards and
# 2nd & 3rd column
df[c(2:4),c(2,3)]

# Some key commands in data frame
# ------------------------------------------------------------------
# Part 1: quick inspection commands for a data frame `df`.
# Part 2: introduction to dplyr (select / filter) on the `orders` data.
# Each statement sits on its own line so the file parses.
# ------------------------------------------------------------------

# View() is used to view your data
# head() is used to see the top rows of your data
head(df,2)
# tail() is used to see the bottom rows of your data
tail(df,2)
# nrow() is used to find the number of rows
nrow(df)
# ncol() is used to find the number of columns
# names() is used to print the column names
# str() is used to find the structure of your data types
str(df)

# data types in R
# R can have the following data types
# 1) int
# 2) num
# 3) char - any column containing text
# 4) factor - categorical column - which divides the data into
#    non-overlapping categories
# 5) date and time
# 6) geo data - state, pin code, district, countries etc

# Import the 3rd-party user library (dplyr)
library(dplyr)

# DPLYR has 6 key functions - select, filter, group_by, summarise,
# arrange and mutate
# We can use any one of these functions or a combination of them
# by using a pipeline function (%>%)

# Select() - this helps to select the columns of your data set
# PS - Create a new data frame which contains only region, sale & profit
# If you want to select a column whose name contains 2 words

# Filter() - It will help you filter your data based on a specific
# condition
# PS - Filter the data for south region
# NOTE: we need to use == for comparing the values
# NOTE(review): `orders` is not created anywhere above this point in
# the visible file; it must be loaded before these lines run.
df_south = orders %>% filter(Region == "South")

# PS - Find the number of rows for Central with sale more than 2000 USD
# NOTE: I can use multiple filter conditions by using AND / OR operator
# In R - & is AND operator, | - OR operator
df1 = orders %>% filter(Region == 'Central' & Sales > 2000) %>% nrow()
# ------------------------------------------------------------------
# dplyr group_by / summarise / arrange / mutate examples, merge,
# date extraction, ggplot2 bar-chart notes and a first predictive
# model on mtcars.
# Every pipeline is a single uninterrupted statement, and headings
# and wrapped comment lines are commented so the file parses.
# ------------------------------------------------------------------

df8 = orders %>% group_by(Region, `Product Category`) %>%
  summarise(Avg_profit = mean(Profit), Avg_sales = mean(Sales)) %>%
  filter(`Product Category` == 'Technology')
View(df8)

# Arrange() - This will help you to sort the data
# By default arrangement is in ascending order. If you want to
# arrange the data in descending order you must pass additional
# argument by putting a "-" sign
# PS - Find the product name with highest average sales
df9 = orders %>% group_by(`Product Name`) %>%
  summarise(Avg_sales = mean(Sales)) %>% arrange(-Avg_sales) %>%
  head(1)
View(df9)

# PS- Find the customer based on customer ID with highest total sales
df10 = orders %>% group_by(`Customer ID`, `Customer Name`) %>%
  summarise(Total_sales = sum(Sales)) %>% arrange(-Total_sales)%>%
  head(1)
View(df10)

# Mutate() - This command helps to create a new column with the help
# of current columns
# PS - Find the product name with highest shipping cost to sales ratio
df11 = orders %>% group_by(`Product Name`) %>%
  summarise(Total_shipping_cost = sum(`Shipping Cost`),
            Total_sales = sum(Sales)) %>%
  mutate(Ratio = Total_shipping_cost/Total_sales) %>%
  arrange(-Ratio)%>% head(1)
View(df11)

# Merge / Join - To merge two tables we can perform
# Inner Join - Will give only the common rows between data sets
# NOTE(review): `users` is not created anywhere in the visible file;
# it must be loaded before the merge below.
# all.y = TRUE keeps every row of `users` (right join).
Right = merge(x = orders, y = users, by.x = 'Region',
              by.y = 'Region', all.y = TRUE)

# Extracting the information like month, days, year etc from
# date column
# PS - Which month had the highest sales?
orders$Month = format(orders$`Order Date`, "%B") # "%B" = full month name
df14 = orders %>% group_by(Month) %>% summarise(Total_sales =
                                                  sum(Sales))
View(df14)

# DAY3 - R
# In this session - 1) We will see the basic plotting using ggplot2
# 2) How to build basic Predictive models

# Plotting with GGPLOT2
library(readxl)
library(ggplot2)
library(dplyr)

# The path is one unbroken string; a line break inside the quotes
# would become part of the file name and the read would fail.
orders = read_excel("C:/Training data/Amit/Data Set/Sample-Superstore-Subset-Excel.xlsx")

# PS - Plot a bar chart to present total sales each region
df1 = orders %>% group_by(Region) %>% summarise(Total_sales =
                                                  sum(Sales))
View(df1)

# Plotting a bar chart using ggplot
# NOTE: Define your x axis and y axis inside the command aes()
# NOTE: In geom_bar() you need to mention the additional argument -
# stat = "identity"
# Step 1: ggplot will help you to define your x - axis and y - axis
# Step 2: You can use reorder command to arrange your barchart.
# Step 3: We will use geom_bar() to draw our bar chart - Use stat = "identity"
# only in BAR CHART
# Step 4: labs() to label my x - axis, y - axis and title of chart
# Step 5: geom_text() to put data labels on your chart with vjust and hjust to
# adjust the data labels
# Step 6: theme() to remove any text from your axis by using axis.text.y
# and so on

# Data Modelling - R
# Predictive data modelling
# Based on the data from your past, you need to predict the values.
# Based on the mtcars data you need to find what will be the mileage of a car
# with:
# 5 cyl, 120 - displ, 98 hp, 5 gears, mpg = ???
View(mtcars)
library(dplyr)
# NOTE(review): the step that builds `df` from mtcars is not present in
# the visible file - confirm before running.
View(df)

# Step 2: Split our data into training (80%) and test (20%) data.
# This split is a random split
# NOTE(review): the assignment that creates `index` is not present in
# the visible file; only this print of it survived.
index
train = df[index, ]
test = df[-index, ]

# NOTE(review): the statement that fits `model` is not present in the
# visible file - confirm before running.
summary(model)

View(test)
test$pred_milage = predict(model, test)

# Step 5: Find the error by using RMSE - Root mean square error
test$error_squre = (test$mpg - test$pred_milage)^2
# Root of the mean of the squared errors
# (reconstructed from the comment above; RMSE is used below)
RMSE = sqrt(mean(test$error_squre))

# NOTE(review): 26.78 is presumably the predicted mpg for the car
# described above - confirm.
# Upper limit
26.78 + RMSE
# Lower Limit
26.78 - RMSE
# ------------------------------------------------------------------
# SQL (MySQL): inserting rows, the key query commands, basic SELECT,
# then the Day 6 recap with BETWEEN.
# Headings and wrapped comment lines are commented so the script parses.
# ------------------------------------------------------------------

# In the decimal data type, specify (total number of digits including
# the decimal digits, the number of decimals)

# Inserting the records in my table - We can use the insert into command for this
# Syntax: insert into table_name (column1, Column2, Column3)
# values (value1, value2, value3), (Value1, Value2, Value3);
insert into batch3(ID, Name, Age, Marks)
Values (100, 'Amit', 38, 7.35);

# If you want to insert multiple values
insert into batch3(ID, Name, Age, Marks)
Values (101, 'Amit', 38, 7.35), (102, 'Raj', 40, 8.35), (103, 'Sunil', 44, 8.35);

# In SQL we will be working with the following key commands:
# 1) Select command - Which will help you to select the field (columns)
# 2) Where command - Which will help you to filter your data
# 3) Order command - Which will help you to arrange your data
# 4) Group by command - Which will help to group the data at multiple levels

# Select query function - This is used to select the fields(columns)
select Sales, Profit from orders;

# If you want to select all the columns
select * from orders;

# We can also select columns with alias name
select `Product Category` AS PC from orders;

# Day6 - SQL
use sda;

# Quick recap - What will be the command to select order ID, profit, sales from orders
# table under SDA data base
select `Order ID`, Profit, Sales from orders;

# Quick recap 2- Command for selecting all the columns for the rows where sales is more
# than 1500 $
select * from orders
where Sales > 1500;

# BETWEEN command - This will help you to find the values between two numbers.
# Find all the rows of all columns where sales is between 1000 and 2000

# OPTION 1: I can use AND operator
# (>= and <= are used so this matches BETWEEN, which includes both end values)
select * from orders
where Sales >= 1000 AND Sales <= 2000;

# OPTION 2: Using between operator
select * from orders
where Sales between 1000 AND 2000;

# We can also use a NOT Operator with Between:
select * from orders
where Sales NOT BETWEEN 1000 and 2000;

# ORDER BY - This is used to sort the data in ascending or descending orders based on
# one or more columns
# ------------------------------------------------------------------
# SQL (MySQL): ORDER BY, GROUP BY, HAVING, sub-queries and joins.
# Each query is one uninterrupted statement, and wrapped comment
# lines are commented so the script parses.
# ------------------------------------------------------------------

# PS: Select Region, Product category, Sales and Profit and order the table in
# 1) Lowest to highest of Sales
# 2) Highest to lowest of Profit

# Solution for 1) Arranging the lowest to highest Sales
select Region, `Product Category`, Sales, Profit from orders
order by Sales;

# Solution for 2) Arranging the highest to lowest of Profit
select Region, `Product Category`, Sales, Profit from orders
order by Profit desc;

# Arrange the same data (used in above condition) in ascending order of Region and
# descending order of Sales
select Region, `Product Category`, Sales, Profit from orders
order by Region, Sales desc
limit 9105;

# PS: Find the Product name with highest sales from South Region
select `Product Name` from orders
where Region = 'South'
order by Sales desc
limit 1;

# GROUP BY - This is used in collaboration with select statement to arrange data into
# Groups. This group by clause follows the WHERE clause in the select statement and
# precedes the Order by clause. PLEASE NOTE THAT GROUP BY COMMAND MUST HAVE A
# MATH AGG
# IN SQL we have to define the Math aggr. within the select command

# PS: Find the total sales for each region
Select Region, sum(Sales) from orders
group by Region;

# If you want to rename the column sum(Sales) as Total Sales
# (example reconstructed from the comment; the alias style follows 'Avg Profit' below)
select Region, sum(Sales) as 'Total Sales' from orders
group by Region;

# PS: Find the average profit from each product category of South Region?
select `Product Category`, round(avg(Profit),2) as 'Avg Profit' from orders
where Region = 'South'
Group by `Product Category`;

# Find the Customer with highest number of orders?
select `Customer ID`, count(distinct `Order ID`) as Total_orders from orders
group by `Customer ID`
order by Total_orders desc
limit 1;

# PS: Find the product Name with the highest total Sales to Profit Ratio?
# Basic calculated fields can be declared in the select statement itself

# PS: Find the Product sub-category where the total sales of a product sub category
# is more than 10000 for the South Region
# NOTE: When you have to apply multiple filters, you can use WHERE command if the filter
# is applied before the group by command and HAVING command if the filter is applied
# after group by.

# PS: Find the customer list from corporate segment of California who have placed more
# than 5 orders?
# `Customer Name` is in the group by because it is selected without an
# aggregate; this keeps the query valid under ONLY_FULL_GROUP_BY.
select `Customer ID`, `Customer Name`, count(distinct `Order ID`) as Total from orders
where `Customer Segment` = 'Corporate' AND `State or Province` = 'California'
group by `Customer ID`, `Customer Name`
having Total >5;

# PS: Find the rows where the sales is greater than the overall average sales
# NOTE: We use sub-queries where the output of a query is given as an input to another
# query. As we can not store the query with a variable name, we need to write a
# sub-query within the main query
select round(avg(Sales),0) from orders; # Q1

select Sales from orders
where Sales > (select round(avg(Sales),0) from orders);

# PS: Find the list of customer where the total revenue given by the customer for
# South region is less than the average sales of south region?
# Solution:
select `Customer ID`, `Customer Name`, sum(Sales) as Total from orders
where Region = 'South'
group by `Customer ID`, `Customer Name`
having Total < (select round(avg(Sales),0) from orders
where Region = 'South');

################################ JOINS IN SQL ##################################
# We have INNER JOIN, LEFT JOIN, RIGHT JOIN in SQL
# Lets see how to join the table on inner join

# Lets first see how to perform an inner join
select orders.`Order ID`, returns.Status
from orders
inner join returns on orders.`Order ID` = returns.`Order ID`;
# NOTE(review): the source has a stray `group by orders.Region;` directly
# after the join above. It presumably belongs to a second query whose
# select lines are missing; it is kept here as a comment until confirmed.
# group by orders.Region;

# PS: Find the list of orders ID which were not returned?
# Rows of `orders` with no match in `returns` get NULL for returns.Status.
select orders.`Order ID`, returns.Status
From orders
left join returns ON orders.`Order ID` = returns.`Order ID`
where returns.Status IS NULL;