Experiment 5
Experiment 5
# Show the current working directory; "marks.csv" is read relative to it below.
getwd()
OUTPUT:
> getwd()
[1] "C:/Users/HP/Documents"
# Load the marks data set from the working directory into a data frame.
f1 <- read.csv(file = "marks.csv")
# Display the whole data frame.
f1
# length() of a data frame is its number of columns.
length(f1)
# Per-column summary of the data.
summary(f1)
OUTPUT:
> length(f1)
[1] 6
> summary(f1)
# DESCRIPTIVE STATISTICS
# DATA DESCRIPTION
# The result names (mean, median, mode) are kept as-is because the recorded
# output refers to them; R still resolves mean(...) / median(...) calls to the
# functions even though variables of the same name exist.
mean <- mean(f1$CGPA)
mean
# computing median
median <- median(f1$CGPA)
median
# computing mode
# Install modeest only when it is missing, so re-running the script does not
# download and reinstall the package every time.
if (!requireNamespace("modeest", quietly = TRUE)) {
  install.packages("modeest")
}
library(modeest)
# mfv() gives the most frequent value(s) of the CGPA column.
mode <- mfv(f1$CGPA)
print(mode)
print(mean)
print(median)
print(mode)
OUTPUT:
> mean
[1] 3.58
> median
[1] 3.65
[1] 2.9 3.1 3.2 3.4 3.6 3.7 3.8 3.9 4.0 4.2
[1] 3.58
[1] 3.65
[1] 2.9 3.1 3.2 3.4 3.6 3.7 3.8 3.9 4.0 4.2
## MEASURES OF VARIABILITY
# Smallest and largest Java marks (stored, not printed).
min <- min(f1$JAVA.MARKS)
max <- max(f1$JAVA.MARKS)
OUTPUT:
maximum value:
95
minimum value:
70
# calculate range
# range() returns the pair c(minimum, maximum).
range(f1$JAVA.MARKS)
# The spread itself is maximum minus minimum; `ranged` was previously used
# without ever being assigned.
ranged <- diff(range(f1$JAVA.MARKS))
ranged
range(f1$JAVA.MARKS)
max(f1$JAVA.MARKS)
OUTPUT:
> range(f1$JAVA.MARKS)
[1] 70 95
[1] 25
> range(f1$JAVA.MARKS)
[1] 70 95
> max(f1$JAVA.MARKS)
[1] 95
# Data variability
# Variance of the Java marks.
variance <- var(f1$JAVA.MARKS)
variance
# Coefficient of variation (in percent) of the Java marks:
# standard deviation expressed relative to the mean.
stdevq <- sd(f1$JAVA.MARKS)
meanvq <- mean(f1$JAVA.MARKS)
cv <- (stdevq / meanvq) * 100
cv
# Same measures for the DM marks.
varianced <- var(f1$DM.MARKS)
stdeva <- sd(f1$DM.MARKS)
meannd <- mean(f1$DM.MARKS)
meannd
stdeva
cv1 <- (stdeva / meannd) * 100
cv1
OUTPUT:
Variance of the java marks:
[1] 61.6
> cv
[1] 9.388238
> meannd
[1] 86.3
> stdeva
[1] 6.360468
> cv1
[1] 7.370183
# Quartiles of the Java marks using quantile()'s default cut points.
quartiles <- quantile(f1$JAVA.MARKS)
quartiles
# The same cut points spelled out explicitly: 0, 0.25, 0.5, 0.75, 1.
probs <- seq(from = 0, to = 1, by = 0.25)
quantile(f1$JAVA.MARKS, probs = probs)
OUTPUT:
> quartiles
> quantile(f1$JAVA.MARKS,probs)
# DECILES
# Cut points from 0 to 1 in steps of 0.1.
probs <- seq(from = 0, to = 1, by = 0.1)
quantile(f1$JAVA.MARKS, probs = probs)
OUTPUT:
> quantile(f1$JAVA.MARKS,probs)
0% 10% 20% 30% 40% 50% 60% 70% 80% 90% 100%
70.0 74.5 77.4 79.4 82.4 84.5 86.2 88.3 89.6 92.3 95.0
# PERCENTILES
# Sample data: the integers 1 to 2000.
h <- 1:2000
# Define the cut points in this section itself instead of relying on `probs`
# left over from the deciles section (the old `probs1` was assigned but never
# used). The step of 0.1 reproduces the recorded output; a step of 0.01 would
# give every percentile.
probs <- seq(0, 1, 0.1)
quantile(h, probs)
OUTPUT:
> quantile(h,probs)
0% 10% 20% 30% 40% 50% 60% 70% 80% 90% 100%
1.0 200.9 400.8 600.7 800.6 1000.5 1200.4 1400.3 1600.2 1800.1 2000.0
# Show the current working directory.
getwd()
# Attach readxl, which provides read_excel().
library("readxl")
# NOTE(review): `path` is not assigned anywhere in the visible code; it must
# hold the location of the Excel workbook before this call runs.
# skip=4 asks read_excel() to ignore the first four rows of the sheet.
ps = read_excel(path , skip=4)
ps
# NOTE(review): the two calls below repeat the ones at the start of this section.
getwd()
library("readxl")
OUTPUT:
> ps
# A tibble: 6 × 6
#DATA CLEANING IN R
# Data cleaning is the process of transforming raw data into data that is
# suitable for performing operations.
library(dplyr)
# Sample records with one missing patient name and one missing doctor name.
df <- data.frame(
  PatientName = c("John Doe", "Jane Smith", "Bob Johnson", NA, "Charlie Brown"),
  DoctorName = c("Dr. Smith", "Dr. Johnson", "Dr. Davis", "Dr. Wilson", NA)
)
print(df)
# Drop every row that contains at least one NA.
df %>% na.omit()
OUTPUT:
> print(df)
1 Dr. Smith
2 Dr. Johnson
3 Dr. Davis
4 Dr. Wilson
5 <NA>
DoctorName
1 Dr. Smith
Output
# remove
# Numeric vector containing one missing value.
zz <- c(23, 29, NA, 9, 21.19)
zz
length(zz)
# With an NA present the plain mean is NA.
mean(zz)
# na.rm = TRUE drops the NA first; spelled out because `T` can be reassigned.
M <- mean(zz, na.rm = TRUE)
print(M)
OUTPUT:
> #remove
> zz=c(23,29,NA,9,21.19)
>
> length(zz)
[1] 5
> mean(zz)
[1] NA
> M=mean(zz,na.rm=T)
> print(M)
[1] 20.5475
# NOTE(review): this data.frame() call is opened but no columns or closing
# parenthesis follow, and new_df is never assigned in the visible code;
# restore the missing lines before running this section.
df <- data.frame(
print(df)
print(new_df)
OUTPUT:
> print(df)
3 Bob 23 <NA> C
5 Christie 40 Seattle B
5 Christie 40 Seattle B
# NOTE(review): fragment. The data.frame() call that this City column belongs
# to is not visible, and neither df_with_duplicates nor new_df is assigned in
# the visible code; restore the missing lines before running this section.
City = c("New York", "Los Angeles", "New York", "Los Angeles", "Chicago", "San Francisco",
"Seattle"),
print(df_with_duplicates)
print(new_df)
OUTPUT:
> print(df_with_duplicates)
5 Bob 22 Chicago C
7 Christie 35 Seattle B
3 Bob 22 Chicago C
5 Christie 35 Seattle B
# Drop rows
# Sample data with one missing City value.
df <- data.frame(
  Name = c("John", "Jane", "Bob", "Alice", "Charlie"),
  Age = c(25, 30, 22, 28, 35),
  City = c("New York", "Los Angeles", NA, "San Francisco", "Seattle"),
  Grade = c("A", "B", "C", "A", "B")
)
# drop_na() is exported by tidyr, which this script never attaches, so it is
# called with an explicit namespace. With no columns named, rows with an NA in
# any column are dropped.
df %>% tidyr::drop_na()
OUTPUT:
4 Charlie 35 Seattle B
# Drop only the rows whose City is NA. drop_na() is exported by tidyr, which
# this script never attaches, so it is called with an explicit namespace.
df %>% tidyr::drop_na(City)
OUTPUT:
4 Charlie 35 Seattle B
# Get a compact overview of df: row and column counts, then each column's
# name, type, and leading values.
glimpse(df)
OUTPUT:
Rows: 5
Columns: 4
$ City <chr> "New York", "Los Angeles", NA, "San Francisco", "Seattle"
#bind rows
OUTPUT:
1 Aarav 25 Mumbai A
2 Aditi 30 Delhi B
3 Arjun 22 Bangalore C
4 Ananya 28 Kolkata A
5 Ayush 35 Chennai B
1 Bhavya 27 Jaipur B
2 Chirag 32 Ahmedabad C
3 Deepika 24 Lucknow A
4 Dhruv 30 Hyderabad B
5 Esha 28 Pune A
1 Aarav 25 Mumbai A
2 Aditi 30 Delhi B
3 Arjun 22 Bangalore C
4 Ananya 28 Kolkata A
5 Ayush 35 Chennai B
6 Bhavya 27 Jaipur B
7 Chirag 32 Ahmedabad C
8 Deepika 24 Lucknow A
9 Dhruv 30 Hyderabad B
10 Esha 28 Pune A