# Logistic regression of target_60 on income, balance and Dummy, followed by
# stepwise selection in both directions. step() starts from the full model and
# returns the selected model, which is what logit2 holds afterwards.
logit2 <- step(
  glm(target_60 ~ income + balance + Dummy,
      family = "binomial", data = training),
  direction = "both"
)
# Coefficient table and fit statistics of the selected model (printed once).
summary(logit2)
# Concordance and discordance of a fitted binary-response model.
#
# Every observation whose response is 1 is paired with every observation whose
# response is 0, and the fitted values of the pair are compared:
#   concordant - the 1 has the larger fitted value
#   discordant - the 1 has the smaller fitted value
#   tied       - both fitted values are equal
#
# Args:
#   model: object exposing `y` (responses coded 0/1) and `fitted.values`,
#          such as the result of glm(..., family = "binomial").
#
# Returns:
#   A list with "Percent Concordance", "Percent Discordance", "Percent Tied"
#   (each as a percentage of all pairs) and "Pairs" (number of 1/0 pairs).
#   When one of the classes is absent there are no pairs and the three
#   percentages are NaN.
Acc <- function(model) {
  observed <- model$y
  scores <- model$fitted.values

  # Plain vector subsetting: unlike matrix row selection this cannot collapse
  # to a dimensionless vector when a class has a single observation.
  one_scores <- scores[observed == 1]
  zero_scores <- scores[observed == 0]

  Pairs <- length(zero_scores) * length(one_scores)

  # outer() compares every 0 (rows) with every 1 (columns) in one step.
  concordant <- sum(outer(zero_scores, one_scores, "<"))
  discordant <- sum(outer(zero_scores, one_scores, ">"))
  tied <- Pairs - concordant - discordant

  list(
    "Percent Concordance" = (concordant / Pairs) * 100,
    "Percent Discordance" = (discordant / Pairs) * 100,
    "Percent Tied" = (tied / Pairs) * 100,
    "Pairs" = Pairs
  )
}
# Concordance / discordance of the selected model
Acc(logit2)
# Odds ratios: exponentiated model coefficients
exp(coef(logit2))
logit2$coefficients
View(testing)
## Prediction on testing data
testing$probs <- predict(logit2, testing, type = "response")
# Classify as 1 when the predicted probability exceeds 0.70. The levels are
# fixed so that both classes exist even if every row falls on one side of the
# cut-off (confusionMatrix needs matching levels).
testing$Predict <- factor(ifelse(testing$probs > 0.70, 1, 0), levels = c(0, 1))
View(testing)
# Manual check of the logistic formula for one case: linear predictor from
# hard-coded coefficients, then the inverse logit 1 / (1 + exp(-y)).
y <- -11.377052087 + 0.005707992 * 2220.9662 + 0.646278163 * 1
manual_prob <- 1 / (1 + exp(-y))
manual_prob
# confusionMatrix() comes from caret and takes the predictions first and the
# observed classes second; the reversed order mislabels sensitivity,
# specificity and the predictive values.
# NOTE(review): assumes target_60 is a factor with levels 0/1 - confirm.
library(caret)
confusionMatrix(data = testing$Predict, reference = testing$target_60)
### ROC curve for logit2
library(ROCR)
# Predicted probabilities from logit2 for the rows of `testing`
# (despite its name, predictTrain is computed on `testing`, not on `training`)
predictTrain = predict(logit2,testing, type="response")
# Pair the predicted probabilities with the observed target_60 labels
ROCRpred = prediction(predictTrain, testing$target_60)
# True-positive rate against false-positive rate
ROCRperf = performance(ROCRpred, "tpr", "fpr")
# Plot ROC curve
plot(ROCRperf)
library(ROCR)
library(arules)
library(arulesViz)
library(datasets)
# Market basket analysis: rules whose right-hand side is "mobile".
# NOTE(review): w1 is not used again in the visible script - confirm it is
# still needed.
w1 <- read.table("F:/R and Data Science/Market Basked Analysis/Retail.csv")
# Each line of the file is one basket of comma-separated items.
trans <- read.transactions(
  "F:/R and Data Science/Market Basked Analysis/Retail.csv",
  format = "basket", sep = ","
)
# Absolute frequency of the 20 most frequent items
itemFrequencyPlot(trans, topN = 20, type = "absolute")
rules <- apriori(
  data = trans,
  parameter = list(supp = 0.001, conf = 0.08),
  appearance = list(default = "lhs", rhs = "mobile"),
  control = list(verbose = FALSE)
)
rules <- sort(rules, decreasing = TRUE, by = "confidence")
# head() returns at most 10 rules, so this also works when fewer than 10 rules
# were mined (rules[1:10] would index past the end).
inspect(head(rules, n = 10))
plot(rules, method = "graph", shading = NA, engine = "interactive")
# Load a CSV chosen interactively. The code below reads the columns smoker,
# sex, children and Month from it.
data <- read.csv(file.choose())
View(data)

# Share of smokers, as proportions and as percentages
prop.table(table(data$smoker))
prop.table(table(data$smoker)) * 100

# Smoker by sex: counts (both orientations), proportions and percentages.
# The percentage scaling is applied after prop.table(); scaling the counts
# first cancels out in the normalisation.
table(data$sex, data$smoker)
table(data$smoker, data$sex)
prop.table(table(data$smoker, data$sex))
prop.table(table(data$smoker, data$sex)) * 100

# Subsetting examples. `try` is overwritten at each step.
# First three columns
try <- data[1:3]
View(data)
# Rows for males; the comparison needs `==` (a single `=` inside the call is
# parsed as an argument name and is a syntax error).
try <- subset(data, sex == "male")
View(data)
# Rows with at least one child and Month up to 10, keeping smoker and sex
try <- subset(data, children >= 1 & Month <= 10,
              select = c(smoker, sex))
View(try)
# Rows ordered by Month; the column is referenced explicitly instead of
# attaching `data` to the search path.
try <- data[order(data$Month), ]
View(try)
# rcompanion provides plotNormalHistogram(); it must be loaded before the
# first call.
library(rcompanion)
# Distribution of the AirPassengers data
hist(AirPassengers,
     main = "histogram for Air Passenger",
     xlab = "Passengers",
     border = "red",
     col = "blue",
     xlim = c(100, 700),
     breaks = 5) # breaks is the suggested number of bars (cells)
# Histogram of charges with a normal curve overlaid
plotNormalHistogram(data$charges)
# List the data sets available in the loaded packages
data()
# Objective: predict whether an employee will exit (Attrition) using a decision tree
################### Importing data ##################
bankcust<-read.csv("F:/R and Data Science/Decision tree/employee (1).csv")
################### Check data types ##################
str(bankcust)
names(bankcust)
#################### Keep a subset of columns for model building ##################
# Drops the columns at positions 7, 9, 10, 22 and 27
bankcust = subset(bankcust, select = -c(7,9,10,27,22))
str(bankcust)
################## Data conversion #############################
# Every column except positions 2,3,5,7,9,13,15,19 is converted to numeric
bankcust1 = subset(bankcust, select = -c(2,3,5,7,9,13,15,19))
names(bankcust1)
str(bankcust1)
# apply() passes each column through as.numeric; the result is rebuilt as a data frame
bankcust1<-data.frame(apply(bankcust1, 2, as.numeric))
str(bankcust1)
# The columns at positions 2,3,5,7,9,13,15,19 are kept as read (no conversion)
# NOTE(review): presumably these are the categorical columns - confirm against str() output
bankcust2 = subset(bankcust, select = c(2,3,5,7,9,13,15,19))
str(bankcust2)
# Recombine: unconverted columns first, then the numeric ones (column order changes)
bankcust<-data.frame(bankcust2,bankcust1)
str(bankcust)
names(bankcust)
################## Data Partition ##################
set.seed(231)
library(caret)
# Row indices for the training set: p = 0.7, partitioned on Attrition
train<-createDataPartition(bankcust$Attrition,p=0.7,list=FALSE)
training<-bankcust[train,]
testing<-bankcust[-train,]
# Compare the Attrition class proportions in the full, training and testing sets
prop.table(table(bankcust$Attrition))
prop.table(table(training$Attrition))
prop.table(table(testing$Attrition))
################# Building Model & Plotting Model #################
library(rpart)
library(rpart.plot)
library(caret)

# Classification tree on all predictors, default controls
Model <- rpart(Attrition ~ ., data = training, method = "class")
#            , parms = list(prior = c(.84,.16), split = "gini"))
rpart.plot.version1(Model, main = "Model Before Pruning",
                    type = 5, extra = 1, cex = 0.5, tweak = 1.5, varlen = 0)
# , faclen = 0) # to get full level names
# type   : puts the variable name in the nodes
# extra  : 1 shows the number of observations, 6 shows percentages
# cex    : text size of the whole plot
# tweak  : scales node labels and cut-off points
# varlen : 0 prints full variable names in the nodes
# faclen : 0 prints full level names on the branches

# Prediction on training data (predictions first, observed classes second)
training$Predicted <- predict(Model, training, type = "class")
confusionMatrix(training$Predicted, training$Attrition)

# Pre-pruning: limit split size, leaf size and depth. The prediction column is
# removed first so that `Attrition ~ .` does not use it as a predictor.
training$Predicted <- NULL
prune_control <- rpart.control(minsplit = 10, minbucket = 5, maxdepth = 3)

# Re-building model & plotting model
tune_fit <- rpart(Attrition ~ ., data = training, method = "class",
                  control = prune_control)
rpart.plot.version1(tune_fit, main = "Model After Pruning",
                    type = 5, extra = 1, cex = 0.5, tweak = 2.2,
                    faclen = 0, varlen = 0)
# Variable importance and the parameters stored on the fitted tree
tune_fit$variable.importance
tune_fit$parms

# Prediction on training data. The argument order matches the other calls:
# predicted classes are `data`, observed Attrition is `reference`.
training$Predicted <- predict(tune_fit, training, type = "class")
confusionMatrix(training$Predicted, training$Attrition)

# Prediction on testing data
testing$Predicted <- predict(tune_fit, testing, type = "class")
confusionMatrix(testing$Predicted, testing$Attrition)
# Random Forest
#library (randomForest)
#library(tree)
#library(MASS)
################# Model Building##################
#training$Predicted<-NULL
#names(training)
#rf_class =randomForest(Attrition~.,data=training,mtry=5, importance=TRUE)
#rf_class
################### To identify the most important variables ##################
#varImpPlot (rf_class,colors='Red')
################### To get the Specific Tree ##################
#getTree(rf_class,3,labelVar = TRUE)
################# Model on test data##################
#testing$Predicted<-NULL
# Prediction on Testing data
#testing$Predicted=predict(rf_class,testing,type ="class")
#library(caret)
#confusionMatrix(testing$Predicted,testing$Attrition)
# Re-run of the tree workflow. `training` may still carry a Predicted column
# from an earlier prediction step; drop it so `Attrition ~ .` cannot use
# earlier predictions as a predictor (a no-op when the column is absent).
training$Predicted <- NULL
Model <- rpart(Attrition ~ ., data = training, method = "class")
#            , parms = list(prior = c(.84,.16), split = "gini"))
library(rpart.plot)
rpart.plot.version1(Model, main = "Model Before Pruning",
                    type = 5, extra = 1, cex = 0.5, tweak = 1.5, varlen = 0)
# Prediction on training data
training$Predicted <- predict(Model, training, type = "class")
library(caret)
confusionMatrix(training$Predicted, training$Attrition)
# Pre-pruning: limit split size, leaf size and depth
prune_control <- rpart.control(minsplit = 10, minbucket = 5, maxdepth = 3)
training$Predicted <- NULL
# Re-building model & plotting model
tune_fit <- rpart(Attrition ~ ., data = training, method = "class",
                  control = prune_control)
rpart.plot.version1(tune_fit, main = "Model After Pruning",
                    type = 5, extra = 1, cex = 0.5, tweak = 2.2,
                    faclen = 0, varlen = 0)
library(vegan)
library(permute)
library(lattice)
library(tidyverse)  # data manipulation
library(cluster)    # clustering algorithms
library(factoextra) # clustering visualization
library(dplyr)
library(dendextend) # for comparing two dendrograms
library(NbClust)
## Read the data and drop the first column before clustering
cust_data <- read.csv("C:/Users/Ankit Dsouza/Desktop/tanushree notes/Marketing Notes/Clustering/tripadvisor_review.csv")
head(cust_data)
cust_data <- cust_data[-1]
summary(cust_data)

# Standardise every column; all k-means fits in this run use the scaled matrix
cust_data_f <- scale(cust_data)
head(cust_data_f)

# First look: 3 clusters, best of 25 random starts
k2 <- kmeans(cust_data_f, centers = 3, nstart = 25)
str(k2)
k2
fviz_cluster(k2, data = cust_data_f)

# Elbow chart. The within-cluster sum of squares is computed on the same
# scaled data the models are fitted on, so the chart describes those models.
# mydata keeps the original-scale values for export.
mydata <- cust_data
wss <- (nrow(cust_data_f) - 1) * sum(apply(cust_data_f, 2, var))
for (i in 2:15) wss[i] <- sum(kmeans(cust_data_f, centers = i)$withinss)
plot(1:15, wss, type = "b", xlab = "Number of Clusters",
     ylab = "Within groups sum of squares", col = "mediumseagreen", pch = 12)

### Run the kmeans algorithm to generate the clusters
k1 <- kmeans(cust_data_f, 4)
k1
### Cluster centres (group means of each variable)
a <- data.frame(k1$centers)
### Number of observations per cluster
k1$size
### Cluster assigned to each observation
k1$cluster
# Labels go onto the export copy, leaving cust_data as pure feature columns
mydata$cluster <- k1$cluster
View(mydata)

# Export centres and labelled observations to separate files; writing both to
# "try1.csv" would leave only the second table on disk.
setwd("C:/Users/Ankit Dsouza/Desktop/tanushree notes/Marketing Notes/Clustering")
write.csv(a, "try1.csv")
write.csv(mydata, "try.csv")

# Refit the 3-cluster model allowing up to 100 iterations
k2 <- kmeans(cust_data_f, centers = 3, iter.max = 100)
str(k2)
library(vegan)
library(permute)
library(lattice)
library(tidyverse)  # data manipulation
library(cluster)    # clustering algorithms
library(factoextra) # clustering visualization
library(dplyr)
library(dendextend) # for comparing two dendrograms
library(NbClust)
## Read the data and drop the first column before clustering
cust_data <- read.csv("C:/Users/Ankit Dsouza/Desktop/tanushree notes/Marketing Notes/Clustering/tripadvisor_review.csv")
head(cust_data)
cust_data <- cust_data[-1]
summary(cust_data)

# This run clusters the unscaled values; cust_data_f is a plain copy
cust_data_f <- cust_data
head(cust_data_f)
k2 <- kmeans(cust_data_f, centers = 3, iter.max = 100)
str(k2)
k2
fviz_cluster(k2, data = cust_data_f)

# Elbow chart: within-cluster sum of squares for 1 to 15 clusters
mydata <- cust_data
wss <- (nrow(mydata) - 1) * sum(apply(mydata, 2, var))
for (i in 2:15) wss[i] <- sum(kmeans(mydata, centers = i)$withinss)
plot(1:15, wss, type = "b", xlab = "Number of Clusters",
     ylab = "Within groups sum of squares", col = "mediumseagreen", pch = 12)

### Run the kmeans algorithm to generate the clusters
k1 <- kmeans(cust_data_f, 3)
k1
### Cluster centres (group means of each variable)
a <- data.frame(k1$centers)
### Number of observations per cluster
k1$size
### Cluster assigned to each observation
k1$cluster
# Labels are stored on the export copy only. Adding them to cust_data would
# feed the label column into the 5-cluster fit below as if it were a feature.
mydata$cluster <- k1$cluster
View(mydata)
write.csv(a, "try1.csv")
write.csv(mydata, "try.csv")

# 5-cluster alternative, fitted and plotted on the same feature columns
k2 <- kmeans(cust_data, centers = 5, iter.max = 100)
str(k2)
k2
fviz_cluster(k2, data = cust_data)
## Read the data and drop the first column before clustering
cust_data <- read.csv("C:/Users/Ankit Dsouza/Desktop/tanushree notes/Marketing Notes/Clustering/tripadvisor_review.csv")
head(cust_data)
cust_data <- cust_data[-1]
summary(cust_data)

# 5-cluster fit, plotted against the data it was fitted on. This run never
# defines cust_data_f, so it must not depend on a leftover object of that name.
k2 <- kmeans(cust_data, centers = 5, iter.max = 100)
k2
fviz_cluster(k2, data = cust_data)

# Elbow chart: within-cluster sum of squares for 1 to 15 clusters
mydata <- cust_data
wss <- (nrow(mydata) - 1) * sum(apply(mydata, 2, var))
for (i in 2:15) wss[i] <- sum(kmeans(mydata, centers = i)$withinss)
plot(1:15, wss, type = "b", xlab = "Number of Clusters",
     ylab = "Within groups sum of squares", col = "mediumseagreen", pch = 12)

### Run the kmeans algorithm to generate the clusters
k1 <- kmeans(cust_data, 3)
k1
### Cluster centres (group means of each variable)
a <- data.frame(k1$centers)
### Number of observations per cluster
k1$size
### Cluster assigned to each observation
k1$cluster
mydata$cluster <- k1$cluster
View(mydata)
# Centres and labelled observations go to separate files
write.csv(a, "try1.csv")
write.csv(mydata, "try.csv")
library(vegan)
library(permute)
library(lattice)
library(tidyverse)  # data manipulation
library(cluster)    # clustering algorithms
library(factoextra) # clustering visualization
library(dplyr)
library(dendextend) # for comparing two dendrograms
library(NbClust)
## Read the data and drop the first column before clustering
cust_data <- read.csv("C:/Users/Ankit Dsouza/Desktop/tanushree notes/Marketing Notes/Clustering/tripadvisor_review.csv")
head(cust_data)
cust_data <- cust_data[-1]
summary(cust_data)
# 5-cluster model with up to 100 iterations
Model_Segment <- kmeans(cust_data, centers = 5, iter.max = 100)
Model_Segment
# Plot against the data the model was fitted on; cust_data_f is not created
# in this run.
fviz_cluster(Model_Segment, data = cust_data)
library(vegan)
library(permute)
library(lattice)
library(tidyverse)  # data manipulation
library(cluster)    # clustering algorithms
library(factoextra) # clustering visualization
library(dplyr)
library(dendextend) # for comparing two dendrograms
library(NbClust)
## Load the review data; the first column is dropped before clustering
cust_data <- read.csv(
  "C:/Users/Ankit Dsouza/Desktop/tanushree notes/Marketing Notes/Clustering/tripadvisor_review.csv"
)
head(cust_data)
cust_data <- cust_data[-1]
summary(cust_data)

# Exploratory 5-cluster model and its cluster plot
Model_Segment <- kmeans(cust_data, centers = 5, iter.max = 100)
Model_Segment
fviz_cluster(Model_Segment, data = cust_data)

# Elbow chart: total within-cluster sum of squares for k = 1..15.
# For k = 1 it is (n - 1) times the summed column variances.
mydata <- cust_data
wss <- (nrow(mydata) - 1) * sum(apply(mydata, 2, var))
for (i in 2:15) {
  wss[i] <- sum(kmeans(mydata, centers = i)$withinss)
}
plot(
  1:15, wss,
  type = "b",
  xlab = "Number of Clusters",
  ylab = "Within groups sum of squares",
  col = "mediumseagreen",
  pch = 12
)

### Final 3-cluster model
Final_Model_segment <- kmeans(cust_data, 3)
Final_Model_segment
### Cluster centres as a data frame
a <- data.frame(Final_Model_segment$centers)
### Observations per cluster
Final_Model_segment$size
### Cluster label of every observation
Final_Model_segment$cluster
mydata$Cluster <- Final_Model_segment$cluster
library(vegan)
library(permute)
library(lattice)
library(tidyverse)  # data manipulation
library(cluster)    # clustering algorithms
library(factoextra) # clustering visualization
library(dplyr)
library(dendextend) # for comparing two dendrograms
library(NbClust)
## Read the data and drop the first column before clustering
cust_data <- read.csv("C:/Users/Ankit Dsouza/Desktop/tanushree notes/Marketing Notes/Clustering/tripadvisor_review.csv")
head(cust_data)
cust_data <- cust_data[-1]
summary(cust_data)

# Exploratory 5-cluster model and its cluster plot
Model_Segment <- kmeans(cust_data, centers = 5, iter.max = 100)
Model_Segment
fviz_cluster(Model_Segment, data = cust_data)

# Elbow chart: within-cluster sum of squares for 1 to 15 clusters
mydata <- cust_data
wss <- (nrow(mydata) - 1) * sum(apply(mydata, 2, var))
for (i in 2:15) wss[i] <- sum(kmeans(mydata, centers = i)$withinss)
plot(1:15, wss, type = "b", xlab = "Number of Clusters",
     ylab = "Within groups sum of squares", col = "mediumseagreen", pch = 12)

### Run the kmeans algorithm to generate the clusters
Final_Model_segment <- kmeans(cust_data, 3)
Final_Model_segment

### Each result is inspected once; the repeated copies of these statements
### printed the same values again.
### Group means for each variable (centre point of each cluster)
data.frame(Final_Model_segment$centers)
Final_Model_segment$centers
### Number of observations per cluster
Final_Model_segment$size
### Cluster assigned to each observation
Final_Model_segment$cluster
### Cluster labels with their observation counts
table(Final_Model_segment$cluster)

mydata$Cluster <- Final_Model_segment$cluster
View(mydata)
library(vegan)
library(permute)
library(lattice)
library(tidyverse)  # data manipulation
library(cluster)    # clustering algorithms
library(factoextra) # clustering visualization
library(dplyr)
library(dendextend) # for comparing two dendrograms
library(NbClust)
## Load the review data; the first column is dropped before clustering
cust_data <- read.csv(
  "C:/Users/Ankit Dsouza/Desktop/tanushree notes/Marketing Notes/Clustering/tripadvisor_review.csv"
)
head(cust_data)
cust_data <- cust_data[-1]
summary(cust_data)

# Exploratory 5-cluster model and its cluster plot
Model_Segment <- kmeans(cust_data, centers = 5, iter.max = 100)
Model_Segment
fviz_cluster(Model_Segment, data = cust_data)

# Elbow chart: total within-cluster sum of squares for k = 1..15.
# For k = 1 it is (n - 1) times the summed column variances.
mydata <- cust_data
wss <- (nrow(mydata) - 1) * sum(apply(mydata, 2, var))
for (i in 2:15) {
  wss[i] <- sum(kmeans(mydata, centers = i)$withinss)
}
plot(
  1:15, wss,
  type = "b",
  xlab = "Number of Clusters",
  ylab = "Within groups sum of squares",
  col = "mediumseagreen",
  pch = 12
)

### Final 3-cluster model
Final_Model_segment <- kmeans(cust_data, 3)
Final_Model_segment
### Centre point of each cluster (group means per variable)
Final_Model_segment$centers
### Cluster labels with their observation counts
table(Final_Model_segment$cluster)
mydata$Cluster <- Final_Model_segment$cluster
View(mydata)

# Export the centres and the labelled observations for checking in Excel
a <- data.frame(Final_Model_segment$centers)
write.csv(a, "try1.csv")
write.csv(mydata, "try.csv")
