Please answer all questions in the word file and provide all related R code.
HELP: Reading the data
# Reading the data
# Predictors (*.txt) and class labels (*_id.txt) are read from the working
# directory; none of the files has a header row.
train.x <- read.table(file = "train.txt", header = FALSE)
train.y <- read.table(file = "train_id.txt", header = FALSE)
test.x <- read.table(file = "test.txt", header = FALSE)
test.y <- read.table(file = "test_id.txt", header = FALSE)
# Combining the data: label column first, predictor columns after it
train.dt <- cbind(train.y, train.x)
test.dt <- cbind(test.y, test.x)
# Assign names for the columns: "Y" for the label, then "X1" ... "X4".
# This naming assumes one label column plus four predictor columns.
colnames(train.dt) <- c("Y", paste0("X", 1:4))
colnames(test.dt) <- c("Y", paste0("X", 1:4))
# Converting target Y to (0, 1) standard look (this is optional):
# a label equal to 1 becomes 0 and every other label becomes 1.
train.dt$Y <- ifelse(train.dt$Y == 1, 0, 1)
test.dt$Y <- ifelse(test.dt$Y == 1, 0, 1)
Sample Code:
Illustration with Stock Market data
Attached Files:
Illustration with Stock Market data.pdf (820.749 KB)
### Discriminant Analysis
### Comparing Logistic Regression, LDA and QDA
### Package ISLR
# Install ISLR only when it is missing, so re-running the script does not
# reinstall the package (and does not need network access) every time.
if (!requireNamespace("ISLR", quietly = TRUE)) {
  install.packages("ISLR")
}
library("ISLR")
head(Smarket)
class(Smarket)
# verify the format of data is data.frame
# Columns are referenced explicitly through Smarket$... instead of
# attach(Smarket), which would put every column on the search path.
# checking dependence of observations: each Lag1 value is plotted against
# the value in the following row; the row count is taken from the data
# rather than hard-coded.
n.obs <- nrow(Smarket)
plot(
  Smarket$Lag1[-n.obs] ~ Smarket$Lag1[-1],
  xlab = "Lag1[2:n]",
  ylab = "Lag1[1:(n - 1)]"
)
# splitting the data: years before 2005 are used for training, 2005 for testing
train <- subset(Smarket, Year < 2005)
test <- subset(Smarket, Year == 2005)
### Logistic Regression (full model)
# Direction is modelled on the five lag variables plus Volume, using the
# training rows only.
log.reg <- glm(
  Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
  data = train,
  family = "binomial"
)
summary(log.reg)
# predictions: type = "response" returns fitted probabilities instead of
# values on the log-odds scale.
# NOTE(review): the 0.5 rule below treats them as P(Direction == "Up");
# confirm that "Up" is the second level of Direction.
pred.log.reg <- predict(log.reg, test, type = "response")
# Turn probabilities into labels with
# ifelse(condition, value when TRUE, value when FALSE).
class.log.reg <- ifelse(pred.log.reg > 0.5, "Up", "Down")
# confusion matrix (rows = actual, columns = predicted); fixing the factor
# levels keeps both columns even when one class is never predicted
table(
  actual = test$Direction,
  predicted = factor(class.log.reg, levels = c("Down", "Up"))
)
# accuracy rate
accu.log <- mean(class.log.reg == test$Direction)
accu.log
# misclassification rate
misc.log <- 1 - accu.log
misc.log
# another way to check the error rate (i.e. misc rate)
# mean(ifelse(pred.log.reg > 0.5, "Up", "Down") != test$Direction)
#### LDA model full
library(MASS)
# Linear discriminant analysis on the same six predictors, training rows only.
lda.model <- lda(
  Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
  data = train
)
lda.model
# predictions: the result is a list whose $class element holds the predicted
# labels. No `type` argument is passed: predict() for lda fits does not
# define one, so it would be silently ignored.
pred.lda <- predict(lda.model, test)
# confusion matrix (rows = actual, columns = predicted)
table(actual = test$Direction, predicted = pred.lda$class)
# accuracy rate
accu.lda <- mean(pred.lda$class == test$Direction)
accu.lda
# misclassification rate
misc.lda <- 1 - accu.lda
misc.lda
#### QDA model full
# Quadratic discriminant analysis on the same six predictors, training rows
# only. qda() comes from MASS, which is loaded earlier in the script.
qda.model <- qda(
  Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
  data = train
)
qda.model
# predictions: the result is a list whose $class element holds the predicted
# labels. No `type` argument is passed: predict() for qda fits does not
# define one, so it would be silently ignored.
pred.qda <- predict(qda.model, test)
# confusion matrix (rows = actual, columns = predicted)
table(actual = test$Direction, predicted = pred.qda$class)
# accuracy rate
accu.qda <- mean(pred.qda$class == test$Direction)
accu.qda
# misclassification rate
misc.qda <- 1 - accu.qda
misc.qda
###########################
###### Reduced Models #######
###########################
### Logistic Regression (reduced)
# Same model family as the full fit, restricted to Lag1 and Lag2.
log.reg.reduced <- glm(
  Direction ~ Lag1 + Lag2,
  data = train,
  family = "binomial"
)
summary(log.reg.reduced)
# predictions: type = "response" returns fitted probabilities instead of
# values on the log-odds scale.
# NOTE(review): the 0.5 rule below treats them as P(Direction == "Up");
# confirm that "Up" is the second level of Direction.
pred.log.reg.reduced <- predict(log.reg.reduced, test, type = "response")
# Threshold the probabilities once and reuse the labels below.
class.log.reg.reduced <- ifelse(pred.log.reg.reduced > 0.5, "Up", "Down")
# confusion matrix (rows = actual, columns = predicted); fixing the factor
# levels keeps both columns even when one class is never predicted
table(
  actual = test$Direction,
  predicted = factor(class.log.reg.reduced, levels = c("Down", "Up"))
)
# accuracy rate
# NOTE: accu.log and misc.log are reused here, so the values stored for the
# full logistic model are overwritten.
accu.log <- mean(class.log.reg.reduced == test$Direction)
accu.log
# misclassification rate
misc.log <- 1 - accu.log
misc.log
#### LDA model (reduced)
# Linear discriminant analysis restricted to Lag1 and Lag2. lda() comes from
# MASS, which is loaded earlier in the script.
lda.reduced <- lda(Direction ~ Lag1 + Lag2, data = train)
lda.reduced
# predictions: the result is a list whose $class element holds the predicted
# labels and whose $posterior element holds the class probabilities. No
# `type` argument is passed: predict() for lda fits does not define one.
pred.lda.reduced <- predict(lda.reduced, test)
# confusion matrix (rows = actual, columns = predicted)
table(actual = test$Direction, predicted = pred.lda.reduced$class)
# accuracy rate
# NOTE: accu.lda and misc.lda are reused here, so the values stored for the
# full LDA model are overwritten.
accu.lda <- mean(pred.lda.reduced$class == test$Direction)
accu.lda
# misclassification rate
misc.lda <- 1 - accu.lda
misc.lda
#### QDA model (reduced)
# Quadratic discriminant analysis restricted to Lag1 and Lag2. qda() comes
# from MASS, which is loaded earlier in the script.
qda.reduced <- qda(Direction ~ Lag1 + Lag2, data = train)
qda.reduced
# predictions: the result is a list whose $class element holds the predicted
# labels and whose $posterior element holds the class probabilities. No
# `type` argument is passed: predict() for qda fits does not define one.
pred.qda.reduced <- predict(qda.reduced, test)
# confusion matrix (rows = actual, columns = predicted)
table(actual = test$Direction, predicted = pred.qda.reduced$class)
# accuracy rate
# NOTE: accu.qda and misc.qda are reused here, so the values stored for the
# full QDA model are overwritten.
accu.qda <- mean(pred.qda.reduced$class == test$Direction)
accu.qda
# misclassification rate
misc.qda <- 1 - accu.qda
misc.qda
#### ROC plots
# Install ROCR only when it is missing, so re-running the script does not
# reinstall the package (and does not need network access) every time.
if (!requireNamespace("ROCR", quietly = TRUE)) {
  install.packages("ROCR")
}
library("ROCR")
# For each reduced model, prediction() pairs a numeric score with the true
# test labels, and performance() derives the true-positive rate ("tpr")
# against the false-positive rate ("fpr").
# NOTE(review): the logistic score is treated as P(Direction == "Up"), to
# match the "Up" posterior column used for LDA and QDA; confirm the level
# order of Direction.
pred_LM <- prediction(pred.log.reg.reduced, test$Direction)
LM <- performance(pred_LM, measure = "tpr", x.measure = "fpr")
# The posterior column is selected by class name rather than by position, so
# a different level order raises an error instead of using the wrong class.
pred_LDA <- prediction(pred.lda.reduced$posterior[, "Up"], test$Direction)
LDA <- performance(pred_LDA, measure = "tpr", x.measure = "fpr")
pred_QDA <- prediction(pred.qda.reduced$posterior[, "Up"], test$Direction)
QDA <- performance(pred_QDA, measure = "tpr", x.measure = "fpr")
# Overlay the three curves on one set of axes and label them.
plot(LM, col = "black")
plot(LDA, add = TRUE, col = "orange")
plot(QDA, add = TRUE, col = "blue")
legend(
  "bottomright",
  legend = c("Logistic regression", "LDA", "QDA"),
  col = c("black", "orange", "blue"),
  lty = 1
)