Predicting Survival on Titanic

PROBLEM STATEMENT :

Predict, for each passenger, whether they survived the sinking of the Titanic.

Files to use :  Titanic Dataset

RESPONSE VARIABLE : Survived (1) or Perished (0)

FACTOR VARIABLES :

PassengerId
Pclass
Name
Sex
Age
SibSp
Parch
Ticket
Fare
Cabin
Embarked

DATA CLEANING :

I did some feature engineering to extract more valuable information from the given data, such as each passenger's family size and title. I then used a random forest to build a model that predicts who survived and who perished on the Titanic.

The R code is as follows :


# Load raw data ----
train <- read.csv("train.csv", header = TRUE)
test <- read.csv("test.csv", header = TRUE)

# Add a "Survived" column to the test set so both data frames have the same
# columns and can be combined. "none" is a placeholder level marking test
# rows whose labels are unknown, not a real outcome.
# (test[,] was a pointless full copy of test — just use test.)
test.survived <- data.frame(Survived = rep("none", nrow(test)), test)

# Combine the data sets. rbind() on data frames matches columns by NAME,
# so it is fine that Survived is the first column here but not in `train`.
data.combined <- rbind(train, test.survived)

# Check the data types in the combined data frame
str(data.combined)

# Name is free text, not a category; Survived and Pclass are categorical
data.combined$Name <- as.character(data.combined$Name)
data.combined$Survived <- as.factor(data.combined$Survived)
data.combined$Pclass <- as.factor(data.combined$Pclass)

# Gross survival counts (train labels plus the "none" placeholder rows)
table(data.combined$Survived)

# Passenger counts per ticket class
table(data.combined$Pclass)

# Count distinct passenger names across train + test
length(unique(data.combined$Name))

# Two names repeat; pull them out as a vector for a closer look
dup.names <- data.combined$Name[duplicated(data.combined$Name)]

# Inspect the duplicated names and the full records that share them
dup.names
data.combined[data.combined$Name %in% dup.names, ]

# Feature engineering: extract each passenger's title from Name ----
# Names look like "Braund, Mr. Owen Harris"; splitting on ',' and '.' puts
# the title in the second piece (with a leading space).
strsplit(data.combined$Name[1], split='[,.]')

strsplit(data.combined$Name[1], split='[,.]')[[1]]
strsplit(data.combined$Name[1], split='[,.]')[[1]][2]

# vapply (unlike sapply) guarantees a character vector even on degenerate
# input, so the Title column's type cannot silently change.
data.combined$Title <- vapply(data.combined$Name,
                              function(x) strsplit(x, split='[,.]')[[1]][2],
                              character(1))

# Strip the leading space left by the split. Anchor the pattern so only the
# leading space is removed (e.g. "the Countess" keeps its internal space).
data.combined$Title <- sub('^ ', '', data.combined$Title)

table(data.combined$Title)

# Collapse rare titles into broader buckets.
# NOTE(review): mapping Mme (= Mrs) and Mlle (= Miss) into 'Lady' is
# unusual; kept as-is to preserve the original model's features.
data.combined$Title[data.combined$Title %in% c('Mme', 'Mlle', 'Dona', 'Lady', 'the Countess')] <- 'Lady'
data.combined$Title[data.combined$Title %in% c('Capt', 'Don', 'Major', 'Sir', 'Jonkheer', 'Dr', 'Col', 'Rev')] <- 'Sir'
data.combined$Title[data.combined$Title %in% c('Miss', 'Ms')] <- 'Ms'

data.combined$Title <- as.factor(data.combined$Title)

# Family size = siblings/spouses + parents/children + the passenger
data.combined$FamilySize <- data.combined$SibSp + data.combined$Parch + 1

# Indicator: 1 if travelling alone (family size of one), else 0
data.combined$alone <- as.integer(data.combined$FamilySize == 1)
data.combined$alone <- as.factor(data.combined$alone)

# Quick distribution checks on the remaining candidate features
table(data.combined$Sex)
table(data.combined$Age)

library(randomForest)
# Train a random forest with default parameters using Pclass and Title

table(data.combined$Title)
table(data.combined$SibSp)
table(data.combined$Parch)
table(data.combined$Fare)
table(data.combined$Embarked)

# Locate records with missing Embarked / Fare.
# NOTE: in the standard Kaggle CSVs, a missing Embarked is coded as the
# empty string "", not NA, so the original is.na() check found nothing —
# check for both forms.
data.combined[which(is.na(data.combined$Embarked) | data.combined$Embarked == ""), ]
data.combined[is.na(data.combined$Fare), ]

# Fill the single missing Fare (a 3rd-class passenger) with the mean
# 3rd-class fare
class3avg <- mean(data.combined[which(data.combined$Pclass == 3),]$Fare, na.rm = TRUE)
data.combined[which(is.na(data.combined$Fare)), "Fare"] <- class3avg

# Impute missing Age with a linear model fit on rows where Age is known ----

# Rows with a known Age become the model's training set; rows with NA Age
# are the rows to impute.
training.missing <- data.combined[which(!is.na(data.combined$Age)), c("Age", "Title", "alone", "Fare", "Pclass")]
training.missing
missing.age <- data.combined[which(is.na(data.combined$Age)), c("Age", "Title", "alone", "Fare", "Pclass")]
missing.age

# (Removed: `missing.age$Age <- "none"` — it coerced the numeric Age column
# to character for no benefit; the column is fully overwritten below.)
lin.fit <- lm(Age ~ Fare + Pclass + Title + alone, data = training.missing)
predicted.ages <- predict(lin.fit, missing.age)
missing.age[,"Age"] <- predicted.ages

missing.age

# Write the imputed ages back into the combined frame; the follow-up check
# should print numeric(0), confirming no NA ages remain
data.combined[which(is.na(data.combined$Age)), "Age"] <- missing.age$Age
data.combined[which(is.na(data.combined$Age)), "Age"]

# Response variable: Survived for the training rows only
rf.label <- as.factor(train$Survived)

# Baseline forest on (almost) every engineered feature.
# Use nrow(train) rather than a hard-coded 891 so the train/test split
# stays correct if the input files ever change.
# NOTE(review): under R >= 4.0 read.csv leaves Sex/Embarked as character,
# which randomForest rejects — confirm they are factors in this session.
rf.train.1 <- data.combined[seq_len(nrow(train)), c("Pclass", "Title", "Sex", "SibSp", "Parch", "Fare", "Embarked", "FamilySize", "alone")]
set.seed(1333)
rf.1 <- randomForest(x = rf.train.1, y = rf.label, importance = TRUE, ntree = 100, mtry = 2)
rf.1
varImpPlot(rf.1)

# The experiments below differed only in feature set and mtry, so the
# shared fit/report logic is factored into a helper.
#
# Fit a random forest of `features` (training rows only) against rf.label,
# print its OOB summary, plot variable importance, and return the model.
# Reseeding immediately before the fit keeps every run reproducible and
# matches the original script's RNG sequence exactly.
fit_rf <- function(features, mtry, ntree = 100, seed = 1333) {
  set.seed(seed)
  fit <- randomForest(x = features, y = rf.label, importance = TRUE,
                      ntree = ntree, mtry = mtry)
  print(fit)
  varImpPlot(fit)
  fit
}

# Feature-set experiments 2-11 (rf.1 above is the baseline)
rf.train.2 <- data.combined[1:891, c("Pclass", "Title", "Sex", "Fare", "Embarked", "alone")]
rf.2 <- fit_rf(rf.train.2, mtry = 2)

rf.train.3 <- data.combined[1:891, c("Pclass", "Title", "Sex", "Fare", "alone")]
rf.3 <- fit_rf(rf.train.3, mtry = 3)

rf.train.4 <- data.combined[1:891, c("Pclass", "Title", "Sex", "Fare", "FamilySize")]
rf.4 <- fit_rf(rf.train.4, mtry = 2)

rf.train.5 <- data.combined[1:891, c("Pclass", "Title", "Fare", "FamilySize")]
rf.5 <- fit_rf(rf.train.5, mtry = 3)

rf.train.6 <- data.combined[1:891, c("Pclass", "Sex", "Fare", "FamilySize")]
rf.6 <- fit_rf(rf.train.6, mtry = 2)

rf.train.7 <- data.combined[1:891, c("Pclass", "Title", "Fare", "alone")]
rf.7 <- fit_rf(rf.train.7, mtry = 2)

rf.train.8 <- data.combined[1:891, c("Pclass", "Title", "FamilySize")]
rf.8 <- fit_rf(rf.train.8, mtry = 2)

rf.train.9 <- data.combined[1:891, c("Pclass", "Title", "Fare", "FamilySize", "Age")]
rf.9 <- fit_rf(rf.train.9, mtry = 2)

rf.train.10 <- data.combined[1:891, c("Pclass", "Title", "Fare", "FamilySize", "Age", "Sex")]
rf.10 <- fit_rf(rf.train.10, mtry = 2)

rf.train.11 <- data.combined[1:891, c("Pclass", "Title", "FamilySize", "Sex")]
rf.11 <- fit_rf(rf.train.11, mtry = 2)

# Apply the best model (rf.10's feature set) to the test rows.
# Derive the slice from nrow(train) instead of hard-coding 892:1309.
test.submit.df <- data.combined[(nrow(train) + 1):nrow(data.combined), c("Pclass", "Title", "Fare", "FamilySize", "Age", "Sex")]

# Make predictions
rf.preds <- predict(rf.10, test.submit.df)
table(rf.preds)

# Write a CSV file to submit to Kaggle. Take the ids from the test set
# itself (rep() around a plain vector was a no-op).
submit.df <- data.frame(PassengerId = test$PassengerId, Survived = rf.preds)
write.csv(submit.df, file = "Submit_01272017_5.csv", row.names = FALSE)

# Cross Validation ----
library(caret)
library(doSNOW)

# 10-fold cross-validation repeated 10 times.

# Leverage caret to create 100 folds while keeping the survived/perished
# ratio in each fold close to the overall training ratio. This is known as
# STRATIFIED VALIDATION.

set.seed(1333)
cv.10.folds <- createMultiFolds(rf.label, k = 10, times = 10)

# Check stratification: overall ratio vs the ratio in one arbitrary fold
table(rf.label)
342 / 549

table(rf.label[cv.10.folds[[33]]])
308 / 494

# Set up caret's trainControl object with the pre-built folds
ctrl.1 <- trainControl(method = "repeatedcv", number = 10, repeats = 10, index = cv.10.folds)

# Use the doSNOW package for multi-core training
cl <- makeCluster(2, type = "SOCK")
registerDoSNOW(cl)

# (Removed: a stray `install.packages('caret', dependencies = TRUE)` —
# packages are installed once, interactively, never mid-script after the
# library has already been loaded.)

# Set seed for reproducibility and train
set.seed(1333)
rf.5.cv.1 <- train(x = rf.train.11, y = rf.label, method = "rf", tuneLength = 2, ntree = 100, trControl = ctrl.1)

# Shut down the cluster
stopCluster(cl)

# Check the results
rf.5.cv.1

# The 10-fold results above are slightly more pessimistic than rf.5's OOB
# estimate. Try 5-fold CV repeated 10 times.
set.seed(1333)
cv.5.folds <- createMultiFolds(rf.label, k = 5, times = 10)

# trainControl mirroring the fold structure (5 folds x 10 repeats)
ctrl.2 <- trainControl(method = "repeatedcv", number = 5, repeats = 10, index = cv.5.folds)

# Spin up a 2-worker SNOW cluster so caret can train resamples in parallel
cl <- makeCluster(2, type = "SOCK")
registerDoSNOW(cl)

# Reseed for reproducibility, then train on the rf.train.11 feature set
set.seed(1333)
rf.5.cv.2 <- train(x = rf.train.11, y = rf.label, method = "rf", tuneLength = 2, ntree = 100, trControl = ctrl.2)

# Shut down the cluster
stopCluster(cl)

# Check the results
rf.5.cv.2

# 5-fold is not better, so move on to 3-fold CV repeated 10 times
set.seed(1333)
cv.3.folds <- createMultiFolds(rf.label, k = 3, times = 10)

# trainControl mirroring the fold structure (3 folds x 10 repeats)
ctrl.3 <- trainControl(method = "repeatedcv", number = 3, repeats = 10, index = cv.3.folds)

# Spin up a 2-worker SNOW cluster for parallel training
cl <- makeCluster(2, type = "SOCK")
registerDoSNOW(cl)

# Reseed for reproducibility, then train the same rf.train.11 feature set
set.seed(1333)
rf.5.cv.3 <- train(x = rf.train.11, y = rf.label, method = "rf", tuneLength = 2, ntree = 100, trControl = ctrl.3)

# Shut down the cluster
stopCluster(cl)

# Check the results
rf.5.cv.3

# Make predictions with the cross-validated model.
# NOTE(review): this uses rf.5.cv.1 (the 10-fold model) even though the
# 3-fold model was trained last — the "_10" in the file name suggests this
# is intentional, but confirm.
rf.preds1 <- predict(rf.5.cv.1, test.submit.df)
table(rf.preds1)

# Write a CSV file to submit to Kaggle (rep() around a plain vector was a
# no-op, so use the id range directly)
submit.df <- data.frame(PassengerId = 892:1309, Survived = rf.preds1)
write.csv(submit.df, file = "Submit_01272017_10.csv", row.names = FALSE)

Advertisements

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out / Change )

Twitter picture

You are commenting using your Twitter account. Log Out / Change )

Facebook photo

You are commenting using your Facebook account. Log Out / Change )

Google+ photo

You are commenting using your Google+ account. Log Out / Change )

Connecting to %s