Predict customer’s choice of Insurance Plan – AllState Insurance Dataset

PROBLEM STATEMENT :

Predict which customer will buy the last quoted plan, given the base line that most of the customers tend to buy the plan that was last quoted to them.

BUSINESS VALUE :

Knowing which customer will buy the last quoted plan to them can help in strategizing regarding choosing the optimum plan to be quoted last.

Files to use : All State Insurance data.

The R code for the problem can be found below:


library(dplyr)
library(caret)
library(ROCR)
library(randomForest)
library(nnet)
library(ggplot2)

#Load raw data
train <- read.csv("train.csv", header = TRUE)
test <- read.csv("test.csv", header = TRUE)

#Combined Datasets
data.combined <- rbind(train, test)

#Check the data type in the data frame
str(data.combined)

#Check and change necessary variables as factor
data.combined$record_type <- as.factor(data.combined$record_type)
data.combined$day <- as.factor(data.combined$day)
data.combined$state <- as.factor(data.combined$state)
data.combined$group_size <- as.integer(data.combined$group_size)
data.combined$homeowner <- as.integer(data.combined$homeowner)
data.combined$risk_factor <- as.integer(data.combined$risk_factor)
data.combined$married_couple <- as.integer(data.combined$married_couple)
data.combined$C_previous <- as.factor(data.combined$C_previous)
data.combined$time <- as.integer(data.combined$time)
data.combined$A <- as.integer(data.combined$A)
data.combined$B <- as.integer(data.combined$B)
data.combined$C <- as.integer(data.combined$C)
data.combined$D <- as.integer(data.combined$D)
data.combined$E <- as.integer(data.combined$E)
data.combined$F <- as.integer(data.combined$F)
data.combined$G <- as.integer(data.combined$G)

table(data.combined$C_previous)
table(data.combined$group_size)
table(data.combined$car_age)
table(data.combined$married_couple)
table(data.combined$homeowner)
table(data.combined$risk_factor)

#Feature Engineering
#Add an "insurance.plan" variable to combine all 7 options
data.combined$insurance.plan <- paste0(data.combined$A, data.combined$B, data.combined$C,
data.combined$D, data.combined$E, data.combined$F, data.combined$G)
#Extract the hour from time and find the time of the day
data.combined$extracted.hr <- substr(data.combined$time, 1, 2)
data.combined$time.of.day <- as.factor(ifelse(data.combined$extracted.hr %in% 6:16, "day",
ifelse(data.combined$extracted.hr %in% 17:19, "evening", "night")))
data.combined$isweekend <- as.factor(ifelse(data.combined$day %in% 0:5, "No", "Yes"))
data.combined$isfamily <- as.factor(ifelse(data.combined$group_size > 2 & data.combined$age_youngest < 25 &
data.combined$married_couple == 2, "Yes", "No"))
data.combined$iscouple <- as.factor(ifelse(data.combined$group_size ==2 & data.combined$married_couple ==2,
"Yes", "No"))
data.combined$isindividual <- as.factor(ifelse(data.combined$group_size == 1, "Yes", "No"))

# Replace NA's for duration_previous and C_previous
data.combined$duration_previous[is.na(data.combined$duration_previous)] <- 0
levels(data.combined$C_previous) <- c("1", "2", "3", "4", "none")
data.combined$C_previous[is.na(data.combined$C_previous)] <- "none"

#NA for riskfactor
data_noriskfactor <- data.combined[is.na(data.combined$risk_factor), ]
data_riskfactor <- data.combined[!is.na(data.combined$risk_factor), ]
risk.fit <- lm(risk_factor ~ age_youngest * group_size + married_couple + homeowner, data = data_riskfactor)
risk.pred <- predict(risk.fit, newdata = data_noriskfactor)
data.combined$risk_factor[is.na(data.combined$risk_factor)] <- round(risk.pred, 0)

#Assuming all car users' behavior above car_age 50 will be similar
data.combined$car_age[data.combined$car_age > 50] <- 50

train1 <- data.combined[1:665249, ]
test1 <- data.combined[665250:864105, ]

#train.purchase is subset of train that only includes purchases
train.purchase <- train1[!duplicated(train1$customer_ID, fromLast=TRUE), ]

# train.quote is subset of train that excludes purchases
train.quote <- train1[duplicated(train1$customer_ID, fromLast=TRUE), ]

# train.lastquote only includes last quote before purchase
train.lastquote <- train.quote[!duplicated(train.quote$customer_ID, fromLast=TRUE), ]

#Hypothesis : People tend to purchase the last quote
#See who changed from their last quote
lastquote.changed <- ifelse(train.purchase$insurance.plan == train.lastquote$insurance.plan, "No", "Yes")
train.purchase$lastquote.changed <- as.factor(lastquote.changed)
train.lastquote$lastquote.changed <- as.factor(lastquote.changed)

#Do some plotting and visualize

#Effect of time of day on last quote changed
ggplot(train.purchase, aes(x = lastquote.changed, fill = time.of.day)) +
stat_count(width = 0.5) +
ggtitle("Effect of Time of day on last quote changed") +
xlab("last quote changed") +
ylab("Total count") +
labs(fill = "time.of.day")

#Effect of weekend on last quote changed
ggplot(train.purchase, aes(x = lastquote.changed, fill = isweekend)) +
stat_count(width = 0.5) +
ggtitle("Effect of weekend on last quote changed") +
xlab("last quote changed") +
ylab("Total count") +
labs(fill = "isweekend")

#Effect of isfamily on last quote changed
ggplot(train.purchase, aes(x = lastquote.changed, fill = isfamily)) +
stat_count(width = 0.5) +
ggtitle("Effect of isfamily on last quote changed") +
xlab("last quote changed") +
ylab("Total count") +
labs(fill = "isfamily")

#Effect of iscouple on last quote changed
ggplot(train.purchase, aes(x = lastquote.changed, fill = iscouple)) +
stat_count(width = 0.5) +
ggtitle("Effect of iscouple on last quote changed") +
xlab("last quote changed") +
ylab("Total count") +
labs(fill = "iscouple")

#Effect of isindividual on last quote changed
ggplot(train.purchase, aes(x = lastquote.changed, fill = isindividual)) +
stat_count(width = 0.5) +
ggtitle("Effect of isindividual on last quote changed") +
xlab("last quote changed") +
ylab("Total count") +
labs(fill = "isindividual")

#Effect of risk_factor on last quote changed
ggplot(train.purchase, aes(x = risk_factor, fill = lastquote.changed)) +
stat_count(width = 0.1) +
ggtitle("Effect of risk_factor on last quote changed") +
xlab("risk_factor") +
ylab("Total count") +
labs(fill = "last quote change")

#Effect of car_age on last quote changed
ggplot(train.purchase, aes(x = car_age, fill = lastquote.changed)) +
stat_count(width = 0.5) +
ggtitle("Effect of car_age on last quote changed") +
xlab("car_age") +
ylab("Total count") +
labs(fill = "last quote change")

#Effect of state on last quote changed
ggplot(train.purchase, aes(x = state, fill = lastquote.changed)) +
stat_count(width = 0.5) +
theme(text = element_text(size = 7),
axis.text.x = element_text(angle=90, vjust=1)) +
ggtitle("Effect of state on last quote changed") +
xlab("state") +
ylab("Total count") +
labs(fill = "last quote change")

#Effect of home ownership on last quote changed
ggplot(train.purchase, aes(x = homeowner, fill = lastquote.changed)) +
stat_count(width = 0.5) +
ggtitle("Effect of home ownership on last quote changed") +
xlab("home owner") +
ylab("Total count") +
labs(fill = "last quote change")

#Train a random forest with default parameters using car_age, state, married couple, A, B, C, D, E, F, G to predict
#lastquote.changed
rf.train.1 <- train.purchase[, c("state", "car_age", "married_couple", "risk_factor", "C_previous", "A",
"B", "C", "D", "E", "F", "G")]
rf.label <- train.purchase$lastquote.changed

set.seed(1234)
rf.1 <- randomForest(x = rf.train.1, y = rf.label, importance = TRUE, ntree = 1000)
rf.1
varImpPlot(rf.1)

rf.train.2 <- train.purchase[, c("car_age", "car_value", "cost", "time.of.day", "isweekend", "isfamily", "isindividual",
"C_previous", "iscouple", "duration_previous", "A", "B", "C", "D", "E", "F", "G")]

set.seed(1234)
rf.2 <- randomForest(x = rf.train.2, y = rf.label, importance = TRUE, ntree = 1000)
rf.2
varImpPlot(rf.2)

rf.train.3 <- train.purchase[, c("car_age", "car_value", "cost", "C_previous", "duration_previous",
"A", "B", "C", "D", "E", "F", "G")]

set.seed(1234)
rf.3 <- randomForest(x = rf.train.3, y = rf.label, importance = TRUE, ntree = 1000)
rf.3
varImpPlot(rf.3)

rf.train.4 <- train.purchase[, c("car_age", "C_previous", "A", "B", "C", "D", "E", "F", "G")]

set.seed(1234)
rf.4 <- randomForest(x = rf.train.4, y = rf.label, importance = TRUE, ntree = 1000)
rf.4
varImpPlot(rf.4)

rf.train.5 <- train.purchase[, c("car_age", "C_previous", "A", "C", "D", "F")]

set.seed(1234)
rf.5 <- randomForest(x = rf.train.5, y = rf.label, importance = TRUE, ntree = 1000)
rf.5
varImpPlot(rf.5)

rf.train.6 <- train.purchase[, c("C_previous", "A", "C", "F")]

set.seed(1234)
rf.6 <- randomForest(x = rf.train.6, y = rf.label, importance = TRUE, ntree = 1000)
rf.6
varImpPlot(rf.6)

#Validation
#Apply the best model to test set
test.submit.df <- test1[, c("C_previous", "A", "C", "F")]

#Make Predictions
rf.6.preds <- predict(rf.6, test.submit.df)
table(rf.6.preds)

submit1.df <- data.frame(customer_ID = test1$customer_ID, lastquote.changed = rf.6.preds)
write.csv(submit1.df, file = "AllState_1.csv", row.names = FALSE)
Advertisements

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out / Change )

Twitter picture

You are commenting using your Twitter account. Log Out / Change )

Facebook photo

You are commenting using your Facebook account. Log Out / Change )

Google+ photo

You are commenting using your Google+ account. Log Out / Change )

Connecting to %s