library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(e1071)
library(plotly)
## Loading required package: ggplot2
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(tidyverse)
##
## v tibble 3.0.3 v purrr 0.3.4
## v tidyr 1.1.2 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.5.0
##
## x plotly::filter() masks dplyr::filter(), stats::filter()
## x dplyr::lag() masks stats::lag()
library(class)
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
# Load the labelled data set and the two competition sets ("No Salary" and
# "No Attrition") from one directory so the location is stated only once.
data_dir <- "/Users/ysojd/Desktop/GRAD SCHOOL COURSEWORK/MSDS_6306_Doing-Data-Science-Master/MSDS_6306_Doing-Data-Science-Master/Unit 14 and 15 Case Study 2"
origData <- read.csv(file.path(data_dir, "CaseStudy2-data.csv"), header = TRUE)
NoSalary <- readxl::read_xlsx(file.path(data_dir, "CaseStudy2CompSet No Salary.xlsx"))
NoAttrition <- read.csv(file.path(data_dir, "CaseStudy2CompSet No Attrition.csv"), header = TRUE)

# Drop columns that hold a single distinct value; they cannot inform a model.
# The guard matters: with no constant column `which()` returns integer(0), and
# `df[, -integer(0)]` selects ZERO columns instead of keeping all of them.
to.be.deleted <- which(
  vapply(origData, function(col) length(unique(col)) == 1, logical(1))
)
if (length(to.be.deleted) > 0) {
  origData <- origData[, -to.be.deleted, drop = FALSE]
}

# Character columns become factors so the modelling functions treat them as
# categorical predictors.
origData <- mutate_if(origData, is.character, as.factor)
# Analysis plots ----

# Overall attrition count
ggplot(origData, aes(x = Attrition)) +
  geom_bar()

# Attrition count by department (horizontal bars)
ggplot(origData, aes(x = Department, fill = Attrition)) +
  geom_bar() +
  coord_flip()

# Attrition count by gender (horizontal bars)
ggplot(origData, aes(x = Gender, fill = Attrition)) +
  geom_bar() +
  coord_flip()

# Share of attrition within each environment-satisfaction score;
# position = "fill" rescales every bar to 100%.
ggplot(origData, aes(x = EnvironmentSatisfaction, fill = Attrition)) +
  geom_bar(position = "fill") +
  scale_y_continuous(labels = scales::percent) +
  labs(
    title = "Attrition Due to Environmental Satisfaction",
    y = "Attrition Rate",
    x = "Satisfaction"
  )

# Salary hike vs. monthly income, one panel per Attrition x JobLevel cell
pl <- ggplot(origData, aes(x = MonthlyIncome, y = PercentSalaryHike)) +
  geom_point(shape = 2) +
  ggtitle("Effect of Job Level(1-5), PercentSalaryHike and MonthlyIncome on Attrition(Y/N)")
pl + facet_grid(Attrition ~ JobLevel)
##Create Classification Model #NB
library(lattice)
library(caret)
library(mlbench)
## Warning: package 'mlbench' was built under R version 4.0.3
library(rsample)
## Warning: package 'rsample' was built under R version 4.0.3
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.0.3
## corrplot 0.84 loaded
library(h2o)
## Warning: package 'h2o' was built under R version 4.0.3
##
## ----------------------------------------------------------------------
##
## Your next step is to start H2O:
## > h2o.init()
##
## For H2O package documentation, ask for help:
## > ??h2o
##
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit https://docs.h2o.ai
##
## ----------------------------------------------------------------------
##
## Attaching package: 'h2o'
## The following objects are masked from 'package:stats':
##
## cor, sd, var
## The following objects are masked from 'package:base':
##
## %*%, %in%, &&, ||, apply, as.factor, as.numeric, colnames,
## colnames<-, ifelse, is.character, is.factor, is.numeric, log,
## log10, log1p, log2, round, signif, trunc
library(ggplot2)
library(dplyr)
library(tidyr)
# Rank predictors of Attrition with a learning-vector-quantisation model,
# resampled by 10-fold cross-validation repeated 3 times.
control <- trainControl(method = "repeatedcv", number = 10, repeats = 3)

# Fit on every remaining column, scaling predictors first.
model <- train(
  Attrition ~ .,
  data = origData,
  method = "lvq",
  preProcess = "scale",
  trControl = control
)

# Unscaled variable importance, printed as a ranked table.
importance <- varImp(model, scale = FALSE)
print(importance)
## ROC curve variable importance
##
## only 20 most important variables shown (out of 32)
##
## Importance
## OverTime 0.6679
## MonthlyIncome 0.6567
## TotalWorkingYears 0.6563
## YearsAtCompany 0.6470
## StockOptionLevel 0.6455
## MaritalStatus 0.6438
## JobLevel 0.6406
## YearsInCurrentRole 0.6403
## YearsWithCurrManager 0.6291
## Age 0.6265
## JobInvolvement 0.6159
## JobSatisfaction 0.5833
## JobRole 0.5829
## Department 0.5605
## DistanceFromHome 0.5586
## EnvironmentSatisfaction 0.5532
## WorkLifeBalance 0.5491
## TrainingTimesLastYear 0.5428
## Education 0.5384
## ID 0.5371
# Treat these integer-coded columns as categorical rather than numeric.
factor_cols <- c("JobLevel", "StockOptionLevel", "TrainingTimesLastYear")
origData[factor_cols] <- lapply(origData[factor_cols], factor)

# 70/30 train/test split, stratified on the response so both parts keep the
# same attrition rate.
set.seed(13)
split <- initial_split(origData, strata = "Attrition", prop = 0.7)
train <- training(split)
test <- testing(split)

# Class proportions in the training set, then in the test set
prop.table(table(train$Attrition))
##
## No Yes
## 0.8390805 0.1609195
prop.table(table(test$Attrition))
##
## No Yes
## 0.8390805 0.1609195
# Correlation matrix of the numeric predictors for employees who left.
# `filter` is namespaced because plotly masks dplyr::filter in this session.
train %>%
  dplyr::filter(Attrition == "Yes") %>%
  select_if(is.numeric) %>%
  cor() %>%
  corrplot::corrplot()

# Density of a handful of numeric predictors, one free-scaled panel each.
# pivot_longer() replaces the superseded gather(); the long layout is the
# same two columns (metric, value).
train %>%
  dplyr::select(
    MonthlyIncome, Education, PerformanceRating,
    TotalWorkingYears, HourlyRate, JobInvolvement
  ) %>%
  tidyr::pivot_longer(
    cols = dplyr::everything(),
    names_to = "metric",
    values_to = "value"
  ) %>%
  ggplot(aes(value, fill = metric)) +
  geom_density(show.legend = FALSE) +
  facet_wrap(~ metric, scales = "free")
# Predictors are every training column except the response.
features <- names(train)[names(train) != "Attrition"]
x <- train[, features]
y <- train$Attrition

# 10-fold cross-validation; "repeatedcv" with a single repeat (the default).
train_control <- trainControl(method = "repeatedcv", number = 10, repeats = 1)

# Baseline naive Bayes fit with caret's default tuning grid.
nb.m1 <- suppressWarnings(
  train(x = x, y = y, method = "nb", trControl = train_control)
)

# Cross-validated confusion matrix (cell percentages averaged over resamples)
confusionMatrix(nb.m1)
## Cross-Validated (10 fold, repeated 1 times) Confusion Matrix
##
## (entries are percentual average cell counts across resamples)
##
## Reference
## Prediction No Yes
## No 81.3 11.8
## Yes 2.6 4.3
##
## Accuracy (average) : 0.8555
# Tuning grid for caret's "nb" method.
# `adjust` is the kernel bandwidth multiplier passed through to density();
# a value of 0 gives a non-positive bandwidth, which density() rejects, so the
# grid starts at 1 instead of spending resamples on fits that can only fail.
search_grid <- expand.grid(
  usekernel = c(TRUE, FALSE),
  fL = 0:5,
  adjust = seq(1, 5, by = 1)
)

# Tuned naive Bayes: Box-Cox, centring, scaling and PCA are applied before
# fitting. The argument is spelled `preProcess` in full rather than relying on
# partial matching of `preProc`.
nb.m2 <- suppressWarnings(train(
  x = x,
  y = y,
  method = "nb",
  trControl = train_control,
  tuneGrid = search_grid,
  preProcess = c("BoxCox", "center", "scale", "pca")
))

# Top 5 models by cross-validated accuracy (ties can return more than 5 rows)
nb.m2$results %>%
  top_n(5, wt = Accuracy) %>%
  arrange(desc(Accuracy))
## usekernel fL adjust Accuracy Kappa AccuracySD KappaSD
## 1 FALSE 2 0 0.8668808 0.4233602 0.04442369 0.1925121
## 2 FALSE 2 1 0.8668808 0.4233602 0.04442369 0.1925121
## 3 FALSE 2 2 0.8668808 0.4233602 0.04442369 0.1925121
## 4 FALSE 2 3 0.8668808 0.4233602 0.04442369 0.1925121
## 5 FALSE 2 4 0.8668808 0.4233602 0.04442369 0.1925121
## 6 FALSE 2 5 0.8668808 0.4233602 0.04442369 0.1925121
## usekernel fL adjust Accuracy Kappa AccuracySD KappaSD
## 1 TRUE 1 3 0.8737864 0.4435322 0.02858175 0.1262286
## 2 TRUE 0 2 0.8689320 0.4386202 0.02903618 0.1155707
## 3 TRUE 2 3 0.8689320 0.4750282 0.02830559 0.0970368
## 4 TRUE 2 4 0.8689320 0.4008608 0.02432572 0.1234943
## 5 TRUE 4 5 0.8689320 0.4439767 0.02867321 0.1354681
# Accuracy across the tuning grid, followed by the cross-validated
# confusion matrix of the selected model.
plot(x = nb.m2)
confusionMatrix(data = nb.m2)
## Cross-Validated (10 fold, repeated 1 times) Confusion Matrix
##
## (entries are percentual average cell counts across resamples)
##
## Reference
## Prediction No Yes
## No 80.1 9.5
## Yes 3.8 6.6
##
## Accuracy (average) : 0.867
# Score the held-out test rows with the tuned model and compare the
# predictions with the observed attrition labels.
pred <- suppressWarnings(
  predict(nb.m2, newdata = test)
)
confusionMatrix(data = pred, reference = as.factor(test$Attrition))
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 201 17
## Yes 18 25
##
## Accuracy : 0.8659
## 95% CI : (0.8185, 0.9048)
## No Information Rate : 0.8391
## P-Value [Acc > NIR] : 0.1358
##
## Kappa : 0.5082
##
## Mcnemar's Test P-Value : 1.0000
##
## Sensitivity : 0.9178
## Specificity : 0.5952
## Pos Pred Value : 0.9220
## Neg Pred Value : 0.5814
## Prevalence : 0.8391
## Detection Rate : 0.7701
## Detection Prevalence : 0.8352
## Balanced Accuracy : 0.7565
##
## 'Positive' Class : No
##
# Average accuracy, sensitivity and specificity of an e1071 naive Bayes model
# over repeated random train/test splits.
# NOTE: the loop reassigns the globals `train`, `test` and `model`, replacing
# the stratified split and the LVQ model created earlier in the script.
set.seed(13)
iterations <- 100
splitPerc <- .75 # Training / Test split Percentage

# Sizes do not change between iterations, so compute them once.
n_obs <- nrow(origData)
n_train <- round(splitPerc * n_obs)

# One row per iteration; columns are accuracy, sensitivity, specificity.
masterAcc <- matrix(nrow = iterations, ncol = 3)

for (j in seq_len(iterations)) {
  trainIndices <- sample(seq_len(n_obs), n_train)
  train <- origData[trainIndices, ]
  test <- origData[-trainIndices, ]
  model <- naiveBayes(Attrition ~ ., data = train, laplace = 1)
  CM <- confusionMatrix(table(predict(model, test), test$Attrition))
  # Pick metrics by name instead of position so the intent is explicit.
  # Sensitivity/specificity are relative to caret's positive class, which the
  # earlier confusionMatrix output in this file reports as "No".
  masterAcc[j, ] <- c(
    CM$overall[["Accuracy"]],
    CM$byClass[["Sensitivity"]],
    CM$byClass[["Specificity"]]
  )
}

MeanAcc <- colMeans(masterAcc)
MeanAcc
## [1] 0.8286697 0.8719328 0.6083380
# Apply the trained classifier to the NoAttrition competition set.
# Start by inspecting its column types.
utils::str(NoAttrition)
## 'data.frame': 300 obs. of 35 variables:
## $ ID : int 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 ...
## $ Age : int 35 33 26 55 29 51 52 39 31 31 ...
## $ BusinessTravel : chr "Travel_Rarely" "Travel_Rarely" "Travel_Rarely" "Travel_Rarely" ...
## $ DailyRate : int 750 147 1330 1311 1246 1456 585 1387 1062 534 ...
## $ Department : chr "Research & Development" "Human Resources" "Research & Development" "Research & Development" ...
## $ DistanceFromHome : int 28 2 21 2 19 1 29 10 24 20 ...
## $ Education : int 3 3 3 3 3 4 4 5 3 3 ...
## $ EducationField : chr "Life Sciences" "Human Resources" "Medical" "Life Sciences" ...
## $ EmployeeCount : int 1 1 1 1 1 1 1 1 1 1 ...
## $ EmployeeNumber : int 1596 1207 1107 505 1497 145 2019 1618 1252 587 ...
## $ EnvironmentSatisfaction : int 2 2 1 3 3 1 1 2 3 1 ...
## $ Gender : chr "Male" "Male" "Male" "Female" ...
## $ HourlyRate : int 46 99 37 97 77 30 40 76 96 66 ...
## $ JobInvolvement : int 4 3 3 3 2 2 3 3 2 3 ...
## $ JobLevel : int 2 1 1 4 2 3 1 2 2 3 ...
## $ JobRole : chr "Laboratory Technician" "Human Resources" "Laboratory Technician" "Manager" ...
## $ JobSatisfaction : int 3 3 3 4 3 1 4 1 1 3 ...
## $ MaritalStatus : chr "Married" "Married" "Divorced" "Single" ...
## $ MonthlyIncome : int 3407 3600 2377 16659 8620 7484 3482 5377 6812 9824 ...
## $ MonthlyRate : int 25348 8429 19373 23258 23757 25796 19788 3835 17198 22908 ...
## $ NumCompaniesWorked : int 1 1 1 2 1 3 2 2 1 3 ...
## $ Over18 : chr "Y" "Y" "Y" "Y" ...
## $ OverTime : chr "No" "No" "No" "Yes" ...
## $ PercentSalaryHike : int 17 13 20 13 14 20 15 13 19 12 ...
## $ PerformanceRating : int 3 3 4 3 3 4 3 3 3 3 ...
## $ RelationshipSatisfaction: int 4 4 3 3 3 3 2 4 2 1 ...
## $ StandardHours : int 80 80 80 80 80 80 80 80 80 80 ...
## $ StockOptionLevel : int 2 1 1 0 2 0 2 3 0 0 ...
## $ TotalWorkingYears : int 10 5 1 30 10 23 16 10 10 12 ...
## $ TrainingTimesLastYear : int 3 2 0 2 3 1 3 3 2 2 ...
## $ WorkLifeBalance : int 2 3 2 3 3 2 2 3 3 3 ...
## $ YearsAtCompany : int 10 5 1 5 10 13 9 7 10 1 ...
## $ YearsInCurrentRole : int 9 4 1 4 7 12 8 7 9 0 ...
## $ YearsSinceLastPromotion : int 6 1 0 1 0 12 0 7 1 0 ...
## $ YearsWithCurrManager : int 8 4 0 2 4 8 0 7 8 0 ...
# Drop single-valued columns from the competition set, as was done for the
# training data. The length guard avoids `df[, -integer(0)]`, which would
# select zero columns when nothing qualifies for removal.
to.be.deleted2 <- which(
  vapply(NoAttrition, function(col) length(unique(col)) == 1, logical(1))
)
if (length(to.be.deleted2) > 0) {
  NoAttrition <- NoAttrition[, -to.be.deleted2, drop = FALSE]
}

# Character columns become factors, then re-check the structure.
NoAttrition <- mutate_if(NoAttrition, is.character, as.factor)
str(NoAttrition)
## 'data.frame': 300 obs. of 32 variables:
## $ ID : int 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 ...
## $ Age : int 35 33 26 55 29 51 52 39 31 31 ...
## $ BusinessTravel : Factor w/ 3 levels "Non-Travel","Travel_Frequently",..: 3 3 3 3 3 2 1 3 3 2 ...
## $ DailyRate : int 750 147 1330 1311 1246 1456 585 1387 1062 534 ...
## $ Department : Factor w/ 3 levels "Human Resources",..: 2 1 2 2 3 2 3 2 2 2 ...
## $ DistanceFromHome : int 28 2 21 2 19 1 29 10 24 20 ...
## $ Education : int 3 3 3 3 3 4 4 5 3 3 ...
## $ EducationField : Factor w/ 6 levels "Human Resources",..: 2 1 4 2 2 4 2 4 4 2 ...
## $ EmployeeNumber : int 1596 1207 1107 505 1497 145 2019 1618 1252 587 ...
## $ EnvironmentSatisfaction : int 2 2 1 3 3 1 1 2 3 1 ...
## $ Gender : Factor w/ 2 levels "Female","Male": 2 2 2 1 2 1 2 2 1 2 ...
## $ HourlyRate : int 46 99 37 97 77 30 40 76 96 66 ...
## $ JobInvolvement : int 4 3 3 3 2 2 3 3 2 3 ...
## $ JobLevel : int 2 1 1 4 2 3 1 2 2 3 ...
## $ JobRole : Factor w/ 9 levels "Healthcare Representative",..: 3 2 3 4 8 1 9 5 1 1 ...
## $ JobSatisfaction : int 3 3 3 4 3 1 4 1 1 3 ...
## $ MaritalStatus : Factor w/ 3 levels "Divorced","Married",..: 2 2 1 3 1 3 1 2 3 2 ...
## $ MonthlyIncome : int 3407 3600 2377 16659 8620 7484 3482 5377 6812 9824 ...
## $ MonthlyRate : int 25348 8429 19373 23258 23757 25796 19788 3835 17198 22908 ...
## $ NumCompaniesWorked : int 1 1 1 2 1 3 2 2 1 3 ...
## $ OverTime : Factor w/ 2 levels "No","Yes": 1 1 1 2 1 1 1 1 1 1 ...
## $ PercentSalaryHike : int 17 13 20 13 14 20 15 13 19 12 ...
## $ PerformanceRating : int 3 3 4 3 3 4 3 3 3 3 ...
## $ RelationshipSatisfaction: int 4 4 3 3 3 3 2 4 2 1 ...
## $ StockOptionLevel : int 2 1 1 0 2 0 2 3 0 0 ...
## $ TotalWorkingYears : int 10 5 1 30 10 23 16 10 10 12 ...
## $ TrainingTimesLastYear : int 3 2 0 2 3 1 3 3 2 2 ...
## $ WorkLifeBalance : int 2 3 2 3 3 2 2 3 3 3 ...
## $ YearsAtCompany : int 10 5 1 5 10 13 9 7 10 1 ...
## $ YearsInCurrentRole : int 9 4 1 4 7 12 8 7 9 0 ...
## $ YearsSinceLastPromotion : int 6 1 0 1 0 12 0 7 1 0 ...
## $ YearsWithCurrManager : int 8 4 0 2 4 8 0 7 8 0 ...
# nb.m2 was trained on data where these three columns are factors, while
# NoAttrition still stores them as integers. Convert them with the training
# levels so the new data has the predictor types the model was fitted on
# (the NoSalary set receives the same conversion before its prediction).
for (ordinal_col in c("JobLevel", "StockOptionLevel", "TrainingTimesLastYear")) {
  NoAttrition[[ordinal_col]] <- factor(
    NoAttrition[[ordinal_col]],
    levels = levels(origData[[ordinal_col]])
  )
}

# Predict attrition for the competition set and keep ID + predicted label.
predNoAttrition <- suppressWarnings(predict(nb.m2, NoAttrition))
NoAttrition$Attrition <- predNoAttrition
a <- NoAttrition %>% dplyr::select(ID, Attrition)
write.csv(a, "/Users/ysojd/Desktop/GRAD SCHOOL COURSEWORK/MSDS_6306_Doing-Data-Science-Master/MSDS_6306_Doing-Data-Science-Master/Unit 14 and 15 Case Study 2/Case2PredictionsYvanSojdehei Attrition.csv", row.names = FALSE)
# Analysis and creation of the Monthly Income regression model ----

# Histogram of monthly income; the returned object supplies the bin midpoints
# used to scale the overlay below.
histogram.curve <- hist(origData$MonthlyIncome, breaks = 10, col = "purple", xlab = "Monthly Income", main = "Histogram with Normal Curve")

# Overlay a normal curve with the sample mean and standard deviation.
# The column is referenced by name instead of position 19 so the code keeps
# working if the column order changes.
income <- origData$MonthlyIncome
xfit <- seq(min(income), max(income), length.out = 40)
yfit <- dnorm(xfit, mean = mean(income), sd = stats::sd(income))
# Rescale the density to counts: bin width times number of observations.
yfit <- yfit * diff(histogram.curve$mids[1:2]) * length(income)
lines(xfit, yfit, col = "black", lwd = 2)
#Create Regression Model
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:plotly':
##
## select
## The following object is masked from 'package:dplyr':
##
## select
# Linear regression for MonthlyIncome with stepwise AIC selection, assessed by
# 10-fold cross-validation.
# NOTE(review): the formula offers every remaining column as a predictor,
# including the row identifier ID - confirm that is intended.
set.seed(24)
train.control <- trainControl(number = 10, method = "cv")
reg.model <- train(
  MonthlyIncome ~ .,
  data = origData,
  method = "lmStepAIC",
  trControl = train.control,
  trace = FALSE
)

# Cross-validated error metrics
reg.model$results
## parameter RMSE Rsquared MAE RMSESD RsquaredSD MAESD
## 1 none 1018.389 0.9495696 783.2915 133.111 0.01821594 91.06654
# The lm object retained after stepwise selection
reg.model$finalModel
##
## Call:
## lm(formula = .outcome ~ ID + BusinessTravelTravel_Frequently +
## BusinessTravelTravel_Rarely + DailyRate + DepartmentSales +
## JobLevel2 + JobLevel3 + JobLevel4 + JobLevel5 + `JobRoleHuman Resources` +
## `JobRoleLaboratory Technician` + JobRoleManager + `JobRoleResearch Director` +
## `JobRoleResearch Scientist` + `JobRoleSales Executive` +
## `JobRoleSales Representative` + TotalWorkingYears, data = dat)
##
## Coefficients:
## (Intercept) ID
## 3346.1958 -0.2286
## BusinessTravelTravel_Frequently BusinessTravelTravel_Rarely
## 191.8221 344.2855
## DailyRate DepartmentSales
## 0.1743 -533.4219
## JobLevel2 JobLevel3
## 1709.8489 4941.0322
## JobLevel4 JobLevel5
## 8268.5069 10901.0339
## `JobRoleHuman Resources` `JobRoleLaboratory Technician`
## -1147.7209 -1280.4637
## JobRoleManager `JobRoleResearch Director`
## 3535.9047 3453.8536
## `JobRoleResearch Scientist` `JobRoleSales Executive`
## -1090.2156 457.3952
## `JobRoleSales Representative` TotalWorkingYears
## -767.4799 45.3283
# Coefficient table and fit statistics of the selected linear model
summary(reg.model[["finalModel"]])
##
## Call:
## lm(formula = .outcome ~ ID + BusinessTravelTravel_Frequently +
## BusinessTravelTravel_Rarely + DailyRate + DepartmentSales +
## JobLevel2 + JobLevel3 + JobLevel4 + JobLevel5 + `JobRoleHuman Resources` +
## `JobRoleLaboratory Technician` + JobRoleManager + `JobRoleResearch Director` +
## `JobRoleResearch Scientist` + `JobRoleSales Executive` +
## `JobRoleSales Representative` + TotalWorkingYears, data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3208.9 -612.0 -67.1 614.8 4188.1
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.346e+03 2.090e+02 16.008 < 2e-16 ***
## ID -2.286e-01 1.363e-01 -1.678 0.09374 .
## BusinessTravelTravel_Frequently 1.918e+02 1.314e+02 1.460 0.14468
## BusinessTravelTravel_Rarely 3.443e+02 1.113e+02 3.092 0.00205 **
## DailyRate 1.743e-01 8.503e-02 2.050 0.04066 *
## DepartmentSales -5.334e+02 2.887e+02 -1.848 0.06496 .
## JobLevel2 1.710e+03 1.382e+02 12.375 < 2e-16 ***
## JobLevel3 4.941e+03 1.857e+02 26.611 < 2e-16 ***
## JobLevel4 8.269e+03 2.801e+02 29.519 < 2e-16 ***
## JobLevel5 1.090e+04 3.309e+02 32.947 < 2e-16 ***
## `JobRoleHuman Resources` -1.148e+03 2.368e+02 -4.847 1.49e-06 ***
## `JobRoleLaboratory Technician` -1.280e+03 1.532e+02 -8.358 2.58e-16 ***
## JobRoleManager 3.536e+03 2.514e+02 14.062 < 2e-16 ***
## `JobRoleResearch Director` 3.454e+03 1.926e+02 17.937 < 2e-16 ***
## `JobRoleResearch Scientist` -1.090e+03 1.569e+02 -6.950 7.26e-12 ***
## `JobRoleSales Executive` 4.574e+02 3.080e+02 1.485 0.13785
## `JobRoleSales Representative` -7.675e+02 3.539e+02 -2.168 0.03040 *
## TotalWorkingYears 4.533e+01 7.688e+00 5.896 5.36e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 998.1 on 852 degrees of freedom
## Multiple R-squared: 0.9538, Adjusted R-squared: 0.9529
## F-statistic: 1035 on 17 and 852 DF, p-value: < 2.2e-16
# Use the regression model to predict monthly income for the NoSalary set.

# Drop single-valued columns. The length guard avoids `df[, -integer(0)]`,
# which would select zero columns when no column qualifies.
to.be.deleted <- which(
  vapply(NoSalary, function(col) length(unique(col)) == 1, logical(1))
)
if (length(to.be.deleted) > 0) {
  NoSalary <- NoSalary[, -to.be.deleted, drop = FALSE]
}

# Mirror the type handling applied to the training data.
NoSalary <- mutate_if(NoSalary, is.character, as.factor)
NoSalary <- mutate_if(NoSalary, is.double, as.numeric)
NoSalary$JobLevel <- as.factor(NoSalary$JobLevel)
NoSalary$StockOptionLevel <- as.factor(NoSalary$StockOptionLevel)
NoSalary$TrainingTimesLastYear <- as.factor(NoSalary$TrainingTimesLastYear)

# Predicted monthly income, one value per NoSalary row
MonthlyIncome <- predict(reg.model, NoSalary)
MonthlyIncome
## 1 2 3 4 5 6 7 8
## 5537.881 2707.907 12817.795 2328.054 2754.226 4916.403 5272.877 2191.234
## 9 10 11 12 13 14 15 16
## 2928.824 13317.230 9688.979 2583.020 5599.392 5340.885 5786.174 5486.635
## 17 18 19 20 21 22 23 24
## 5822.705 5543.723 4484.079 2703.294 4605.533 8984.025 8875.897 5485.202
## 25 26 27 28 29 30 31 32
## 10001.142 9093.672 8885.804 15983.484 5660.432 2723.825 2579.018 5530.996
## 33 34 35 36 37 38 39 40
## 6019.529 2660.622 16148.205 5315.105 9334.795 5763.148 2542.821 2581.705
## 41 42 43 44 45 46 47 48
## 19130.669 2750.799 2625.961 12829.073 5649.465 4225.467 2738.743 5440.777
## 49 50 51 52 53 54 55 56
## 2424.218 2974.414 2432.323 2458.787 4431.849 4181.562 17017.034 2855.716
## 57 58 59 60 61 62 63 64
## 5670.939 12814.809 2094.424 2582.746 5686.775 12503.024 8907.897 2316.144
## 65 66 67 68 69 70 71 72
## 2264.071 2291.291 9300.133 9280.971 4931.568 2424.298 2992.644 9093.581
## 73 74 75 76 77 78 79 80
## 9859.226 5549.724 2315.074 5477.001 2277.147 2754.664 2610.219 5442.606
## 81 82 83 84 85 86 87 88
## 2553.725 5521.576 5566.695 2537.502 4041.725 7703.864 10333.906 2493.308
## 89 90 91 92 93 94 95 96
## 9498.353 19688.877 2768.448 4724.741 5699.073 4600.736 2693.422 19311.435
## 97 98 99 100 101 102 103 104
## 2687.408 8816.051 16819.098 2667.474 5585.904 5738.769 2399.628 2590.082
## 105 106 107 108 109 110 111 112
## 5614.286 2714.628 6086.489 2711.396 9284.081 19137.316 2681.657 9207.006
## 113 114 115 116 117 118 119 120
## 2687.867 2594.680 5656.967 2626.494 2486.417 5520.485 9369.562 2581.035
## 121 122 123 124 125 126 127 128
## 4856.387 2701.390 4785.675 5377.617 5051.005 4923.578 2598.320 16668.455
## 129 130 131 132 133 134 135 136
## 9024.777 2291.917 4913.737 4172.702 4704.393 6248.709 3016.922 5630.384
## 137 138 139 140 141 142 143 144
## 2448.905 5743.089 19288.460 4708.101 9106.348 4802.405 5784.473 4274.981
## 145 146 147 148 149 150 151 152
## 5405.964 2712.933 2677.682 8891.428 9099.865 2072.814 5596.625 5210.959
## 153 154 155 156 157 158 159 160
## 4539.083 9402.125 16209.372 18518.766 4444.247 5471.234 5391.164 8576.045
## 161 162 163 164 165 166 167 168
## 5356.890 2802.638 12304.046 2493.218 4153.138 2646.155 5513.110 8040.938
## 169 170 171 172 173 174 175 176
## 12474.197 5775.698 2345.043 2842.838 8867.409 6063.293 2435.979 2239.625
## 177 178 179 180 181 182 183 184
## 5635.665 12415.694 2746.600 5534.549 10038.201 2490.359 5778.980 2804.709
## 185 186 187 188 189 190 191 192
## 2581.595 2560.490 5360.459 15897.050 5087.709 2882.824 4727.411 5487.282
## 193 194 195 196 197 198 199 200
## 5910.675 2356.743 5424.484 2506.252 3471.021 19452.645 5229.040 2381.351
## 201 202 203 204 205 206 207 208
## 2596.404 5828.186 2348.926 5661.239 5602.038 16229.052 19283.198 4380.907
## 209 210 211 212 213 214 215 216
## 4604.917 4864.711 2946.193 2584.238 2146.059 5864.301 8721.930 9162.407
## 217 218 219 220 221 222 223 224
## 5522.597 2670.211 2743.716 5382.810 5417.277 5462.756 12251.872 5410.141
## 225 226 227 228 229 230 231 232
## 5195.335 5590.131 3297.324 2515.828 2706.508 5666.625 18999.996 5567.292
## 233 234 235 236 237 238 239 240
## 5548.428 10004.799 2743.036 2642.986 2947.403 4646.415 5452.093 5461.384
## 241 242 243 244 245 246 247 248
## 5518.228 12204.191 12815.483 5831.028 9715.496 2752.434 6188.346 4230.550
## 249 250 251 252 253 254 255 256
## 9685.290 2531.710 9277.236 16337.940 4550.293 16099.162 2663.815 2129.953
## 257 258 259 260 261 262 263 264
## 2496.207 4500.900 15758.800 2370.814 12313.600 16259.804 5703.007 2205.409
## 265 266 267 268 269 270 271 272
## 2783.880 2455.741 3081.197 5542.402 9086.411 16011.846 5847.775 5636.950
## 273 274 275 276 277 278 279 280
## 9830.256 5387.184 4577.906 5490.122 4239.076 2410.974 2649.620 5686.229
## 281 282 283 284 285 286 287 288
## 2486.771 5409.324 4962.378 2450.370 5943.452 12701.155 2411.147 4800.249
## 289 290 291 292 293 294 295 296
## 2551.295 5490.079 2798.026 8848.423 2590.334 9048.613 2698.700 2101.859
## 297 298 299 300
## 9259.382 5517.849 2744.078 2745.956
# Attach the predictions and export ID + predicted MonthlyIncome.
NoSalary$MonthlyIncome <- MonthlyIncome
b <- dplyr::select(NoSalary, ID, MonthlyIncome)
write.csv(
  b,
  file = "/Users/ysojd/Desktop/GRAD SCHOOL COURSEWORK/MSDS_6306_Doing-Data-Science-Master/MSDS_6306_Doing-Data-Science-Master/Unit 14 and 15 Case Study 2/Case2PredictionsYvanSojdehei Salary.csv",
  row.names = FALSE
)
#Additional Analysis
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(dplyr)
# Reload the raw data for the additional analysis, discarding the earlier
# column removals and factor conversions.
origData <- read.csv(
  "/Users/ysojd/Desktop/GRAD SCHOOL COURSEWORK/MSDS_6306_Doing-Data-Science-Master/MSDS_6306_Doing-Data-Science-Master/Unit 14 and 15 Case Study 2/CaseStudy2-data.csv",
  header = TRUE
)
set.seed(1)

# Monthly income divided by the education score, kept as a one-column data
# frame named "Incomeperdegree" and appended to origData.
#normalize <- function(x) {
# return ((x - min(x)) / (max(x) - min(x))) }
#IncomePerDegree <- as.data.frame(lapply(IncomePerDegree, normalize))
IncomePerDegree <- data.frame(
  Incomeperdegree = origData$MonthlyIncome / origData$Education
)
origData <- cbind(origData, IncomePerDegree)
# Effect of Gender, Education and Income Per Level of Education on Attrition(Y/N)
# The x aesthetic uses the bare column name: `origData$Incomeperdegree` inside
# aes() bypasses the plot's data, which ggplot2 warns about.
pl <- ggplot(origData, aes(x = Incomeperdegree, y = Education, color = EducationField)) +
  geom_point(size = 3, shape = 2) +
  ggtitle("Effect of Gender, Education and Income Per Level of Education on Attrition(Y/N)") +
  xlab("Monthly Income By Degree")
pl + facet_grid(Attrition ~ Gender)
## Warning: Use of `origData$Incomeperdegree` is discouraged. Use `Incomeperdegree`
## instead.
# Education level against income per degree, coloured by job role, with a
# per-role linear trend line.
education_income <- dplyr::select(
  origData,
  Incomeperdegree, Education, Attrition, JobRole
)
ggplot(education_income, aes(x = Education, y = Incomeperdegree, color = JobRole)) +
  geom_point(position = "jitter", shape = 20, size = 3) +
  geom_smooth(method = "lm") +
  labs(
    title = "Education Level vs Income",
    x = "Education Level",
    y = "Income Per Degree"
  )
## `geom_smooth()` using formula 'y ~ x'
# Gender against attrition, filled by department.
# Both variables are converted to factors first.
# NOTE(review): geom_smooth(method = "lm") is applied to two categorical
# axes here - confirm the trend layer is meaningful.
origData[c("Attrition", "Gender")] <- lapply(
  origData[c("Attrition", "Gender")],
  as.factor
)
ggplot(origData, aes(x = Gender, y = Attrition, fill = Department)) +
  geom_point(position = "jitter", shape = 23, size = 2) +
  geom_smooth(method = "lm") +
  labs(title = "Gender vs. Attrition", x = "Gender", y = "Attrition")
## `geom_smooth()` using formula 'y ~ x'