# Data Wrangling and Initial Analysis

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(e1071)
library(plotly)
## Loading required package: ggplot2
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(tidyverse)
## 
## v tibble  3.0.3     v purrr   0.3.4
## v tidyr   1.1.2     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0
## 
## x plotly::filter() masks dplyr::filter(), stats::filter()
## x dplyr::lag()     masks stats::lag()
library(class)
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
# Load the labelled case-study data and the two competition sets. All three
# files sit in the same directory, so that directory is named once here.
dataDir <- "/Users/ysojd/Desktop/GRAD SCHOOL COURSEWORK/MSDS_6306_Doing-Data-Science-Master/MSDS_6306_Doing-Data-Science-Master/Unit 14 and 15 Case Study 2"

origData <- read.csv(file.path(dataDir, "CaseStudy2-data.csv"), header = TRUE)

NoSalary <- readxl::read_xlsx(file.path(dataDir, "CaseStudy2CompSet No Salary.xlsx"))

NoAttrition <- read.csv(file.path(dataDir, "CaseStudy2CompSet No Attrition.csv"), header = TRUE)

# Identify columns holding a single distinct value; they carry no information.
# vapply() pins the result type to logical even for a zero-column input.
to.be.deleted <- which(vapply(origData, function(col) length(unique(col)) == 1, logical(1)))

# Guard the removal: indexing with a negated *empty* vector selects zero
# columns, which would silently wipe the whole data frame when no column is
# constant. drop = FALSE keeps a data frame even if one column remains.
if (length(to.be.deleted) > 0) {
  origData <- origData[, -to.be.deleted, drop = FALSE]
}

# Treat every text column as categorical.
origData <- mutate_if(origData, is.character, as.factor)

## Analysis Plots

# Overall attrition count
origData %>%
  ggplot(aes(x = Attrition)) +
  geom_bar()

# Employee attrition count by department (horizontal bars)
ggplot(origData, aes(x = Department, fill = Attrition)) +
  geom_bar() +
  coord_flip()

# Male vs female attrition count (horizontal bars)
ggplot(origData, aes(x = Gender, fill = Attrition)) +
  geom_bar() +
  coord_flip()

# Share of attrition within each environment-satisfaction score; bars are
# stacked to 100% and the y axis is labelled in percent.
ggplot(origData, aes(x = EnvironmentSatisfaction, fill = Attrition)) +
  geom_bar(position = "fill") +
  scale_y_continuous(labels = scales::percent) +
  labs(
    title = "Attrition Due to Environmental Satisfaction",
    x = "Satisfaction",
    y = "Attrition Rate"
  )

# Monthly income against percent salary hike, one panel per
# Attrition (rows) x JobLevel (columns) combination.
pl <- ggplot(origData, aes(x = MonthlyIncome, y = PercentSalaryHike)) +
  geom_point(shape = 2) +
  labs(title = "Effect of Job Level(1-5), PercentSalaryHike and MonthlyIncome on Attrition(Y/N)")

pl + facet_grid(Attrition ~ JobLevel)

##Create Classification Model #NB

library(lattice)
library(caret)
library(mlbench)
## Warning: package 'mlbench' was built under R version 4.0.3
library(rsample)
## Warning: package 'rsample' was built under R version 4.0.3
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.0.3
## corrplot 0.84 loaded
library(h2o)
## Warning: package 'h2o' was built under R version 4.0.3
## 
## ----------------------------------------------------------------------
## 
## Your next step is to start H2O:
##     > h2o.init()
## 
## For H2O package documentation, ask for help:
##     > ??h2o
## 
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit https://docs.h2o.ai
## 
## ----------------------------------------------------------------------
## 
## Attaching package: 'h2o'
## The following objects are masked from 'package:stats':
## 
##     cor, sd, var
## The following objects are masked from 'package:base':
## 
##     %*%, %in%, &&, ||, apply, as.factor, as.numeric, colnames,
##     colnames<-, ifelse, is.character, is.factor, is.numeric, log,
##     log10, log1p, log2, round, signif, trunc
library(ggplot2)
library(dplyr)
library(tidyr)


# Rank the predictors of Attrition with a Learning Vector Quantization model
# resampled by 10-fold cross-validation repeated 3 times.
#
# The resampling folds are drawn at random, so the RNG is seeded first; every
# other random step in this script is seeded the same way, and without it the
# ranking differs from run to run.
set.seed(13)
control <- trainControl(method = "repeatedcv", number = 10, repeats = 3)

# Train the model on all remaining columns, scaling the predictors.
# NOTE(review): the formula admits every column, including identifier-like
# ones such as ID (it appears in the printed ranking) -- confirm this is
# intended.
model <- train(
  Attrition ~ .,
  data = origData,
  method = "lvq",
  preProcess = "scale",
  trControl = control
)

# Determine variable priority (unscaled importance scores).
importance <- varImp(model, scale = FALSE)

# Summarize importance.
print(importance)
## ROC curve variable importance
## 
##   only 20 most important variables shown (out of 32)
## 
##                         Importance
## OverTime                    0.6679
## MonthlyIncome               0.6567
## TotalWorkingYears           0.6563
## YearsAtCompany              0.6470
## StockOptionLevel            0.6455
## MaritalStatus               0.6438
## JobLevel                    0.6406
## YearsInCurrentRole          0.6403
## YearsWithCurrManager        0.6291
## Age                         0.6265
## JobInvolvement              0.6159
## JobSatisfaction             0.5833
## JobRole                     0.5829
## Department                  0.5605
## DistanceFromHome            0.5586
## EnvironmentSatisfaction     0.5532
## WorkLifeBalance             0.5491
## TrainingTimesLastYear       0.5428
## Education                   0.5384
## ID                          0.5371
# Treat the small-integer ordinal columns as categorical from here on.
ordinalCols <- c("JobLevel", "StockOptionLevel", "TrainingTimesLastYear")
origData[ordinalCols] <- lapply(origData[ordinalCols], factor)

# 70/30 train/test partition, stratified on Attrition so both parts keep the
# same class balance. Seeded for reproducibility.
set.seed(13)
split <- initial_split(origData, prop = 0.7, strata = "Attrition")
train <- training(split)
test <- testing(split)

# Class proportions of Attrition in the training part
prop.table(table(train$Attrition))
## 
##        No       Yes 
## 0.8390805 0.1609195
prop.table(table(test$Attrition))
## 
##        No       Yes 
## 0.8390805 0.1609195
# Correlation plot of the numeric predictors, restricted to employees who left.
leaversNumeric <- train %>%
  filter(Attrition == "Yes") %>%
  select_if(is.numeric)
corrplot::corrplot(cor(leaversNumeric))

# Density of a handful of numeric predictors, one free-scaled panel each.
densityInput <- train %>%
  dplyr::select(
    MonthlyIncome, Education, PerformanceRating,
    TotalWorkingYears, HourlyRate, JobInvolvement
  ) %>%
  gather(metric, value)

ggplot(densityInput, aes(value, fill = metric)) +
  geom_density(show.legend = FALSE) +
  facet_wrap(~ metric, scales = "free")

# Predictor matrix (every column except the response) and response vector
# for the x/y interface of caret::train().
features <- names(train)[names(train) != "Attrition"]
x <- train[features]
y <- train[["Attrition"]]

# Resampling scheme for the naive Bayes fits: 10 folds. The method is
# "repeatedcv" without an explicit repeat count; the recorded output reports
# it as repeated once.
train_control <- trainControl(method = "repeatedcv", number = 10)

# Baseline naive Bayes classifier with caret's default tuning grid.
# Warnings raised during fitting are suppressed.
nb.m1 <- suppressWarnings(
  train(x = x, y = y, method = "nb", trControl = train_control)
)

# Cross-validated confusion matrix for the baseline model
confusionMatrix(nb.m1)
## Cross-Validated (10 fold, repeated 1 times) Confusion Matrix 
## 
## (entries are percentual average cell counts across resamples)
##  
##           Reference
## Prediction   No  Yes
##        No  81.3 11.8
##        Yes  2.6  4.3
##                             
##  Accuracy (average) : 0.8555
# Tuning grid for the naive Bayes classifier:
#   usekernel - kernel density estimate (TRUE) vs Gaussian (FALSE)
#   fL        - Laplace smoothing correction
#   adjust    - multiplier applied to the kernel bandwidth
# adjust starts at 1, not 0: a multiplier of 0 yields a zero bandwidth, which
# stats::density() rejects ("'bw' is not positive"), so those grid cells could
# only fail to fit (the failures were hidden by suppressWarnings()).
search_grid <- expand.grid(
  usekernel = c(TRUE, FALSE),
  fL = 0:5,
  adjust = seq(1, 5, by = 1)
)

# Tuned naive Bayes classifier: every row of search_grid is evaluated under
# train_control, with the predictors Box-Cox transformed, centred, scaled and
# projected onto principal components first. Fitting warnings are suppressed.
nbPreprocessing <- c("BoxCox", "center", "scale", "pca")
nb.m2 <- suppressWarnings(
  train(
    x = x,
    y = y,
    method = "nb",
    preProcess = nbPreprocessing,
    tuneGrid = search_grid,
    trControl = train_control
  )
)

# Top 5 models by resampled accuracy (ties can return more than 5 rows),
# best first.
arrange(
  top_n(nb.m2$results, 5, wt = Accuracy),
  desc(Accuracy)
)
##   usekernel fL adjust  Accuracy     Kappa AccuracySD   KappaSD
## 1     FALSE  2      0 0.8668808 0.4233602 0.04442369 0.1925121
## 2     FALSE  2      1 0.8668808 0.4233602 0.04442369 0.1925121
## 3     FALSE  2      2 0.8668808 0.4233602 0.04442369 0.1925121
## 4     FALSE  2      3 0.8668808 0.4233602 0.04442369 0.1925121
## 5     FALSE  2      4 0.8668808 0.4233602 0.04442369 0.1925121
## 6     FALSE  2      5 0.8668808 0.4233602 0.04442369 0.1925121
##   usekernel fL adjust  Accuracy     Kappa AccuracySD   KappaSD
## 1      TRUE  1      3 0.8737864 0.4435322 0.02858175 0.1262286
## 2      TRUE  0      2 0.8689320 0.4386202 0.02903618 0.1155707
## 3      TRUE  2      3 0.8689320 0.4750282 0.02830559 0.0970368
## 4      TRUE  2      4 0.8689320 0.4008608 0.02432572 0.1234943
## 5      TRUE  4      5 0.8689320 0.4439767 0.02867321 0.1354681

# Plot the resampled performance of nb.m2 across the search grid.
plot(nb.m2)

# Cross-validated confusion matrix for nb.m2 (cell percentages averaged
# across the resamples).
confusionMatrix(nb.m2)
## Cross-Validated (10 fold, repeated 1 times) Confusion Matrix 
## 
## (entries are percentual average cell counts across resamples)
##  
##           Reference
## Prediction   No  Yes
##        No  80.1  9.5
##        Yes  3.8  6.6
##                            
##  Accuracy (average) : 0.867
# Score the held-out test part with the tuned model and compare against truth.
# NOTE(review): the recorded output uses "No" as the positive class, so the
# reported sensitivity describes employees who stay -- confirm that is the
# intended reading.
pred <- suppressWarnings(
  predict(nb.m2, newdata = test)
)
confusionMatrix(data = pred, reference = as.factor(test$Attrition))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  No Yes
##        No  201  17
##        Yes  18  25
##                                           
##                Accuracy : 0.8659          
##                  95% CI : (0.8185, 0.9048)
##     No Information Rate : 0.8391          
##     P-Value [Acc > NIR] : 0.1358          
##                                           
##                   Kappa : 0.5082          
##                                           
##  Mcnemar's Test P-Value : 1.0000          
##                                           
##             Sensitivity : 0.9178          
##             Specificity : 0.5952          
##          Pos Pred Value : 0.9220          
##          Neg Pred Value : 0.5814          
##              Prevalence : 0.8391          
##          Detection Rate : 0.7701          
##    Detection Prevalence : 0.8352          
##       Balanced Accuracy : 0.7565          
##                                           
##        'Positive' Class : No              
## 

# Get average accuracy, sensitivity and specificity of an e1071 naive Bayes
# classifier over repeated random 75/25 splits of origData.
#
# NOTE: this loop reassigns the globals `train`, `test` and `model`.

set.seed(13)
iterations <- 100
splitPerc <- .75 # Training / Test split percentage
nObs <- nrow(origData)

# One row per iteration; columns are accuracy, sensitivity, specificity.
masterAcc <- matrix(nrow = iterations, ncol = 3)

for (j in seq_len(iterations)) {
  trainIndices <- sample(seq_len(nObs), round(splitPerc * nObs))
  train <- origData[trainIndices, ]
  test <- origData[-trainIndices, ]

  model <- naiveBayes(Attrition ~ ., data = train, laplace = 1)
  CM <- confusionMatrix(table(predict(model, test), test$Attrition))

  # Look the metrics up by name rather than by position so the columns cannot
  # silently change meaning if the order of the metric vectors ever differs.
  masterAcc[j, 1] <- CM$overall[["Accuracy"]]
  masterAcc[j, 2] <- CM$byClass[["Sensitivity"]]
  masterAcc[j, 3] <- CM$byClass[["Specificity"]]
}

MeanAcc <- colMeans(masterAcc)
MeanAcc
## [1] 0.8286697 0.8719328 0.6083380

## Use the trained predictive model to predict attrition for the NoAttrition dataset.

# Inspect the column names and types of the unlabelled set before cleaning it.
str(NoAttrition)
## 'data.frame':    300 obs. of  35 variables:
##  $ ID                      : int  1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 ...
##  $ Age                     : int  35 33 26 55 29 51 52 39 31 31 ...
##  $ BusinessTravel          : chr  "Travel_Rarely" "Travel_Rarely" "Travel_Rarely" "Travel_Rarely" ...
##  $ DailyRate               : int  750 147 1330 1311 1246 1456 585 1387 1062 534 ...
##  $ Department              : chr  "Research & Development" "Human Resources" "Research & Development" "Research & Development" ...
##  $ DistanceFromHome        : int  28 2 21 2 19 1 29 10 24 20 ...
##  $ Education               : int  3 3 3 3 3 4 4 5 3 3 ...
##  $ EducationField          : chr  "Life Sciences" "Human Resources" "Medical" "Life Sciences" ...
##  $ EmployeeCount           : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ EmployeeNumber          : int  1596 1207 1107 505 1497 145 2019 1618 1252 587 ...
##  $ EnvironmentSatisfaction : int  2 2 1 3 3 1 1 2 3 1 ...
##  $ Gender                  : chr  "Male" "Male" "Male" "Female" ...
##  $ HourlyRate              : int  46 99 37 97 77 30 40 76 96 66 ...
##  $ JobInvolvement          : int  4 3 3 3 2 2 3 3 2 3 ...
##  $ JobLevel                : int  2 1 1 4 2 3 1 2 2 3 ...
##  $ JobRole                 : chr  "Laboratory Technician" "Human Resources" "Laboratory Technician" "Manager" ...
##  $ JobSatisfaction         : int  3 3 3 4 3 1 4 1 1 3 ...
##  $ MaritalStatus           : chr  "Married" "Married" "Divorced" "Single" ...
##  $ MonthlyIncome           : int  3407 3600 2377 16659 8620 7484 3482 5377 6812 9824 ...
##  $ MonthlyRate             : int  25348 8429 19373 23258 23757 25796 19788 3835 17198 22908 ...
##  $ NumCompaniesWorked      : int  1 1 1 2 1 3 2 2 1 3 ...
##  $ Over18                  : chr  "Y" "Y" "Y" "Y" ...
##  $ OverTime                : chr  "No" "No" "No" "Yes" ...
##  $ PercentSalaryHike       : int  17 13 20 13 14 20 15 13 19 12 ...
##  $ PerformanceRating       : int  3 3 4 3 3 4 3 3 3 3 ...
##  $ RelationshipSatisfaction: int  4 4 3 3 3 3 2 4 2 1 ...
##  $ StandardHours           : int  80 80 80 80 80 80 80 80 80 80 ...
##  $ StockOptionLevel        : int  2 1 1 0 2 0 2 3 0 0 ...
##  $ TotalWorkingYears       : int  10 5 1 30 10 23 16 10 10 12 ...
##  $ TrainingTimesLastYear   : int  3 2 0 2 3 1 3 3 2 2 ...
##  $ WorkLifeBalance         : int  2 3 2 3 3 2 2 3 3 3 ...
##  $ YearsAtCompany          : int  10 5 1 5 10 13 9 7 10 1 ...
##  $ YearsInCurrentRole      : int  9 4 1 4 7 12 8 7 9 0 ...
##  $ YearsSinceLastPromotion : int  6 1 0 1 0 12 0 7 1 0 ...
##  $ YearsWithCurrManager    : int  8 4 0 2 4 8 0 7 8 0 ...
# Drop columns of NoAttrition that hold a single distinct value.
to.be.deleted2 <- which(vapply(NoAttrition, function(col) length(unique(col)) == 1, logical(1)))

# Guard the removal: indexing with a negated *empty* vector selects zero
# columns, which would silently wipe the data frame when no column is constant.
if (length(to.be.deleted2) > 0) {
  NoAttrition <- NoAttrition[, -to.be.deleted2, drop = FALSE]
}

# Treat every text column as categorical, then re-inspect the structure.
NoAttrition <- mutate_if(NoAttrition, is.character, as.factor)
str(NoAttrition)
## 'data.frame':    300 obs. of  32 variables:
##  $ ID                      : int  1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 ...
##  $ Age                     : int  35 33 26 55 29 51 52 39 31 31 ...
##  $ BusinessTravel          : Factor w/ 3 levels "Non-Travel","Travel_Frequently",..: 3 3 3 3 3 2 1 3 3 2 ...
##  $ DailyRate               : int  750 147 1330 1311 1246 1456 585 1387 1062 534 ...
##  $ Department              : Factor w/ 3 levels "Human Resources",..: 2 1 2 2 3 2 3 2 2 2 ...
##  $ DistanceFromHome        : int  28 2 21 2 19 1 29 10 24 20 ...
##  $ Education               : int  3 3 3 3 3 4 4 5 3 3 ...
##  $ EducationField          : Factor w/ 6 levels "Human Resources",..: 2 1 4 2 2 4 2 4 4 2 ...
##  $ EmployeeNumber          : int  1596 1207 1107 505 1497 145 2019 1618 1252 587 ...
##  $ EnvironmentSatisfaction : int  2 2 1 3 3 1 1 2 3 1 ...
##  $ Gender                  : Factor w/ 2 levels "Female","Male": 2 2 2 1 2 1 2 2 1 2 ...
##  $ HourlyRate              : int  46 99 37 97 77 30 40 76 96 66 ...
##  $ JobInvolvement          : int  4 3 3 3 2 2 3 3 2 3 ...
##  $ JobLevel                : int  2 1 1 4 2 3 1 2 2 3 ...
##  $ JobRole                 : Factor w/ 9 levels "Healthcare Representative",..: 3 2 3 4 8 1 9 5 1 1 ...
##  $ JobSatisfaction         : int  3 3 3 4 3 1 4 1 1 3 ...
##  $ MaritalStatus           : Factor w/ 3 levels "Divorced","Married",..: 2 2 1 3 1 3 1 2 3 2 ...
##  $ MonthlyIncome           : int  3407 3600 2377 16659 8620 7484 3482 5377 6812 9824 ...
##  $ MonthlyRate             : int  25348 8429 19373 23258 23757 25796 19788 3835 17198 22908 ...
##  $ NumCompaniesWorked      : int  1 1 1 2 1 3 2 2 1 3 ...
##  $ OverTime                : Factor w/ 2 levels "No","Yes": 1 1 1 2 1 1 1 1 1 1 ...
##  $ PercentSalaryHike       : int  17 13 20 13 14 20 15 13 19 12 ...
##  $ PerformanceRating       : int  3 3 4 3 3 4 3 3 3 3 ...
##  $ RelationshipSatisfaction: int  4 4 3 3 3 3 2 4 2 1 ...
##  $ StockOptionLevel        : int  2 1 1 0 2 0 2 3 0 0 ...
##  $ TotalWorkingYears       : int  10 5 1 30 10 23 16 10 10 12 ...
##  $ TrainingTimesLastYear   : int  3 2 0 2 3 1 3 3 2 2 ...
##  $ WorkLifeBalance         : int  2 3 2 3 3 2 2 3 3 3 ...
##  $ YearsAtCompany          : int  10 5 1 5 10 13 9 7 10 1 ...
##  $ YearsInCurrentRole      : int  9 4 1 4 7 12 8 7 9 0 ...
##  $ YearsSinceLastPromotion : int  6 1 0 1 0 12 0 7 1 0 ...
##  $ YearsWithCurrManager    : int  8 4 0 2 4 8 0 7 8 0 ...
# Predict attrition for the unlabelled set and export ID + prediction.
#
# nb.m2 was fitted on data in which JobLevel, StockOptionLevel and
# TrainingTimesLastYear had been converted to factors, while NoAttrition still
# holds them as integers. Give each of those columns the same type and level
# set as in origData so the model sees new data encoded the way it was
# trained. Columns that are not factors in origData are left untouched.
for (colName in c("JobLevel", "StockOptionLevel", "TrainingTimesLastYear")) {
  trainedCol <- origData[[colName]]
  if (base::is.factor(trainedCol) && colName %in% names(NoAttrition)) {
    NoAttrition[[colName]] <- factor(NoAttrition[[colName]], levels = levels(trainedCol))
  }
}

predNoAttrition <- suppressWarnings(predict(nb.m2, NoAttrition))
NoAttrition$Attrition <- predNoAttrition

a <- NoAttrition %>% dplyr::select(ID, Attrition)

write.csv(a, "/Users/ysojd/Desktop/GRAD SCHOOL COURSEWORK/MSDS_6306_Doing-Data-Science-Master/MSDS_6306_Doing-Data-Science-Master/Unit 14 and 15 Case Study 2/Case2PredictionsYvanSojdehei Attrition.csv", row.names = FALSE)

## Analysis and creation of Monthly Income Regression Model

# Histogram of monthly income with a fitted normal curve overlaid.
# The column is addressed by name: a positional index (previously 19) points
# at a different variable as soon as a column is added, dropped or reordered.
monthlyIncome <- origData$MonthlyIncome

histogram.curve <- hist(monthlyIncome, breaks = 10, col = "purple", xlab = "Monthly Income", main = "Histogram with Normal Curve")

# Adding normal curve to the histogram. stats::sd is named explicitly because
# the h2o package attached earlier masks sd().
xfit <- seq(min(monthlyIncome), max(monthlyIncome), length.out = 40)
yfit <- dnorm(xfit, mean = mean(monthlyIncome), sd = stats::sd(monthlyIncome))

# Rescale the density to counts: density * bin width * number of observations.
yfit <- yfit * diff(histogram.curve$mids[1:2]) * length(monthlyIncome)
lines(xfit, yfit, col = "black", lwd = 2)

#Create Regression Model

library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:plotly':
## 
##     select
## The following object is masked from 'package:dplyr':
## 
##     select
# Stepwise-AIC linear regression for MonthlyIncome, assessed with 10-fold
# cross-validation. Seeded so the fold assignment is reproducible.
# NOTE(review): the formula admits every other column as a candidate; the
# printed final model retains ID -- confirm an identifier is wanted here.
set.seed(24)
train.control <- trainControl(method = "cv", number = 10)
reg.model <- train(
  MonthlyIncome ~ .,
  data = origData,
  method = "lmStepAIC",
  trControl = train.control,
  trace = FALSE
)

# Resampled error metrics
reg.model$results
##   parameter     RMSE  Rsquared      MAE  RMSESD RsquaredSD    MAESD
## 1      none 1018.389 0.9495696 783.2915 133.111 0.01821594 91.06654
# Show the linear model retained by the stepwise AIC search.
reg.model$finalModel
## 
## Call:
## lm(formula = .outcome ~ ID + BusinessTravelTravel_Frequently + 
##     BusinessTravelTravel_Rarely + DailyRate + DepartmentSales + 
##     JobLevel2 + JobLevel3 + JobLevel4 + JobLevel5 + `JobRoleHuman Resources` + 
##     `JobRoleLaboratory Technician` + JobRoleManager + `JobRoleResearch Director` + 
##     `JobRoleResearch Scientist` + `JobRoleSales Executive` + 
##     `JobRoleSales Representative` + TotalWorkingYears, data = dat)
## 
## Coefficients:
##                     (Intercept)                               ID  
##                       3346.1958                          -0.2286  
## BusinessTravelTravel_Frequently      BusinessTravelTravel_Rarely  
##                        191.8221                         344.2855  
##                       DailyRate                  DepartmentSales  
##                          0.1743                        -533.4219  
##                       JobLevel2                        JobLevel3  
##                       1709.8489                        4941.0322  
##                       JobLevel4                        JobLevel5  
##                       8268.5069                       10901.0339  
##        `JobRoleHuman Resources`   `JobRoleLaboratory Technician`  
##                      -1147.7209                       -1280.4637  
##                  JobRoleManager       `JobRoleResearch Director`  
##                       3535.9047                        3453.8536  
##     `JobRoleResearch Scientist`         `JobRoleSales Executive`  
##                      -1090.2156                         457.3952  
##   `JobRoleSales Representative`                TotalWorkingYears  
##                       -767.4799                          45.3283
# Coefficient table and fit statistics for the retained model.
summary(reg.model$finalModel)
## 
## Call:
## lm(formula = .outcome ~ ID + BusinessTravelTravel_Frequently + 
##     BusinessTravelTravel_Rarely + DailyRate + DepartmentSales + 
##     JobLevel2 + JobLevel3 + JobLevel4 + JobLevel5 + `JobRoleHuman Resources` + 
##     `JobRoleLaboratory Technician` + JobRoleManager + `JobRoleResearch Director` + 
##     `JobRoleResearch Scientist` + `JobRoleSales Executive` + 
##     `JobRoleSales Representative` + TotalWorkingYears, data = dat)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3208.9  -612.0   -67.1   614.8  4188.1 
## 
## Coefficients:
##                                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      3.346e+03  2.090e+02  16.008  < 2e-16 ***
## ID                              -2.286e-01  1.363e-01  -1.678  0.09374 .  
## BusinessTravelTravel_Frequently  1.918e+02  1.314e+02   1.460  0.14468    
## BusinessTravelTravel_Rarely      3.443e+02  1.113e+02   3.092  0.00205 ** 
## DailyRate                        1.743e-01  8.503e-02   2.050  0.04066 *  
## DepartmentSales                 -5.334e+02  2.887e+02  -1.848  0.06496 .  
## JobLevel2                        1.710e+03  1.382e+02  12.375  < 2e-16 ***
## JobLevel3                        4.941e+03  1.857e+02  26.611  < 2e-16 ***
## JobLevel4                        8.269e+03  2.801e+02  29.519  < 2e-16 ***
## JobLevel5                        1.090e+04  3.309e+02  32.947  < 2e-16 ***
## `JobRoleHuman Resources`        -1.148e+03  2.368e+02  -4.847 1.49e-06 ***
## `JobRoleLaboratory Technician`  -1.280e+03  1.532e+02  -8.358 2.58e-16 ***
## JobRoleManager                   3.536e+03  2.514e+02  14.062  < 2e-16 ***
## `JobRoleResearch Director`       3.454e+03  1.926e+02  17.937  < 2e-16 ***
## `JobRoleResearch Scientist`     -1.090e+03  1.569e+02  -6.950 7.26e-12 ***
## `JobRoleSales Executive`         4.574e+02  3.080e+02   1.485  0.13785    
## `JobRoleSales Representative`   -7.675e+02  3.539e+02  -2.168  0.03040 *  
## TotalWorkingYears                4.533e+01  7.688e+00   5.896 5.36e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 998.1 on 852 degrees of freedom
## Multiple R-squared:  0.9538, Adjusted R-squared:  0.9529 
## F-statistic:  1035 on 17 and 852 DF,  p-value: < 2.2e-16

# Utilize the model to predict monthly incomes for the No Salary case-study data.

# Drop columns of NoSalary that hold a single distinct value.
to.be.deleted <- which(vapply(NoSalary, function(col) length(unique(col)) == 1, logical(1)))

# Guard the removal: indexing with a negated *empty* vector selects zero
# columns, which would silently wipe the table when no column is constant.
if (length(to.be.deleted) > 0) {
  NoSalary <- NoSalary[, -to.be.deleted, drop = FALSE]
}

# Give NoSalary the same column types as the training data: text columns
# become factors, doubles are passed through as.numeric, and the three
# ordinal columns become factors.
NoSalary <- mutate_if(NoSalary, is.character, as.factor)
NoSalary <- mutate_if(NoSalary, is.double, as.numeric)
NoSalary$JobLevel <- as.factor(NoSalary$JobLevel)
NoSalary$StockOptionLevel <- as.factor(NoSalary$StockOptionLevel)
NoSalary$TrainingTimesLastYear <- as.factor(NoSalary$TrainingTimesLastYear)

# Predicted monthly income for every row of NoSalary, then printed.
MonthlyIncome <- predict(reg.model, newdata = NoSalary)
MonthlyIncome
##         1         2         3         4         5         6         7         8 
##  5537.881  2707.907 12817.795  2328.054  2754.226  4916.403  5272.877  2191.234 
##         9        10        11        12        13        14        15        16 
##  2928.824 13317.230  9688.979  2583.020  5599.392  5340.885  5786.174  5486.635 
##        17        18        19        20        21        22        23        24 
##  5822.705  5543.723  4484.079  2703.294  4605.533  8984.025  8875.897  5485.202 
##        25        26        27        28        29        30        31        32 
## 10001.142  9093.672  8885.804 15983.484  5660.432  2723.825  2579.018  5530.996 
##        33        34        35        36        37        38        39        40 
##  6019.529  2660.622 16148.205  5315.105  9334.795  5763.148  2542.821  2581.705 
##        41        42        43        44        45        46        47        48 
## 19130.669  2750.799  2625.961 12829.073  5649.465  4225.467  2738.743  5440.777 
##        49        50        51        52        53        54        55        56 
##  2424.218  2974.414  2432.323  2458.787  4431.849  4181.562 17017.034  2855.716 
##        57        58        59        60        61        62        63        64 
##  5670.939 12814.809  2094.424  2582.746  5686.775 12503.024  8907.897  2316.144 
##        65        66        67        68        69        70        71        72 
##  2264.071  2291.291  9300.133  9280.971  4931.568  2424.298  2992.644  9093.581 
##        73        74        75        76        77        78        79        80 
##  9859.226  5549.724  2315.074  5477.001  2277.147  2754.664  2610.219  5442.606 
##        81        82        83        84        85        86        87        88 
##  2553.725  5521.576  5566.695  2537.502  4041.725  7703.864 10333.906  2493.308 
##        89        90        91        92        93        94        95        96 
##  9498.353 19688.877  2768.448  4724.741  5699.073  4600.736  2693.422 19311.435 
##        97        98        99       100       101       102       103       104 
##  2687.408  8816.051 16819.098  2667.474  5585.904  5738.769  2399.628  2590.082 
##       105       106       107       108       109       110       111       112 
##  5614.286  2714.628  6086.489  2711.396  9284.081 19137.316  2681.657  9207.006 
##       113       114       115       116       117       118       119       120 
##  2687.867  2594.680  5656.967  2626.494  2486.417  5520.485  9369.562  2581.035 
##       121       122       123       124       125       126       127       128 
##  4856.387  2701.390  4785.675  5377.617  5051.005  4923.578  2598.320 16668.455 
##       129       130       131       132       133       134       135       136 
##  9024.777  2291.917  4913.737  4172.702  4704.393  6248.709  3016.922  5630.384 
##       137       138       139       140       141       142       143       144 
##  2448.905  5743.089 19288.460  4708.101  9106.348  4802.405  5784.473  4274.981 
##       145       146       147       148       149       150       151       152 
##  5405.964  2712.933  2677.682  8891.428  9099.865  2072.814  5596.625  5210.959 
##       153       154       155       156       157       158       159       160 
##  4539.083  9402.125 16209.372 18518.766  4444.247  5471.234  5391.164  8576.045 
##       161       162       163       164       165       166       167       168 
##  5356.890  2802.638 12304.046  2493.218  4153.138  2646.155  5513.110  8040.938 
##       169       170       171       172       173       174       175       176 
## 12474.197  5775.698  2345.043  2842.838  8867.409  6063.293  2435.979  2239.625 
##       177       178       179       180       181       182       183       184 
##  5635.665 12415.694  2746.600  5534.549 10038.201  2490.359  5778.980  2804.709 
##       185       186       187       188       189       190       191       192 
##  2581.595  2560.490  5360.459 15897.050  5087.709  2882.824  4727.411  5487.282 
##       193       194       195       196       197       198       199       200 
##  5910.675  2356.743  5424.484  2506.252  3471.021 19452.645  5229.040  2381.351 
##       201       202       203       204       205       206       207       208 
##  2596.404  5828.186  2348.926  5661.239  5602.038 16229.052 19283.198  4380.907 
##       209       210       211       212       213       214       215       216 
##  4604.917  4864.711  2946.193  2584.238  2146.059  5864.301  8721.930  9162.407 
##       217       218       219       220       221       222       223       224 
##  5522.597  2670.211  2743.716  5382.810  5417.277  5462.756 12251.872  5410.141 
##       225       226       227       228       229       230       231       232 
##  5195.335  5590.131  3297.324  2515.828  2706.508  5666.625 18999.996  5567.292 
##       233       234       235       236       237       238       239       240 
##  5548.428 10004.799  2743.036  2642.986  2947.403  4646.415  5452.093  5461.384 
##       241       242       243       244       245       246       247       248 
##  5518.228 12204.191 12815.483  5831.028  9715.496  2752.434  6188.346  4230.550 
##       249       250       251       252       253       254       255       256 
##  9685.290  2531.710  9277.236 16337.940  4550.293 16099.162  2663.815  2129.953 
##       257       258       259       260       261       262       263       264 
##  2496.207  4500.900 15758.800  2370.814 12313.600 16259.804  5703.007  2205.409 
##       265       266       267       268       269       270       271       272 
##  2783.880  2455.741  3081.197  5542.402  9086.411 16011.846  5847.775  5636.950 
##       273       274       275       276       277       278       279       280 
##  9830.256  5387.184  4577.906  5490.122  4239.076  2410.974  2649.620  5686.229 
##       281       282       283       284       285       286       287       288 
##  2486.771  5409.324  4962.378  2450.370  5943.452 12701.155  2411.147  4800.249 
##       289       290       291       292       293       294       295       296 
##  2551.295  5490.079  2798.026  8848.423  2590.334  9048.613  2698.700  2101.859 
##       297       298       299       300 
##  9259.382  5517.849  2744.078  2745.956
# Attach the predictions and export ID + predicted income.
NoSalary[["MonthlyIncome"]] <- MonthlyIncome
b <- dplyr::select(NoSalary, ID, MonthlyIncome)

write.csv(
  b,
  "/Users/ysojd/Desktop/GRAD SCHOOL COURSEWORK/MSDS_6306_Doing-Data-Science-Master/MSDS_6306_Doing-Data-Science-Master/Unit 14 and 15 Case Study 2/Case2PredictionsYvanSojdehei Salary.csv",
  row.names = FALSE
)

#Additional Analysis

library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
library(dplyr)
# Reload the raw case-study data (discarding the earlier cleaning and factor
# conversions) for the additional exploratory plots.
origData <- read.csv(
  "/Users/ysojd/Desktop/GRAD SCHOOL COURSEWORK/MSDS_6306_Doing-Data-Science-Master/MSDS_6306_Doing-Data-Science-Master/Unit 14 and 15 Case Study 2/CaseStudy2-data.csv",
  header = TRUE
)

set.seed(1)

# Monthly income divided by education level, kept as a one-column data frame
# and appended to origData as `Incomeperdegree`.
#normalize <- function(x) {
#  return ((x - min(x)) / (max(x) - min(x))) }
#IncomePerDegree <- as.data.frame(lapply(IncomePerDegree, normalize))
IncomePerDegree <- data.frame(
  Incomeperdegree = origData$MonthlyIncome / origData$Education
)
origData <- cbind(origData, IncomePerDegree)

# Effect of Gender, Education and Income Per Level of Education on Attrition(Y/N)
# The x aesthetic refers to the column by bare name. Writing
# `origData$Incomeperdegree` inside aes() bypasses the plot's data, which
# ggplot2 warns against and which can misalign points with their facet panels.
pl <- ggplot(origData, aes(x = Incomeperdegree, y = Education, color = EducationField)) +
  geom_point(size = 3, shape = 2) +
  ggtitle("Effect of Gender, Education and Income Per Level of Education on Attrition(Y/N)") +
  xlab("Monthly Income By Degree")
pl + facet_grid(Attrition ~ Gender)
## Warning: Use of `origData$Incomeperdegree` is discouraged. Use `Incomeperdegree`
## instead.

# Education level against income per degree, coloured by job role, with
# jittered points and a linear fit.
educationIncome <- origData %>%
  dplyr::select(Incomeperdegree, Education, Attrition, JobRole)

ggplot(educationIncome, aes(x = Education, y = Incomeperdegree, color = JobRole)) +
  geom_point(size = 3, shape = 20, position = "jitter") +
  geom_smooth(method = "lm") +
  labs(
    title = "Education Level vs Income",
    x = "Education Level",
    y = "Income Per Degree"
  )
## `geom_smooth()` using formula 'y ~ x'

# Gender against attrition, filled by department: jittered points with a
# linear smoother layered on top. Both variables are made factors first.
origData <- origData %>%
  mutate(
    Attrition = as.factor(Attrition),
    Gender = as.factor(Gender)
  )

ggplot(origData, aes(x = Gender, y = Attrition, fill = Department)) +
  geom_point(size = 2, shape = 23, position = "jitter") +
  geom_smooth(method = "lm") +
  labs(title = "Gender vs. Attrition", x = "Gender", y = "Attrition")
## `geom_smooth()` using formula 'y ~ x'