Recent College Graduates Salary

Wait WAT?

library(reshape2)
library(useful)

## Warning: package 'useful' was built under R version 3.4.4

## Loading required package: ggplot2

library(ggplot2)
library(scales)

# Point the session at the project folder; the read.csv() calls in this
# report pass bare file names that are resolved against it.
# NOTE(review): this absolute path is machine-specific.
setwd('E:\\GMU\\Spring 2018\\STAT 463\\Final')

# Load the school-level salary table and preview its first ten rows.
collegeSalaries <- read.csv(file = 'salaries-by-region.csv')
head(collegeSalaries, n = 10)

##                                          sn State     Region
## 1                       Stanford University    CA California
## 2       California Institute of Technology     CA California
## 3                       Harvey Mudd College    CA California
## 4        University of California, Berkeley    CA California
## 5                        Occidental College    CA California
## 6                  Cal Poly San Luis Obispo    CA California
## 7 University of California at Los Angeles     CA California
## 8      University of California, San Diego     CA California
## 9                            Pomona College    CA California
## 10       University of Southern California     CA California
##    Starting.Mean.Salary Mid.Career.Median.Salary
## 1                 70400                   129000
## 2                 75500                   123000
## 3                 71800                   122000
## 4                 59900                   112000
## 5                 51900                   105000
## 6                 57200                   101000
## 7                 52600                   101000
## 8                 51100                   101000
## 9                 48600                   101000
## 10                54800                    99600

# Swap the CSV headers for short working names: the last two columns
# (Starting.Mean.Salary, Mid.Career.Median.Salary) become sms and mcms.
names(collegeSalaries) <- c("sn", "state", "region", "sms", "mcms")
# Report the class of every column.
sapply(X = collegeSalaries, FUN = class)

## sn state region sms mcms
## "factor" "factor" "factor" "integer" "integer"

# Rank schools from highest to lowest starting salary (sms, the 4th column),
# then slice the ranking into the first fifty rows and rows 51 through 100.
cs_ordered <- collegeSalaries[order(collegeSalaries[["sms"]], decreasing = TRUE), ]
cs_top50 <- head(cs_ordered, n = 50)
cs_2nd50 <- cs_ordered[seq(from = 51, to = 100), ]

# Horizontal bar chart of the fifty highest starting salaries. Schools are
# ordered along the axis by sms and each bar carries its value as a label.
ggplot(data = cs_top50, mapping = aes(x = reorder(sn, sms), y = sms)) +
  geom_col(width = 0.5, fill = "cyan") +
  geom_text(
    mapping = aes(label = sms),
    position = position_dodge(width = 0.5)
  ) +
  labs(
    x = "School Name",
    y = "Starting Average Salary",
    title = 'Top 50 Starting Average Salary'
  ) +
  coord_flip()

Please Join Me

# Same horizontal bar chart for the schools ranked 51 to 100 by starting
# salary (cs_2nd50), again ordered by sms and labelled with the value.
ggplot(data = cs_2nd50, mapping = aes(x = reorder(sn, sms), y = sms)) +
  geom_col(width = 0.5, fill = "cyan") +
  geom_text(
    mapping = aes(label = sms),
    position = position_dodge(width = 0.5)
  ) +
  labs(
    x = "School Name",
    y = "Starting Average Salary",
    title = 'Second 50 Starting Average Salary'
  ) +
  coord_flip()

library(ggforce)

## Warning: package 'ggforce' was built under R version 3.4.4

# Keep only the rows whose region is "California" and chart their starting
# salaries as horizontal bars, ordered by sms, with small value labels.
calsal <- subset(cs_ordered, subset = region == "California")
ggplot(data = calsal, mapping = aes(x = reorder(sn, sms), y = sms)) +
  geom_col(fill = "blue") +
  geom_text(
    mapping = aes(label = sms),
    hjust = -0.1,
    size = 2,
    position = position_dodge(width = 1)
  ) +
  labs(
    x = "School Name",
    y = "Starting Average Salary",
    title = 'Starting Average Salary, California'
  ) +
  coord_flip()

# Keep only the rows whose region is "Western" and chart their starting
# salaries as horizontal bars, ordered by sms, with small value labels.
westsal <- subset(cs_ordered, subset = region == "Western")
ggplot(data = westsal, mapping = aes(x = reorder(sn, sms), y = sms)) +
  geom_col(fill = "blue") +
  geom_text(
    mapping = aes(label = sms),
    hjust = -0.1,
    size = 2,
    position = position_dodge(width = 1)
  ) +
  labs(
    x = "School Name",
    y = "Starting Average Salary",
    title = 'Starting Average Salary, Western'
  ) +
  coord_flip()

# NOTE(review): the Northeastern regional chart below is commented out and
# therefore never drawn; the reason is not recorded here - confirm whether it
# should be restored or deleted.
#nesal<-subset(cs_ordered,region=="Northeastern")
#ggplot(nesal,aes(x=reorder(sn,sms),y=sms))+
#geom_bar(stat="identity",width=0.77,fill="blue")+
#geom_text(aes(label=sms),hjust = -0.1, size = 2,
            #position = position_dodge(width = 1),
            #inherit.aes = TRUE)+
#xlab("School Name")+
#ylab("Starting Average Salary")+
#ggtitle('Starting Average Salary, Northeastern')+
#coord_flip()

# Keep only the rows whose region is "Southern" and chart their starting
# salaries as horizontal bars (bar width 0.77), ordered by sms, with labels.
sousal <- subset(cs_ordered, subset = region == "Southern")
ggplot(data = sousal, mapping = aes(x = reorder(sn, sms), y = sms)) +
  geom_col(width = 0.77, fill = "blue") +
  geom_text(
    mapping = aes(label = sms),
    hjust = -0.1,
    size = 2,
    position = position_dodge(width = 1)
  ) +
  labs(
    x = "School Name",
    y = "Starting Average Salary",
    title = 'Starting Average Salary, Southern'
  ) +
  coord_flip()

# Load the state-level summary table and preview its first six rows.
statesal <- read.csv(file = 'state_sal.csv')
head(statesal, n = 6L)

##   State       Region mean increase   percent
## 1    AL     Southern 42980    35320 0.8217776
## 2    AR     Southern 40667    32867 0.8081967
## 3    AZ      Western 47450    37650 0.7934668
## 4    CA      Western 51032    42100 0.8249703
## 5    CO      Western 46414    38157 0.8220991
## 6    CT Northeastern 47833    43400 0.9073171

# Arrow chart of salary growth by state: a point marks each state's `mean`
# value and an arrow runs from it to `mean + increase`, with one panel per
# Region stacked in a single column.
# The y mapping (State ordered by `mean`) is declared once at the plot level
# and repeated for `yend`, so the point and segment layers use the same
# ordered factor instead of mixing an ordered mapping with the raw State
# column and relying on how the discrete scale merges the two.
growth <- ggplot(statesal, aes(x = mean, y = reorder(State, mean))) +
  geom_point() +
  geom_segment(
    aes(xend = mean + increase, yend = reorder(State, mean)),
    arrow = arrow()
  ) +
  facet_wrap(~Region, ncol = 1, scales = "free")

growth

# Load the per-major table, discard the Rank and Major_code columns, and
# preview the first six rows of what is left.
majorsal <- read.csv(file = 'recent-grads.csv')
majorsal <- majorsal[, !(names(majorsal) %in% c("Rank", "Major_code")), drop = FALSE]
head(majorsal, n = 6L)

##                                       Major Major_category Total
## 1                     PETROLEUM ENGINEERING    Engineering 2339
## 2            MINING AND MINERAL ENGINEERING    Engineering   756
## 3                 METALLURGICAL ENGINEERING    Engineering   856
## 4 NAVAL ARCHITECTURE AND MARINE ENGINEERING    Engineering 1258
## 5                      CHEMICAL ENGINEERING    Engineering 32260
## 6                       NUCLEAR ENGINEERING    Engineering 2573
##   Sample_size   Men Women ShareWomen Employed Full_time Part_time
## 1          36 2057   282 0.1205643     1976      1849       270
## 2           7   679    77 0.1018519      640       556       170
## 3           3   725   131 0.1530374      648       558       133
## 4          16 1123   135 0.1073132      758      1069       150
## 5         289 21239 11021 0.3416305    25694     23170      5180
## 6          17 2200   373 0.1449670     1857      2038       264
##   Full_time_year_round Unemployed Unemployment_rate   Mean College_jobs
## 1                 1207         37        0.01838053 110000         1534
## 2                  388         85        0.11724138 75000          350
## 3                  340         16        0.02409639 73000          456
## 4                  692         40        0.05012531 70000          529
## 5                16697       1672        0.06109771 65000        18314
## 6                 1449        400        0.17722641 65000         1142
##   Non_college_jobs Low_wage_jobs
## 1              364           193
## 2              257            50
## 3              176             0
## 4              102             0
## 5             4440           972
## 6              657           244

# Re-level Major so that its factor levels follow ascending Mean; the bar
# charts further down map Major to an axis and inherit this ordering.
majorsal$Major <- with(majorsal, factor(Major, levels = Major[order(Mean)]))
# List the column names of the majors table.
colnames(majorsal)

## [1] "Major"                "Major_category"       "Total"
## [4] "Sample_size"          "Men"                  "Women"
## [7] "ShareWomen"           "Employed"             "Full_time"
## [10] "Part_time"            "Full_time_year_round" "Unemployed"
## [13] "Unemployment_rate"    "Mean"                 "College_jobs"
## [16] "Non_college_jobs"     "Low_wage_jobs"

# Majors ranked from highest to lowest Mean salary.
majors_ordered <- majorsal[order(majorsal[["Mean"]], decreasing = TRUE), ]

# Take the fifty best-paid majors from the ranked table rather than from the
# first fifty rows of majorsal, so the selection does not depend on the CSV
# happening to be pre-sorted by salary.
top50majors <- head(majors_ordered, 50)

# Horizontal bar chart of those majors; bar order comes from the levels of
# Major and every bar is labelled with its Mean value.
# NOTE(review): the axis label says "Median" while the plotted column is
# named Mean - confirm which statistic the CSV actually holds.
ggplot(top50majors, aes(x = Major, y = Mean)) +
  geom_bar(stat = "identity", width = 0.5, fill = "cyan") +
  geom_text(aes(label = Mean), position = position_dodge(width = 0.5)) +
  coord_flip() +
  xlab("Major") +
  ylab("Starting Median Salary") +
  ggtitle('Top 50 Best Paid Major')

# Take the fifteen lowest-paid majors from the salary-ranked table
# (majors_ordered, sorted by descending Mean) instead of the last fifteen rows
# of majorsal, so the selection does not depend on the CSV row order.
tail15majors <- tail(majors_ordered, 15)

# Horizontal bar chart of those majors, labelled with their Mean value.
# NOTE(review): the axis label says "Median" while the plotted column is
# named Mean - confirm which statistic the CSV actually holds.
ggplot(tail15majors, aes(x = Major, y = Mean)) +
  geom_bar(stat = "identity", width = 0.5, fill = "cyan") +
  geom_text(aes(label = Mean), position = position_dodge(width = 0.5)) +
  coord_flip() +
  xlab("Major") +
  ylab("Starting Median Salary") +
  ggtitle('15 Worst Paid Major')

# Work on a copy of the majors table so that the derived column added next
# does not modify majorsal; preview the first six rows.
# NOTE(review): the name `data` masks base R's data() function.
data <- majorsal
head(data, n = 6L)

##                                       Major Major_category Total
## 1                     PETROLEUM ENGINEERING    Engineering 2339
## 2            MINING AND MINERAL ENGINEERING    Engineering   756
## 3                 METALLURGICAL ENGINEERING    Engineering   856
## 4 NAVAL ARCHITECTURE AND MARINE ENGINEERING    Engineering 1258
## 5                      CHEMICAL ENGINEERING    Engineering 32260
## 6                       NUCLEAR ENGINEERING    Engineering 2573
##   Sample_size   Men Women ShareWomen Employed Full_time Part_time
## 1          36 2057   282 0.1205643     1976      1849       270
## 2           7   679    77 0.1018519      640      556       170
## 3           3   725   131 0.1530374      648       558       133
## 4          16 1123   135 0.1073132      758      1069       150
## 5         289 21239 11021 0.3416305    25694     23170      5180
## 6          17 2200   373 0.1449670     1857      2038       264
##   Full_time_year_round Unemployed Unemployment_rate   Mean College_jobs
## 1                 1207         37        0.01838053 110000         1534
## 2                  388         85        0.11724138 75000          350
## 3                  340         16        0.02409639 73000          456
## 4                  692         40        0.05012531 70000          529
## 5                16697       1672        0.06109771 65000        18314
## 6                 1449        400        0.17722641 65000         1142
##   Non_college_jobs Low_wage_jobs
## 1              364           193
## 2              257            50
## 3              176             0
## 4              102             0
## 5             4440           972
## 6              657           244

# perctcoljob = College_jobs / Employed for every major.
data[["perctcoljob"]] <- data[["College_jobs"]] / data[["Employed"]]

# Scatter Mean against that ratio, coloured by Major_category, with one
# panel per category and independent axes in each panel.
ggplot(data = data, mapping = aes(x = perctcoljob, y = Mean)) +
  geom_point(mapping = aes(color = Major_category)) +
  facet_wrap(facets = ~Major_category, scales = "free")

## Warning: Removed 1 rows containing missing values (geom_point).

# Second working copy of the majors table with two derived rates, both
# expressed relative to Total: College_jobs / Total and Low_wage_jobs / Total.
data1 <- majorsal
data1[["coljob_rate"]] <- with(data1, College_jobs / Total)
data1[["lowwage_rate"]] <- with(data1, Low_wage_jobs / Total)

library(GGally)

## Warning: package 'GGally' was built under R version 3.4.4

# Scatterplot matrix of the size, gender-share, unemployment and derived-rate
# columns. Columns are picked by name (previously positions 3, 7, 13, 18, 19)
# so the selection survives any change in column layout.
GGally::ggpairs(
  data1[, c("Total", "ShareWomen", "Unemployment_rate",
            "coljob_rate", "lowwage_rate")]
)

## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removing 1 row that contained a missing value

## Warning: Removed 1 rows containing missing values (geom_point).

## Warning: Removed 1 rows containing missing values (geom_point).

## Warning: Removed 1 rows containing non-finite values (stat_density).

## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removing 1 row that contained a missing value

## Warning: Removed 1 rows containing missing values (geom_point).

## Warning: Removed 1 rows containing missing values (geom_point).

# Same five columns as the previous scatterplot matrix, selected by name
# rather than by position, with Total replaced by its natural log before
# plotting.
datalog <- data1[, c("Total", "ShareWomen", "Unemployment_rate",
                     "coljob_rate", "lowwage_rate")]
datalog$Total <- log(datalog$Total)
GGally::ggpairs(datalog)

## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removing 1 row that contained a missing value

## Warning: Removed 1 rows containing missing values (geom_point).

## Warning: Removed 1 rows containing missing values (geom_point).

## Warning: Removed 1 rows containing non-finite values (stat_density).

## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removing 1 row that contained a missing value

## Warning: Removed 1 rows containing missing values (geom_point).

## Warning: Removed 1 rows containing missing values (geom_point).

# Scatter of lowwage_rate against ShareWomen with a fitted linear trend line.
ggplot(data = data1, mapping = aes(x = ShareWomen, y = lowwage_rate)) +
  geom_point() +
  geom_smooth(method = 'lm') +
  xlab("Percentage of Female Graduates") +
  ylab('Percentage of Low Wage Job')

# Simple linear regression of lowwage_rate on ShareWomen, then its summary.
wlwjLM <- lm(formula = lowwage_rate ~ ShareWomen, data = data1)
summary(wlwjLM)

##
## Call:
## lm(formula = lowwage_rate ~ ShareWomen, data = data1)
##
## Residuals:
##       Min        1Q    Median        3Q       Max
## -0.105273 -0.029879 -0.006586 0.026705 0.219897
##
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.056644   0.009146   6.193 4.25e-09 ***
## ShareWomen 0.063615   0.016021   3.971 0.000105 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.04844 on 171 degrees of freedom
## Multiple R-squared: 0.08442,    Adjusted R-squared: 0.07907
## F-statistic: 15.77 on 1 and 171 DF, p-value: 0.0001053

# Extend the low-wage regression with Major_category as an extra predictor.
wlwjLM_1 <- lm(
  formula = lowwage_rate ~ ShareWomen + Major_category,
  data = data1
)
library(coefplot)

## Warning: package 'coefplot' was built under R version 3.4.4

# Plot the fitted coefficients of the extended model.
coefplot(wlwjLM_1)

The engineering major asks, “How does it work?”
The science major asks, “Why does it work?”
The business major asks, “How much does it cost?”
The liberal arts major asks, “Would you like fries with that?”

# lowwage_rate against ShareWomen, coloured by Major_category, drawn as one
# panel per category with independent axes in each panel.
ggplot(data = data1, mapping = aes(x = ShareWomen, y = lowwage_rate)) +
  geom_point(mapping = aes(color = Major_category)) +
  facet_wrap(facets = ~Major_category, scales = "free")

# Drop Sample_size from the working table.
data1$Sample_size <- NULL

# Columns for the salary scatterplot matrix, selected by name. The earlier
# positional form c(3, 6, 12, 13, 17, 18) was only correct after the column
# deletion above had shifted every later index down by one.
data3 <- data1[, c("Total", "ShareWomen", "Unemployment_rate", "Mean",
                   "coljob_rate", "lowwage_rate")]
GGally::ggpairs(data3)

## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removing 1 row that contained a missing value

## Warning: Removed 1 rows containing missing values (geom_point).

## Warning: Removed 1 rows containing missing values (geom_point).

## Warning: Removed 1 rows containing non-finite values (stat_density).

## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removing 1 row that contained a missing value

## Warning: Removed 1 rows containing missing values (geom_point).

## Warning: Removed 1 rows containing missing values (geom_point).

## Warning: Removed 1 rows containing missing values (geom_point).

# Linear regression of Mean on all five candidate predictors, then its summary.
salariesLM <- lm(
  formula = Mean ~ Total + ShareWomen + Unemployment_rate +
    coljob_rate + lowwage_rate,
  data = data1
)
summary(salariesLM)

##
## Call:
## lm(formula = Mean ~ Total + ShareWomen + Unemployment_rate +
##     coljob_rate + lowwage_rate, data = data1)
##
## Residuals:
##    Min     1Q Median     3Q    Max
## -19431 -4997   -125   4336 53352
##
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)
## (Intercept)        5.397e+04 3.339e+03 16.166 < 2e-16 ***
## Total             -5.585e-03 1.019e-02 -0.548 0.58438
## ShareWomen        -2.817e+04 2.941e+03 -9.581 < 2e-16 ***
## Unemployment_rate -5.449e+03 2.288e+04 -0.238 0.81208
## coljob_rate        1.445e+04 4.704e+03   3.071 0.00249 **
## lowwage_rate      -3.984e+04 1.544e+04 -2.580 0.01073 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8345 on 166 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared: 0.4892, Adjusted R-squared: 0.4738
## F-statistic: 31.79 on 5 and 166 DF, p-value: < 2.2e-16

# Refit without Total and Unemployment_rate; this overwrites salariesLM with
# the three-predictor model, which is then summarised.
salariesLM <- lm(
  formula = Mean ~ ShareWomen + coljob_rate + lowwage_rate,
  data = data1
)
summary(salariesLM)

##
## Call:
## lm(formula = Mean ~ ShareWomen + coljob_rate + lowwage_rate,
##     data = data1)
##
## Residuals:
##    Min     1Q Median     3Q    Max
## -19066 -4809    -79   4511 53595
##
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)
## (Intercept)     53139       2560 20.754 < 2e-16 ***
## ShareWomen     -28220       2903 -9.720 < 2e-16 ***
## coljob_rate     15192       4331   3.508 0.000579 ***
## lowwage_rate   -39926      14774 -2.702 0.007586 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8280 on 169 degrees of freedom
## Multiple R-squared: 0.488, Adjusted R-squared: 0.4789
## F-statistic: 53.69 on 3 and 169 DF, p-value: < 2.2e-16

# Coefficient plot for the model currently stored in salariesLM.
coefplot(salariesLM)

# Add Major_category to that specification and plot the new coefficients.
salariesLM_1 <- lm(
  formula = Mean ~ ShareWomen + coljob_rate + lowwage_rate + Major_category,
  data = data1
)
coefplot(salariesLM_1)

# Copy data3 and replace Total and Mean with their natural logs, then redraw
# the scatterplot matrix on the transformed copy.
data3log <- data3
data3log[c("Total", "Mean")] <- lapply(data3log[c("Total", "Mean")], log)
GGally::ggpairs(data3log)

## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removing 1 row that contained a missing value

## Warning: Removed 1 rows containing missing values (geom_point).

## Warning: Removed 1 rows containing missing values (geom_point).

## Warning: Removed 1 rows containing non-finite values (stat_density).

## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removing 1 row that contained a missing value

# Regress Mean on ShareWomen and Total using the log-transformed copy
# (data3log holds log(Mean) and log(Total)), then plot the coefficients.
salariesLM_3 <- lm(formula = Mean ~ ShareWomen + Total, data = data3log)
coefplot(salariesLM_3)

GLM Model

# Build a 0/1 indicator `ab`: 1 where Mean is at least 48127, 0 otherwise.
# NOTE(review): 48127 is an unexplained cutoff - presumably the average of
# Mean; confirm and derive it from the data if so.
dataglm <- data1
dataglm$ab <- as.numeric(dataglm$Mean >= 48127)

# NOTE(review): no `family` is passed, so glm() fits its default gaussian
# family (the summary reports a gaussian dispersion parameter). A 0/1
# response would normally use family = binomial - confirm the intent.
salary1 <- glm(
  formula = ab ~ ShareWomen + Total + coljob_rate + lowwage_rate,
  data = dataglm
)
summary(salary1)

##
## Call:
## glm(formula = ab ~ ShareWomen + Total + coljob_rate + lowwage_rate,
##     data = dataglm)
##
## Deviance Residuals:
##      Min        1Q    Median        3Q       Max
## -0.64667 -0.20234 -0.04371   0.17965   0.82983
##
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)
## (Intercept)   4.530e-01 9.885e-02   4.583 8.90e-06 ***
## ShareWomen   -8.919e-01 1.104e-01 -8.076 1.23e-13 ***
## Total        -4.964e-07 3.834e-07 -1.295    0.197
## coljob_rate   7.248e-01 1.656e-01   4.377 2.11e-05 ***
## lowwage_rate -2.909e-01 5.618e-01 -0.518    0.605
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.09901853)
##
##     Null deviance: 27.318 on 172 degrees of freedom
## Residual deviance: 16.635 on 168 degrees of freedom
## AIC: 97.826
##
## Number of Fisher Scoring iterations: 2

# Same indicator model without Total (default gaussian family, as no
# `family` argument is given), followed by its summary.
salary2 <- glm(
  formula = ab ~ ShareWomen + coljob_rate + lowwage_rate,
  data = dataglm
)
summary(salary2)

##
## Call:
## glm(formula = ab ~ ShareWomen + coljob_rate + lowwage_rate, data = dataglm)
##
## Deviance Residuals:
##      Min        1Q    Median        3Q       Max
## -0.63647 -0.21758 -0.04467   0.17888   0.85362
##
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)
## (Intercept)    0.4305     0.0975   4.416 1.79e-05 ***
## ShareWomen    -0.8980     0.1105 -8.123 9.14e-14 ***
## coljob_rate    0.7487     0.1649   4.540 1.07e-05 ***
## lowwage_rate -0.3147     0.5626 -0.559    0.577
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.09941484)
##
##     Null deviance: 27.318 on 172 degrees of freedom
## Residual deviance: 16.801 on 169 degrees of freedom
## AIC: 97.543
##
## Number of Fisher Scoring iterations: 2

# Indicator model reduced to ShareWomen and coljob_rate (default gaussian
# family, as no `family` argument is given), followed by its summary.
salary3 <- glm(formula = ab ~ ShareWomen + coljob_rate, data = dataglm)
summary(salary3)

##
## Call:
## glm(formula = ab ~ ShareWomen + coljob_rate, data = dataglm)
##
## Deviance Residuals:
##     Min       1Q   Median       3Q      Max
## -0.6392 -0.2104 -0.0448   0.1895   0.8147
##
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.39782    0.07788   5.108 8.65e-07 ***
## ShareWomen -0.91853   0.10407 -8.826 1.28e-15 ***
## coljob_rate 0.79168    0.14560   5.437 1.85e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.09901309)
##
##     Null deviance: 27.318 on 172 degrees of freedom
## Residual deviance: 16.832 on 170 degrees of freedom
## AIC: 95.863
##
## Number of Fisher Scoring iterations: 2

# Add Major_category to the two-predictor indicator model and plot the
# fitted coefficients.
salary4 <- glm(
  formula = ab ~ ShareWomen + coljob_rate + Major_category,
  data = dataglm
)
coefplot(salary4)

# Load the women-in-STEM table and list its column names.
womenstem <- read.csv(file = 'womenstem.csv')
names(womenstem)

## [1] "Major_code" "Major" "Major_category" "Total"
## [5] "Men" "Women" "ShareWomen" "Mean"

# Scatter of ShareWomen against Mean for the women-in-STEM table, coloured by
# Major_category. The y-axis label typo ("Femal") is corrected to "Female".
ggplot(womenstem, aes(x = Mean, y = ShareWomen)) +
  geom_point(aes(color = Major_category)) +
  xlab("Mean Starting Salary") +
  ylab("Percentage of Female Students")