Recent College Graduates Salary
Wait WAT?
library(reshape2)
library(useful)
## Warning: package 'useful' was built under R version 3.4.4
## Loading required package: ggplot2
library(ggplot2)
library(scales)
# Load the per-school salary table and preview its first ten rows.
# NOTE(review): the working directory is a hard-coded absolute Windows path,
# so the script only runs unchanged on a machine that has this folder.
setwd("E:\\GMU\\Spring 2018\\STAT 463\\Final")
collegeSalaries <- read.csv(file = "salaries-by-region.csv")
head(collegeSalaries, n = 10)
##                                         sn State     Region
##  1                     Stanford University    CA California
##  2      California Institute of Technology    CA California
##  3                     Harvey Mudd College    CA California
##  4      University of California, Berkeley    CA California
##  5                      Occidental College    CA California
##  6                Cal Poly San Luis Obispo    CA California
##  7 University of California at Los Angeles    CA California
##  8     University of California, San Diego    CA California
##  9                          Pomona College    CA California
## 10       University of Southern California    CA California
##    Starting.Mean.Salary Mid.Career.Median.Salary
##  1                70400                   129000
##  2                75500                   123000
##  3                71800                   122000
##  4                59900                   112000
##  5                51900                   105000
##  6                57200                   101000
##  7                52600                   101000
##  8                51100                   101000
##  9                48600                   101000
## 10                54800                    99600
# Swap the verbose CSV headers for short working names
# (sn = school name, sms = starting salary, mcms = mid-career salary),
# then report the class of every column.
names(collegeSalaries) <- c("sn", "state", "region", "sms", "mcms")
sapply(X = collegeSalaries, FUN = class)
##        sn     state    region       sms      mcms
##  "factor"  "factor"  "factor" "integer" "integer"
# Rank schools by starting salary (highest first) and keep the first and
# second groups of fifty rows for the two bar charts.
cs_ordered <- collegeSalaries[order(collegeSalaries$sms, decreasing = TRUE), ]
cs_top50 <- head(cs_ordered, n = 50)
cs_2nd50 <- cs_ordered[51:100, ]

# Horizontal bar chart of the first fifty schools; each bar is labelled
# with its salary value.
ggplot(data = cs_top50, mapping = aes(x = reorder(sn, sms), y = sms)) +
  geom_bar(stat = "identity", width = 0.5, fill = "cyan") +
  geom_text(
    mapping = aes(label = sms),
    position = position_dodge(width = 0.5)
  ) +
  labs(
    x = "School Name",
    y = "Starting Average\nSalary",
    title = "Top 50 Starting Average Salary"
  ) +
  coord_flip()
Please Join Me
# Horizontal bar chart for the schools ranked 51-100 by starting salary;
# each bar is labelled with its salary value.
ggplot(data = cs_2nd50, mapping = aes(x = reorder(sn, sms), y = sms)) +
  geom_bar(stat = "identity", width = 0.5, fill = "cyan") +
  geom_text(
    mapping = aes(label = sms),
    position = position_dodge(width = 0.5)
  ) +
  labs(
    x = "School Name",
    y = "Starting Average\nSalary",
    title = "Second 50 Starting Average\nSalary"
  ) +
  coord_flip()
library(ggforce)
## Warning: package 'ggforce' was built under R version 3.4.4
# Schools whose region is "California", drawn as a horizontal bar chart of
# starting salary with a small value label just past the end of each bar.
calsal <- subset(cs_ordered, subset = region == "California")
ggplot(data = calsal, mapping = aes(x = reorder(sn, sms), y = sms)) +
  geom_bar(stat = "identity", fill = "blue") +
  geom_text(
    mapping = aes(label = sms),
    hjust = -0.1,
    size = 2,
    position = position_dodge(width = 1),
    inherit.aes = TRUE
  ) +
  labs(
    x = "School Name",
    y = "Starting Average\nSalary",
    title = "Starting Average Salary,\nCalifornia"
  ) +
  coord_flip()
# Schools whose region is "Western", drawn as a horizontal bar chart of
# starting salary with a small value label just past the end of each bar.
westsal <- subset(cs_ordered, subset = region == "Western")
ggplot(data = westsal, mapping = aes(x = reorder(sn, sms), y = sms)) +
  geom_bar(stat = "identity", fill = "blue") +
  geom_text(
    mapping = aes(label = sms),
    hjust = -0.1,
    size = 2,
    position = position_dodge(width = 1),
    inherit.aes = TRUE
  ) +
  labs(
    x = "School Name",
    y = "Starting Average\nSalary",
    title = "Starting Average Salary,\nWestern"
  ) +
  coord_flip()
#nesal<-subset(cs_ordered,region=="Northeastern")
#ggplot(nesal,aes(x=reorder(sn,sms),y=sms))+
#geom_bar(stat="identity",width=0.77,fill="blue")+
#geom_text(aes(label=sms),hjust = -0.1, size = 2,
#position = position_dodge(width = 1),
#inherit.aes = TRUE)+
#xlab("School Name")+
#ylab("Starting Average
#Salary")+
#ggtitle('Starting Average
#Salary, Northeastern')+
#coord_flip()
# Schools whose region is "Southern", drawn as a horizontal bar chart of
# starting salary with a small value label just past the end of each bar.
sousal <- subset(cs_ordered, subset = region == "Southern")
ggplot(data = sousal, mapping = aes(x = reorder(sn, sms), y = sms)) +
  geom_bar(stat = "identity", width = 0.77, fill = "blue") +
  geom_text(
    mapping = aes(label = sms),
    hjust = -0.1,
    size = 2,
    position = position_dodge(width = 1),
    inherit.aes = TRUE
  ) +
  labs(
    x = "School Name",
    y = "Starting Average\nSalary",
    title = "Starting Average Salary,\nSouthern"
  ) +
  coord_flip()
# Load the state-level salary summary and preview its first rows.
statesal <- read.csv(file = "state_sal.csv")
head(statesal)
##   State       Region  mean increase   percent
## 1    AL     Southern 42980    35320 0.8217776
## 2    AR     Southern 40667    32867 0.8081967
## 3    AZ      Western 47450    37650 0.7934668
## 4    CA      Western 51032    42100 0.8249703
## 5    CO      Western 46414    38157 0.8220991
## 6    CT Northeastern 47833    43400 0.9073171
# For every state: a point at `mean` plus an arrow from `mean` to
# `mean + increase`. One facet per Region, stacked in a single column,
# each with its own axis scales.
growth <- ggplot(data = statesal) +
  geom_point(mapping = aes(x = mean, y = reorder(State, mean))) +
  geom_segment(
    mapping = aes(x = mean, y = State, xend = mean + increase, yend = State),
    arrow = arrow()
  ) +
  facet_wrap(~Region, ncol = 1, scales = "free")
growth
# Load the per-major table, drop the two identifier columns that the
# analysis does not use, and preview the result.
majorsal <- read.csv(file = "recent-grads.csv")
majorsal[["Rank"]] <- NULL
majorsal[["Major_code"]] <- NULL
head(majorsal)
##
Major Major_category Total
##
1 PETROLEUM ENGINEERING Engineering 2339
## 2 MINING
AND MINERAL ENGINEERING Engineering 756
## 3
METALLURGICAL ENGINEERING Engineering 856
## 4 NAVAL ARCHITECTURE
AND MARINE ENGINEERING Engineering 1258
##
5 CHEMICAL ENGINEERING Engineering 32260
##
6 NUCLEAR ENGINEERING Engineering 2573
## Sample_size Men
Women ShareWomen Employed Full_time Part_time
## 1 36
2057 282 0.1205643 1976 1849 270
## 2 7
679 77 0.1018519 640 556 170
## 3 3
725 131 0.1530374 648 558 133
## 4 16
1123 135 0.1073132 758 1069 150
## 5 289 21239
11021 0.3416305 25694 23170 5180
## 6 17
2200 373 0.1449670 1857 2038 264
##
Full_time_year_round Unemployed Unemployment_rate Mean College_jobs
## 1
1207 37 0.01838053 110000 1534
## 2
388 85 0.11724138 75000 350
## 3
340 16 0.02409639 73000 456
## 4
692 40 0.05012531 70000 529
## 5
16697 1672 0.06109771 65000 18314
## 6 1449
400 0.17722641 65000 1142
## Non_college_jobs
Low_wage_jobs
## 1
364 193
## 2
257 50
## 3
176 0
## 4
102 0
## 5 4440
972
## 6
657 244
# Re-level Major so its factor levels run from lowest to highest Mean;
# plots that put Major on an axis then show majors in salary order.
majorsal$Major <- with(majorsal, factor(Major, levels = Major[order(Mean)]))
colnames(majorsal)
##
[1] "Major" "Major_category"
"Total"
## [4] "Sample_size"
"Men" "Women"
## [7]
"ShareWomen" "Employed"
"Full_time"
## [10]
"Part_time" "Full_time_year_round"
"Unemployed"
## [13]
"Unemployment_rate" "Mean"
"College_jobs"
## [16]
"Non_college_jobs" "Low_wage_jobs"
# Sort majors by Mean salary, highest first.
majors_ordered <- majorsal[order(majorsal$Mean, decreasing = TRUE), ]

# Take the fifty best-paid majors from the sorted frame. The previous code
# took head(majorsal, 50), which is only correct while the CSV happens to be
# stored in descending salary order, and left majors_ordered unused.
top50majors <- head(majors_ordered, n = 50)

# Horizontal bar chart of those fifty majors. Bar order follows the factor
# levels of Major; each bar is labelled with its salary value.
ggplot(data = top50majors, mapping = aes(x = Major, y = Mean)) +
  geom_bar(stat = "identity", width = 0.5, fill = "cyan") +
  geom_text(
    mapping = aes(label = Mean),
    position = position_dodge(width = 0.5)
  ) +
  coord_flip() +
  xlab("Major") +
  ylab("Starting Median\nSalary") + # label typo "Meidan" corrected
  ggtitle("Top 50 Best Paid Major")
# Take the fifteen lowest-paid majors from the salary-sorted frame
# (majors_ordered is sorted by Mean, highest first). The previous code used
# tail(majorsal, 15), which is only correct while the CSV happens to be
# stored in descending salary order.
tail15majors <- tail(majors_ordered, n = 15)

# Horizontal bar chart of those fifteen majors; each bar is labelled with
# its salary value.
ggplot(data = tail15majors, mapping = aes(x = Major, y = Mean)) +
  geom_bar(stat = "identity", width = 0.5, fill = "cyan") +
  geom_text(
    mapping = aes(label = Mean),
    position = position_dodge(width = 0.5)
  ) +
  coord_flip() +
  xlab("Major") +
  ylab("Starting Median\nSalary") + # label typo "Meidan" corrected
  ggtitle("15 Worst Paid Major")
# Working copy of the per-major table for the college-job-share plot.
data <- majorsal
head(data, n = 6)
##
Major Major_category Total
##
1 PETROLEUM ENGINEERING Engineering 2339
## 2 MINING
AND MINERAL ENGINEERING Engineering 756
## 3
METALLURGICAL ENGINEERING Engineering 856
## 4 NAVAL ARCHITECTURE
AND MARINE ENGINEERING Engineering 1258
##
5 CHEMICAL ENGINEERING Engineering 32260
##
6 NUCLEAR ENGINEERING Engineering 2573
## Sample_size Men
Women ShareWomen Employed Full_time Part_time
## 1 36
2057 282 0.1205643 1976 1849 270
## 2 7
679 77 0.1018519 640 556 170
## 3 3
725 131 0.1530374 648 558 133
## 4 16
1123 135 0.1073132 758 1069 150
## 5 289 21239
11021 0.3416305 25694 23170 5180
## 6 17
2200 373 0.1449670 1857 2038 264
##
Full_time_year_round Unemployed Unemployment_rate Mean College_jobs
## 1
1207 37 0.01838053 110000 1534
## 2
388 85 0.11724138 75000 350
## 3
340 16 0.02409639 73000 456
## 4
692 40 0.05012531 70000 529
## 5
16697 1672 0.06109771 65000 18314
## 6 1449
400 0.17722641 65000 1142
## Non_college_jobs
Low_wage_jobs
## 1
364 193
## 2
257 50
## 3
176 0
## 4
102 0
## 5 4440
972
## 6
657 244
# Share of employed graduates holding a college-level job, plotted against
# Mean salary with one free-scaled panel per major category.
data$perctcoljob <- with(data, College_jobs / Employed)
ggplot(data = data, mapping = aes(x = perctcoljob, y = Mean)) +
  geom_point(mapping = aes(colour = Major_category)) +
  facet_wrap(~Major_category, scales = "free")
## Warning: Removed 1 rows containing missing values (geom_point).
# Second working copy with two derived rates, both relative to the total
# number of graduates in the major.
data1 <- majorsal
data1$coljob_rate <- with(data1, College_jobs / Total)
data1$lowwage_rate <- with(data1, Low_wage_jobs / Total)
library(GGally)
## Warning: package 'GGally' was built under R version 3.4.4
# Pairwise plot matrix of the five numeric columns of interest, selected
# by name (these are columns 3, 7, 13, 18 and 19 of data1).
GGally::ggpairs(
  data1[, c(
    "Total", "ShareWomen", "Unemployment_rate",
    "coljob_rate", "lowwage_rate"
  )]
)
##
Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson",
: Removing 1 row that contained a missing value
##
Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson",
: Removing 1 row that contained a missing value
##
Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1
rows containing missing values (geom_point).
## Warning: Removed 1 rows containing non-finite values (stat_density).
##
Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson",
: Removing 1 row that contained a missing value
##
Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson",
: Removing 1 row that contained a missing value
##
Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1
rows containing missing values (geom_point).
# Same five columns as the previous pairs plot, with Total replaced by its
# natural log before plotting.
datalog <- data1[, c(
  "Total", "ShareWomen", "Unemployment_rate",
  "coljob_rate", "lowwage_rate"
)]
datalog[["Total"]] <- log(datalog[["Total"]])
GGally::ggpairs(datalog)
##
Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson",
: Removing 1 row that contained a missing value
##
Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson",
: Removing 1 row that contained a missing value
##
Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1
rows containing missing values (geom_point).
## Warning: Removed 1 rows containing non-finite values (stat_density).
##
Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson",
: Removing 1 row that contained a missing value
##
Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson",
: Removing 1 row that contained a missing value
##
Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1
rows containing missing values (geom_point).
# Scatter of low-wage-job rate against share of women, with a fitted
# straight line, followed by the matching simple linear regression.
ggplot(data = data1, mapping = aes(x = ShareWomen, y = lowwage_rate)) +
  geom_point() +
  geom_smooth(method = "lm") +
  xlab("Percentage of Female\nGraduates") +
  ylab("Percentage of Low Wage Job")
wlwjLM <- lm(formula = lowwage_rate ~ ShareWomen, data = data1)
summary(wlwjLM)
##
## Call:
## lm(formula =
lowwage_rate ~ ShareWomen, data = data1)
##
## Residuals:
## Min
1Q Median 3Q Max
## -0.105273 -0.029879
-0.006586 0.026705 0.219897
##
## Coefficients:
## Estimate
Std. Error t value Pr(>|t|)
## (Intercept)
0.056644 0.009146 6.193 4.25e-09 ***
## ShareWomen 0.063615
0.016021 3.971 0.000105 ***
## ---
## Signif. codes: 0
'***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard
error: 0.04844 on 171 degrees of freedom
## Multiple R-squared:
0.08442, Adjusted R-squared: 0.07907
## F-statistic: 15.77
on 1 and 171 DF, p-value: 0.0001053
# Extend the low-wage regression with major category as an extra predictor
# and draw its coefficient plot.
wlwjLM_1 <- lm(
  formula = lowwage_rate ~ ShareWomen + Major_category,
  data = data1
)
library(coefplot)
## Warning: package 'coefplot' was built under R version 3.4.4
coefplot(wlwjLM_1)
The engineering major asks, “How does it work?”
The science major asks, “Why does it work?”
The business major asks, “How much does it cost?”
The liberal arts major asks, “Would you like fries with that?”
# Low-wage rate against share of women, one free-scaled panel per category.
ggplot(data = data1, mapping = aes(x = ShareWomen, y = lowwage_rate)) +
  geom_point(mapping = aes(colour = Major_category)) +
  facet_wrap(~Major_category, scales = "free")

# Drop Sample_size, then pick the six modelling columns by name. These are
# positions 3, 6, 12, 13, 17 and 18 once Sample_size has been removed.
data1[["Sample_size"]] <- NULL
data3 <- data1[, c(
  "Total", "ShareWomen", "Unemployment_rate",
  "Mean", "coljob_rate", "lowwage_rate"
)]
GGally::ggpairs(data3)
##
Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson",
: Removing 1 row that contained a missing value
##
Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson",
: Removing 1 row that contained a missing value
##
Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1
rows containing missing values (geom_point).
## Warning: Removed 1 rows containing non-finite values (stat_density).
##
Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson",
: Removing 1 row that contained a missing value
##
Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson",
: Removing 1 row that contained a missing value
##
Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson",
: Removing 1 row that contained a missing value
##
Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1
rows containing missing values (geom_point).
## Warning: Removed 1
rows containing missing values (geom_point).
# Linear model of Mean salary on all five candidate predictors.
salariesLM <- lm(
  formula = Mean ~ Total + ShareWomen + Unemployment_rate +
    coljob_rate + lowwage_rate,
  data = data1
)
summary(salariesLM)
##
## Call:
## lm(formula = Mean ~
Total + ShareWomen + Unemployment_rate +
## coljob_rate +
lowwage_rate, data = data1)
##
## Residuals:
## Min 1Q
Median 3Q Max
## -19431 -4997
-125 4336 53352
##
## Coefficients:
##
Estimate Std. Error t value Pr(>|t|)
## (Intercept)
5.397e+04 3.339e+03 16.166 < 2e-16 ***
## Total
-5.585e-03 1.019e-02 -0.548 0.58438
## ShareWomen
-2.817e+04 2.941e+03 -9.581 < 2e-16 ***
## Unemployment_rate
-5.449e+03 2.288e+04 -0.238 0.81208
## coljob_rate
1.445e+04 4.704e+03 3.071 0.00249 **
## lowwage_rate
-3.984e+04 1.544e+04 -2.580 0.01073 *
## ---
## Signif. codes: 0
'***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard
error: 8345 on 166 degrees of freedom
## (1 observation
deleted due to missingness)
## Multiple R-squared:
0.4892, Adjusted R-squared: 0.4738
## F-statistic: 31.79
on 5 and 166 DF, p-value: < 2.2e-16
# Refit without Total and Unemployment_rate. This overwrites the previous
# salariesLM object.
salariesLM <- lm(
  formula = Mean ~ ShareWomen + coljob_rate + lowwage_rate,
  data = data1
)
summary(salariesLM)
##
## Call:
## lm(formula = Mean ~
ShareWomen + coljob_rate + lowwage_rate,
## data = data1)
##
## Residuals:
## Min 1Q
Median 3Q Max
## -19066 -4809
-79 4511 53595
##
## Coefficients:
##
Estimate Std. Error t value Pr(>|t|)
## (Intercept)
53139 2560 20.754 < 2e-16 ***
## ShareWomen
-28220 2903 -9.720 < 2e-16 ***
## coljob_rate
15192 4331 3.508 0.000579 ***
## lowwage_rate
-39926 14774 -2.702 0.007586 **
## ---
## Signif. codes: 0
'***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard
error: 8280 on 169 degrees of freedom
## Multiple R-squared:
0.488, Adjusted R-squared: 0.4789
## F-statistic: 53.69
on 3 and 169 DF, p-value: < 2.2e-16
# Coefficient plot of the reduced salary model, then of the same model with
# major category added as a predictor.
coefplot(salariesLM)
salariesLM_1 <- lm(
  formula = Mean ~ ShareWomen + coljob_rate + lowwage_rate + Major_category,
  data = data1
)
coefplot(salariesLM_1)

# Copy of data3 with Total and Mean on the natural-log scale, for a pairs
# plot of the transformed variables.
data3log <- data3
data3log[["Total"]] <- log(data3log[["Total"]])
data3log[["Mean"]] <- log(data3log[["Mean"]])
GGally::ggpairs(data3log)
##
Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson",
: Removing 1 row that contained a missing value
##
Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson",
: Removing 1 row that contained a missing value
##
Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1
rows containing missing values (geom_point).
## Warning: Removed 1 rows containing non-finite values (stat_density).
##
Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson",
: Removing 1 row that contained a missing value
##
Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson",
: Removing 1 row that contained a missing value
##
Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson",
: Removing 1 row that contained a missing value
##
Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1
rows containing missing values (geom_point).
## Warning: Removed 1
rows containing missing values (geom_point).
# Regress log(Mean) on ShareWomen and log(Total); both logs were taken when
# data3log was built. Then draw the coefficient plot.
salariesLM_3 <- lm(formula = Mean ~ ShareWomen + Total, data = data3log)
coefplot(salariesLM_3)
GLM Model
# Binary indicator `ab`: 1 when a major's Mean salary is at least 48127,
# 0 otherwise.
# NOTE(review): 48127 is an unexplained constant (presumably the overall
# average of Mean; confirm).
dataglm <- data1
dataglm$ab <- as.numeric(dataglm$Mean >= 48127)

# NOTE(review): no `family` is given, so glm() fits with its default
# gaussian family. If a logistic model was intended for this 0/1 response,
# family = binomial would be needed; confirm with the author.
salary1 <- glm(
  formula = ab ~ ShareWomen + Total + coljob_rate + lowwage_rate,
  data = dataglm
)
summary(salary1)
##
## Call:
## glm(formula = ab ~
ShareWomen + Total + coljob_rate + lowwage_rate,
## data = dataglm)
##
## Deviance Residuals:
## Min
1Q Median 3Q Max
## -0.64667 -0.20234
-0.04371 0.17965 0.82983
##
## Coefficients:
##
Estimate Std. Error t value Pr(>|t|)
## (Intercept)
4.530e-01 9.885e-02 4.583 8.90e-06 ***
## ShareWomen
-8.919e-01 1.104e-01 -8.076 1.23e-13 ***
## Total
-4.964e-07 3.834e-07 -1.295 0.197
## coljob_rate
7.248e-01 1.656e-01 4.377 2.11e-05 ***
## lowwage_rate
-2.909e-01 5.618e-01 -0.518 0.605
## ---
## Signif. codes: 0
'***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion
parameter for gaussian family taken to be 0.09901853)
##
## Null deviance:
27.318 on 172 degrees of freedom
## Residual deviance:
16.635 on 168 degrees of freedom
## AIC: 97.826
##
## Number of Fisher
Scoring iterations: 2
# Same indicator model without Total (glm default family).
salary2 <- glm(
  formula = ab ~ ShareWomen + coljob_rate + lowwage_rate,
  data = dataglm
)
summary(salary2)
##
## Call:
## glm(formula = ab ~
ShareWomen + coljob_rate + lowwage_rate, data = dataglm)
##
## Deviance Residuals:
## Min
1Q Median 3Q Max
## -0.63647 -0.21758
-0.04467 0.17888 0.85362
##
## Coefficients:
##
Estimate Std. Error t value Pr(>|t|)
## (Intercept)
0.4305 0.0975 4.416 1.79e-05 ***
## ShareWomen
-0.8980 0.1105 -8.123 9.14e-14 ***
## coljob_rate
0.7487 0.1649 4.540 1.07e-05 ***
## lowwage_rate
-0.3147 0.5626 -0.559 0.577
## ---
## Signif. codes: 0
'***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion
parameter for gaussian family taken to be 0.09941484)
##
## Null deviance:
27.318 on 172 degrees of freedom
## Residual deviance:
16.801 on 169 degrees of freedom
## AIC: 97.543
##
## Number of Fisher
Scoring iterations: 2
# Indicator model reduced to ShareWomen and coljob_rate (glm default family).
salary3 <- glm(formula = ab ~ ShareWomen + coljob_rate, data = dataglm)
summary(salary3)
##
## Call:
## glm(formula = ab ~
ShareWomen + coljob_rate, data = dataglm)
##
## Deviance Residuals:
## Min 1Q
Median 3Q Max
## -0.6392 -0.2104
-0.0448 0.1895 0.8147
##
## Coefficients:
## Estimate
Std. Error t value Pr(>|t|)
## (Intercept)
0.39782 0.07788 5.108 8.65e-07 ***
## ShareWomen
-0.91853 0.10407 -8.826 1.28e-15 ***
## coljob_rate
0.79168 0.14560 5.437 1.85e-07 ***
## ---
## Signif. codes: 0
'***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion
parameter for gaussian family taken to be 0.09901309)
##
## Null deviance:
27.318 on 172 degrees of freedom
## Residual deviance:
16.832 on 170 degrees of freedom
## AIC: 95.863
##
## Number of Fisher
Scoring iterations: 2
# Indicator model with major category added, shown as a coefficient plot.
salary4 <- glm(
  formula = ab ~ ShareWomen + coljob_rate + Major_category,
  data = dataglm
)
coefplot(salary4)

# Load the women-in-STEM table and list its columns.
womenstem <- read.csv(file = "womenstem.csv")
colnames(womenstem)
## [1] "Major_code"     "Major"          "Major_category" "Total"
## [5] "Men"            "Women"          "ShareWomen"     "Mean"
# Share of women against Mean salary for the womenstem majors, coloured by
# major category. The y-axis label previously read "Femal"; corrected here.
ggplot(data = womenstem, mapping = aes(x = Mean, y = ShareWomen)) +
  geom_point(mapping = aes(color = Major_category)) +
  xlab("Mean Starting Salary") +
  ylab("Percentage of Female\nStudents")