Data preprocessing
# Load the cleaned job-postings table and keep only the rows whose
# simplified job title is not the literal string "na".
# `header = TRUE` is spelled out: `T` is an ordinary variable that can be
# reassigned, whereas `TRUE` is a reserved word.
salary.0 <- read.csv(
  "/Users/luoyan1999/Desktop/WISC/479Spring/data_cleaned_2021.csv",
  header = TRUE
) %>%
  filter(job_title_sim != "na")
# Keep the columns used downstream, split "City, ST" locations into
# separate City / State columns, build the "City ST" lookup key used for
# the join with the city coordinates, and spell out the degree codes.
salary <- salary.0 %>%
  select(
    Company.Name, Industry,
    Lower.Salary, Upper.Salary, Avg.Salary.K.,
    Location, job_title_sim, Degree
  ) %>%
  separate(Location, into = c("City", "State"), sep = ", ") %>%
  mutate(
    cityname0 = paste(City, State),
    # "M" -> Master, "P" -> PhD/Postdoc, any other non-missing code ->
    # "Is Not Required"; a missing Degree stays NA.
    Degree = case_when(
      Degree == "M" ~ "Master",
      Degree == "P" ~ "PhD/Postdoc",
      !is.na(Degree) ~ "Is Not Required"
    )
  )
# Attach coordinates to every posting by matching the "City ST" key against
# the `name` column of us.cities (presumably the dataset shipped with the
# maps package -- confirm), which turns the table into spatial data.
g.salary <- left_join(salary, us.cities, by = c("cityname0" = "name")) %>%
  # Drop postings whose city has no match. Test for missingness with
  # is.na(); comparing the numeric `lat` with the string "NA" only worked
  # because filter() discards rows where the comparison yields NA.
  filter(!is.na(lat)) %>%
  # These us.cities columns are not used by the plots below.
  select(-c(country.etc, capital, pop)) %>%
  mutate(
    job_title_sim = as.factor(job_title_sim),
    Degree = as.factor(Degree)
  )
A view of the processed data
# Preview the first six rows of the joined table.
head(g.salary, n = 6L)
Company.Name Industry Lower.Salary Upper.Salary
1 Tecolote Research\n3.8 Aerospace & Defense 53 91
2 KnowBe4\n4.8 Security Services 80 90
3 PNNL\n3.8 Energy 56 97
4 Affinity Solutions\n2.9 Advertising & Marketing 86 143
5 CyrusOne\n3.4 Real Estate 71 119
6 ClearOne Advantage\n4.1 Banks & Credit Unions 54 93
Avg.Salary.K. City State job_title_sim Degree cityname0
1 72.0 Albuquerque NM data scientist Master Albuquerque NM
2 85.0 Clearwater FL data scientist Master Clearwater FL
3 76.5 Richland WA data scientist Is Not Required Richland WA
4 114.5 New York NY data scientist Is Not Required New York NY
5 95.0 Dallas TX data scientist Is Not Required Dallas TX
6 73.5 Baltimore MD data scientist Is Not Required Baltimore MD
lat long
1 35.12 -106.62
2 27.98 -82.77
3 46.29 -119.29
4 40.67 -73.94
5 32.79 -96.77
6 39.30 -76.61
Data visualisation
An interactive map using leaflet
# Colour palette keyed on the required degree (a factor in g.salary).
factpal <- colorFactor("YlOrRd", g.salary$Degree)

# One circle marker per posting, placed at the coordinates of its city.
leaflet(g.salary) %>%
  addTiles() %>%
  addCircleMarkers(
    lng = ~long,
    lat = ~lat,
    color = ~factpal(Degree), # marker colour distinguishes the required degree
    radius = ~Avg.Salary.K./15, # marker radius is proportional to the average salary
    # Popups are rendered as HTML, so the three detail lines are joined with
    # <br/>; a plain "\n" collapses into a space and puts everything on one
    # line.
    # NOTE(review): Company.Name values appear to carry a trailing
    # "\n<rating>" (see the head() output); strip it upstream if the rating
    # should not be shown here.
    popup = ~paste(
      paste0("COMPANY: ", Company.Name),
      paste0("SALARY: $", Lower.Salary, "K-$", Upper.Salary, "K"),
      paste0("INDUSTRY: ", Industry),
      sep = "<br/>"
    )
  ) %>%
  addLegend("bottomright", pal = factpal, values = ~Degree)
Polygons using ggplot2
# State outlines from map_data("state"), tagged with the two-letter
# abbreviation looked up from the lower-cased full state name.
state <- map_data("state")
state$state <- state.abb[match(state$region, tolower(state.name))]

# Mean of the per-posting average salary for each state (rounded to three
# decimals), joined to the outline vertices so every vertex carries its
# state's value.
g.salary2 <- g.salary %>%
  group_by(State) %>%
  summarise(avg.avg.salary = round(mean(Avg.Salary.K.), digits = 3)) %>%
  full_join(state, by = c("State" = "state"))

# Choropleth: states are filled by mean salary; states without data are
# drawn in white.
ggplot(
  data = g.salary2,
  mapping = aes(x = long, y = lat, group = group, fill = avg.avg.salary)
) +
  geom_polygon(colour = "blue") +
  scale_fill_gradient(
    low = "khaki1",
    high = "indianred3",
    na.value = "white"
  ) +
  labs(x = "longitude", y = "latitude") +
  theme_bw()