Data preprocessing

# Load the cleaned job-postings table and drop rows whose simplified job
# title is the literal placeholder string "na".
# NOTE(review): the path is absolute and machine-specific; consider a
# project-relative path so the script runs elsewhere.
salary.0 <- read.csv(
  "/Users/luoyan1999/Desktop/WISC/479Spring/data_cleaned_2021.csv",
  header = TRUE  # spelled out: `T` is an ordinary variable that can be reassigned
) %>%
  filter(job_title_sim != "na")
# Keep only the columns used downstream, split "City, State" locations into
# two columns, build the "City ST" join key, and expand the one-letter degree
# codes into readable labels.
salary <- salary.0 %>%
  select(
    Company.Name, Industry,
    Lower.Salary, Upper.Salary, Avg.Salary.K.,
    Location, job_title_sim, Degree
  ) %>%
  # NOTE(review): a location holding more than one ", " keeps only its first
  # two pieces here (separate() warns); confirm that is acceptable.
  separate("Location", c("City", "State"), sep = ", ") %>%
  mutate(
    cityname0 = paste(City, State),
    Degree = case_when(
      is.na(Degree) ~ NA_character_,  # keep missing codes missing
      Degree == "M" ~ "Master",
      Degree == "P" ~ "PhD/Postdoc",
      TRUE ~ "Is Not Required"
    )
  )
# Attach city coordinates from `us.cities` (matched on the "City ST" key) so
# every posting becomes a point with latitude/longitude. Postings whose city
# has no match get no coordinates and are dropped.
g.salary <- left_join(salary, us.cities, by = c("cityname0" = "name")) %>%
  filter(!is.na(lat)) %>%  # test missingness directly, not against the string "NA"
  select(-c(country.etc, capital, pop)) %>%
  mutate(
    job_title_sim = as.factor(job_title_sim),
    Degree = as.factor(Degree)
  )

A view of the processed data

# Preview the first rows of the joined, geocoded postings table.
head(g.salary)
             Company.Name                Industry Lower.Salary Upper.Salary
1  Tecolote Research\n3.8     Aerospace & Defense           53           91
2            KnowBe4\n4.8       Security Services           80           90
3               PNNL\n3.8                  Energy           56           97
4 Affinity Solutions\n2.9 Advertising & Marketing           86          143
5           CyrusOne\n3.4             Real Estate           71          119
6 ClearOne Advantage\n4.1   Banks & Credit Unions           54           93
  Avg.Salary.K.        City State  job_title_sim          Degree      cityname0
1          72.0 Albuquerque    NM data scientist          Master Albuquerque NM
2          85.0  Clearwater    FL data scientist          Master  Clearwater FL
3          76.5    Richland    WA data scientist Is Not Required    Richland WA
4         114.5    New York    NY data scientist Is Not Required    New York NY
5          95.0      Dallas    TX data scientist Is Not Required      Dallas TX
6          73.5   Baltimore    MD data scientist Is Not Required   Baltimore MD
    lat    long
1 35.12 -106.62
2 27.98  -82.77
3 46.29 -119.29
4 40.67  -73.94
5 32.79  -96.77
6 39.30  -76.61

Data visualisation

An interactive map using leaflet

# Colour palette keyed on the required-degree factor.
factpal <- colorFactor("YlOrRd", g.salary$Degree)

# Interactive map with one circle marker per row of `g.salary`.
leaflet(g.salary) %>%
  addTiles() %>%
  addCircleMarkers(
    lng = ~long,
    lat = ~lat,
    color = ~factpal(Degree),      # colour encodes the required degree
    radius = ~Avg.Salary.K. / 15,  # radius is proportional to the average salary
    # Popup content is rendered as HTML, so the fields are separated with
    # <br/>; a "\n" separator would collapse into a single line.
    popup = ~paste(
      paste0("COMPANY: ", Company.Name),
      paste0("SALARY: $", Lower.Salary, "K-$", Upper.Salary, "K"),
      paste0("INDUSTRY: ", Industry),
      sep = "<br/>"
    )
  ) %>%
  addLegend("bottomright", pal = factpal, values = ~Degree)

State polygons shaded by mean salary, using ggplot2

# State outline polygons, each row tagged with the state's abbreviation
# (NA when the region name is not found in `state.name`).
state <- map_data("state")
state$state <- state.abb[match(state$region, tolower(state.name))]

# Mean of the average salary per state (rounded to 3 decimals), joined to the
# outline data; a full join keeps states with no postings and vice versa.
g.salary2 <- full_join(
  summarise(
    group_by(g.salary, State),
    avg.avg.salary = round(mean(Avg.Salary.K.), 3)
  ),
  state,
  by = c("State" = "state")
)

# Choropleth: fill each state by its mean salary; states with no value are
# drawn white.
ggplot(g.salary2, aes(x = long, y = lat, group = group, fill = avg.avg.salary)) +
  geom_polygon(color = "blue") +
  xlab("longitude") +
  ylab("latitude") +
  scale_fill_gradient(
    low = "khaki1",
    high = "indianred3",
    na.value = "white"
  ) +
  theme_bw()