library(readr)
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.1.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
## Warning: package 'tidyr' was built under R version 4.1.3
library(ggplot2)
Introduction:
I wasn’t satisfied with the Shiny App that I made for Portfolio 2 and I felt that the dataset that I used didn’t offer anything important, so for Portfolio 3, I decided to switch things up and use a different dataset from BaseballSavant.com
I’m currently working for the Minnesota Twins as a Business Strategy and Sales Operations intern, so in this portfolio, I’m going to do some visualizations of the hitting stats of one of the Twins hitters. The player that I chose is Byron Buxton, the center fielder of the Twins, since he’s currently one of the more recognizable on the Twins and he had a good start last season before suffering from injuries. The dataset can be downloaded from here and you must download the CSV file in order for the code to run properly:
In the 1st static visualization, I did a spray chart of all of the hits that Buxton had for 2021. I downloaded the dataset and drew a strike zone with the x and z coordinates. I then changed the pitch_type variable to the pitch descriptions. I then had 2 labels: “Feet Above Homeplate” for the y-coordinate, “Feet From Homeplate” for the x-coordinate. The 1st scatterplot contained all of the pitches that Buxton faced in 2021 and the 2nd scatterplot contained the pitches that Buxton got a hit off of. Based on the 2nd scatterplot, Buxton has great power hitting inside pitchers especially high and inside fastballs and low and inside sliders.
Buxton <- read_csv("https://baseballsavant.mlb.com/statcast_search/csv?all=true&hfPT=&hfAB=&hfGT=R%7C&hfPR=&hfZ=&stadium=&hfBBL=&hfNewZones=&hfPull=&hfC=&hfSea=2021%7C&hfSit=&player_type=batter&hfOuts=&opponent=&pitcher_throws=&batter_stands=&hfSA=&game_date_gt=&game_date_lt=&hfInfield=&team=&position=&hfOutfield=&hfRO=&home_road=&batters_lookup%5B%5D=621439&hfFlag=&hfBBT=&metric_1=&hfInn=&min_pitches=0&min_results=0&group_by=name&sort_col=pitches&player_event_sort=api_p_release_speed&sort_order=desc&min_pas=0&type=details&")
## New names:
## * pitcher -> pitcher...8
## * fielder_2 -> fielder_2...42
## * pitcher -> pitcher...60
## * fielder_2 -> fielder_2...61
## Rows: 925 Columns: 92
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (16): pitch_type, player_name, events, description, des, game_type, sta...
## dbl (67): release_speed, release_pos_x, release_pos_z, batter, pitcher...8,...
## lgl (8): spin_dir, spin_rate_deprecated, break_angle_deprecated, break_len...
## date (1): game_date
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
Buxton
## # A tibble: 925 x 92
## pitch_type game_date release_speed release_pos_x release_pos_z player_name
## <chr> <date> <dbl> <dbl> <dbl> <chr>
## 1 FF 2021-10-03 94.4 -0.9 5.87 Buxton, Byron
## 2 FF 2021-10-03 94.4 -1.32 5.95 Buxton, Byron
## 3 FF 2021-10-03 94.6 -1.13 5.9 Buxton, Byron
## 4 FF 2021-10-03 94.9 -0.91 5.82 Buxton, Byron
## 5 SL 2021-10-03 85.6 -0.98 5.81 Buxton, Byron
## 6 FF 2021-10-03 96.3 -2.04 5.9 Buxton, Byron
## 7 FF 2021-10-03 97.5 -2.08 5.95 Buxton, Byron
## 8 SL 2021-10-03 82.5 -2.34 5.86 Buxton, Byron
## 9 CH 2021-10-03 85 -0.97 5.97 Buxton, Byron
## 10 SL 2021-10-03 84.4 -0.96 5.91 Buxton, Byron
## # ... with 915 more rows, and 86 more variables: batter <dbl>,
## # pitcher...8 <dbl>, events <chr>, description <chr>, spin_dir <lgl>,
## # spin_rate_deprecated <lgl>, break_angle_deprecated <lgl>,
## # break_length_deprecated <lgl>, zone <dbl>, des <chr>, game_type <chr>,
## # stand <chr>, p_throws <chr>, home_team <chr>, away_team <chr>, type <chr>,
## # hit_location <dbl>, bb_type <chr>, balls <dbl>, strikes <dbl>,
## # game_year <dbl>, pfx_x <dbl>, pfx_z <dbl>, plate_x <dbl>, ...
##Drawing The Strike Zone
x <- c(-.95,.95,.95,-.95,-.95)
z <- c(1.6,1.6,3.5,3.5,1.6)
#store in dataframe
sz <- data.frame(x,z)
##Changing Pitch Names
pitch_desc <- Buxton$pitch_type
##Changing Pitch Names
pitch_desc[which(pitch_desc=='CH')] <- "Changeup"
pitch_desc[which(pitch_desc=='CU')] <- "Curveball"
pitch_desc[which(pitch_desc=='FC')] <- "Cutter"
pitch_desc[which(pitch_desc=='FF')] <- "Four seam"
pitch_desc[which(pitch_desc=='FS')] <- "Split Flinger"
pitch_desc[which(pitch_desc=='FT')] <- "Two-Seam"
pitch_desc[which(pitch_desc=='KC')] <- "Kuckle-Curve"
pitch_desc[which(pitch_desc=='SI')] <- "Sinker"
pitch_desc[which(pitch_desc=='SL')] <- "Slider"
library(ggplot2)
library(viridis)
## Warning: package 'viridis' was built under R version 4.1.3
## Loading required package: viridisLite
ggplot() +
geom_path(data = sz, aes(x=x, y=z)) +
coord_equal() +
geom_point(data = Buxton, aes(x = plate_x, y = plate_z, size = release_speed, color = pitch_desc)) +
scale_size(range = c(-1.0,2.5))+
scale_color_viridis(discrete = TRUE, option = "C") +
labs(size = "Speed",
color = "Pitch Type",
title = "Byron Buxton - Pitch Chart") +
ylab("Feet Above Homeplate") +
xlab("Feet From Homeplate") +
theme(plot.title=element_text(face="bold",hjust=-.015,vjust=0,colour="#3C3C3C",size=20),
plot.subtitle=element_text(face="plain", hjust= -.015, vjust= .09, colour="#3C3C3C", size = 12)) +
theme(axis.text.x=element_text(vjust = .5, size=11,colour="#535353",face="bold")) +
theme(axis.text.y=element_text(size=11,colour="#535353",face="bold")) +
theme(axis.title.y=element_text(size=11,colour="#535353",face="bold",vjust=1.5)) +
theme(axis.title.x=element_text(size=11,colour="#535353",face="bold",vjust=0)) +
theme(panel.grid.major.y = element_line(color = "#bad2d4", size = .5)) +
theme(panel.grid.major.x = element_line(color = "#bdd2d4", size = .5)) +
theme(panel.background = element_rect(fill = "white"))
hits <- Buxton %>%
filter(events %in% c("single", "double", "triple", "home run"))
x <- c(-.95,.95,.95,-.95,-.95)
z <- c(1.6,1.6,3.5,3.5,1.6)
#store in dataframe
sz <- data.frame(x,z)
##Changing Pitch Names
pitch_desc <- hits$pitch_type
##Changing Pitch Names
pitch_desc[which(pitch_desc=='CH')] <- "Changeup"
pitch_desc[which(pitch_desc=='CU')] <- "Curveball"
pitch_desc[which(pitch_desc=='FC')] <- "Cutter"
pitch_desc[which(pitch_desc=='FF')] <- "Four seam"
pitch_desc[which(pitch_desc=='FS')] <- "Split Flinger"
pitch_desc[which(pitch_desc=='FT')] <- "Two-Seam"
pitch_desc[which(pitch_desc=='KC')] <- "Kuckle-Curve"
pitch_desc[which(pitch_desc=='SI')] <- "Sinker"
pitch_desc[which(pitch_desc=='SL')] <- "Slider"
library(ggplot2)
library(viridis)
ggplot() +
geom_path(data = sz, aes(x=x, y=z)) +
coord_equal() +
geom_point(data = hits, aes(x = plate_x, y = plate_z, size = release_speed, color = pitch_desc)) +
scale_size(range = c(-1.0,2.5))+
scale_color_viridis(discrete = TRUE, option = "C") +
labs(size = "Speed",
color = "Pitch Type",
title = "Byron Buxton - Hits Chart") +
ylab("Feet Above Homeplate") +
xlab("Feet From Homeplate") +
theme(plot.title=element_text(face="bold",hjust=-.015,vjust=0,colour="#3C3C3C",size=20),
plot.subtitle=element_text(face="plain", hjust= -.015, vjust= .09, colour="#3C3C3C", size = 12)) +
theme(axis.text.x=element_text(vjust = .5, size=11,colour="#535353",face="bold")) +
theme(axis.text.y=element_text(size=11,colour="#535353",face="bold")) +
theme(axis.title.y=element_text(size=11,colour="#535353",face="bold",vjust=1.5)) +
theme(axis.title.x=element_text(size=11,colour="#535353",face="bold",vjust=0)) +
theme(panel.grid.major.y = element_line(color = "#bad2d4", size = .5)) +
theme(panel.grid.major.x = element_line(color = "#bdd2d4", size = .5)) +
theme(panel.background = element_rect(fill = "white"))
In this 2nd static visualization, I did a silhouette statistic plot based on the lecture notes and I used that to measure 2 variables: release_pos_x (horizontal Release Position of the ball measured in feet from the catcher’s perspective), and release_pos_z(Vertical Release Position of the ball measured in feet from the catcher’s perspective). I used k=5 for the k-mean values.
I also visualize the histogram of silhouette statistics within each cluster. The silhouette statistics for cluster 3 are generally higher than those for 1, 2, and 4 clusters, we can conclude that it is well-defined. Cluster 5 is right behind cluster 3 in terms of being well defined.
library("cluster")
library("stringr")
library("dplyr")
library("tidymodels")
## Warning: package 'tidymodels' was built under R version 4.1.3
## -- Attaching packages -------------------------------------- tidymodels 0.2.0 --
## v broom 0.7.12 v rsample 0.1.1
## v dials 0.1.0 v tibble 3.1.6
## v infer 1.0.0 v tune 0.2.0
## v modeldata 0.1.1 v workflows 0.2.6
## v parsnip 0.2.1 v workflowsets 0.2.1
## v purrr 0.3.4 v yardstick 0.0.9
## v recipes 0.2.0
## Warning: package 'dials' was built under R version 4.1.3
## Warning: package 'infer' was built under R version 4.1.3
## Warning: package 'modeldata' was built under R version 4.1.3
## Warning: package 'parsnip' was built under R version 4.1.3
## Warning: package 'recipes' was built under R version 4.1.3
## Warning: package 'rsample' was built under R version 4.1.3
## Warning: package 'tune' was built under R version 4.1.3
## Warning: package 'workflows' was built under R version 4.1.3
## Warning: package 'workflowsets' was built under R version 4.1.3
## Warning: package 'yardstick' was built under R version 4.1.3
## -- Conflicts ----------------------------------------- tidymodels_conflicts() --
## x purrr::discard() masks scales::discard()
## x dplyr::filter() masks stats::filter()
## x recipes::fixed() masks stringr::fixed()
## x dplyr::lag() masks stats::lag()
## x yardstick::spec() masks readr::spec()
## x recipes::step() masks stats::step()
## * Learn how to get started at https://www.tidymodels.org/start/
library("readr")
library("ggplot2")
theme_set(theme_bw())
set.seed(123)
Buxton <- read_csv("savant_data.csv") %>%
mutate(id = row_number())
## New names:
## * pitcher -> pitcher...8
## * fielder_2 -> fielder_2...42
## * pitcher -> pitcher...60
## * fielder_2 -> fielder_2...61
## Rows: 925 Columns: 92
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (16): pitch_type, player_name, events, description, des, game_type, sta...
## dbl (67): release_speed, release_pos_x, release_pos_z, batter, pitcher...8,...
## lgl (8): spin_dir, spin_rate_deprecated, break_angle_deprecated, break_len...
## date (1): game_date
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
cluster_Buxton <- function(penguins, K) {
x <- Buxton %>%
select(matches("release_speed|release_pos_x|release_pos_z")) %>%
scale()
kmeans(x, center = K) %>%
augment(Buxton) %>% # creates column ".cluster" with cluster label
mutate(silhouette = silhouette(as.integer(.cluster), dist(x))[, "sil_width"])
}
cur_id <- 2
Buxton5 <- cluster_Buxton(Buxton, K = 5)
obs_i <- Buxton5 %>%
filter(id == cur_id)
ggplot(Buxton5, aes(x = release_pos_x, y = release_pos_z, col = .cluster, size = silhouette)) +
geom_point(data = obs_i, size = 5, col = "black") +
geom_point() +
scale_color_brewer(palette = "Set2") +
scale_size(range = c(5, 1))
ggplot(Buxton5) +
geom_histogram(aes(x = silhouette), binwidth = 0.05) +
theme(axis.text = element_text(size = 12)) +
theme(axis.title = element_text(size = 20)) +
facet_grid(~ .cluster)