Lead concetration in Flint water samples

bar graph
social
Author

Ana Luisa Bodevan

Published

November 4, 2025

This week challenge dataset in on Lead concetration in Flint water samples. Check the TidyTuesday GitHub repo for the data.

1. SETUP

1.1 Load libraries and data

Code
library(pacman)
pacman :: p_load(tidytuesdayR, tidyverse, dplyr, janitor, ggtext, showtext, scales, glue, cowplot, ggdist)

tuesdata <- tidytuesdayR::tt_load('2025-11-04')

1.2 Set theme

Code
my_theme <- function(gridline_x = TRUE, gridline_y = TRUE) {
  gridline <- element_line(
    linetype = "dashed",
    linewidth = 0.15,
    color = "#999999"
  )

  gridline_x <- if (isTRUE(gridline_x)) gridline else element_blank()
  gridline_y <- if (isTRUE(gridline_y)) gridline else element_blank()

  theme_minimal() +
    theme(
      plot.title = element_text(size = 18, face = "bold", color = "#333333", margin = margin(b = 10)),
      plot.subtitle = element_text(size = 14, color = "#999999", margin = margin(b = 10)),
      plot.caption = element_text(size = 13, color = "#777777", margin = margin(t = 15), hjust = 0),
      axis.text = element_text(size = 11, color = "#333333"),
      plot.title.position = "plot",
      plot.caption.position = "plot",
      panel.grid.minor = element_blank(),
      panel.grid.major.x = gridline_x,
      panel.grid.major.y = gridline_y,
      axis.ticks.x = element_line(linetype = "solid", linewidth = 0.25, color = "#999999"),
      axis.ticks.length.x = unit(4, units = "pt")
    )
}

2. DATA WRANGLING

2.1 Inital cleaning

Code
flint_mdeq <- tuesdata$flint_mdeq |> clean_names()
flint_vt <- tuesdata$flint_vt |> clean_names()
rm(tuesdata)

2.2 Tidy values and prepare stats table

Code
flint_mdeq <- flint_mdeq |> 
  mutate(
    excluded = is.na(lead2),
    source = "MDEQ"
  ) |>
  select(sample, lead, excluded, notes, source)

flint_vt <- flint_vt |> 
    reframe(
    sample, lead,
    excluded = FALSE,
    notes = NA_character_,
    source = "Virginia Tech"
  )

flint_all <- bind_rows(flint_mdeq, flint_vt) |>
  mutate(
    source = factor(source, levels = c("MDEQ", "Virginia Tech")),
    status = if_else(excluded, "Excluded by MDEQ", "Included")
  )

summary_table <- flint_all %>%
  group_by(source) %>%
  summarise(
    n = n(),
    mean = mean(lead),
    median = median(lead),
    sd = sd(lead),
    pct_above_5 = mean(lead > 5) * 100,
    pct_above_15 = mean(lead > 15) * 100
  ) %>%
  arrange(source)

print(summary_table)

2.3 Set thresholds

Code
thresholds <- tibble(
  name = c("EPA Acceptable (5 ppb)", "EPA Intervention (15 ppb)"),
  value = c(5, 15),
  type = c("EPA Acceptable", "EPA Intervention")
)

3. Plot

3.1 Colors and text

Code
pal <- c("MDEQ" = "#616247", "Virginia Tech" = "#DAA03D")

font_add_google("Inter", "inter")
showtext_auto()

bg_color <- "#f5f5f5"

title <- "Lead Concentration in Flint Water Samples: Government vs Citizen Testing"
subtitle <- "Testing from Virginia Tech (n = 271) and MDEQ (n = 71) tell very different stories"
caption <- "Data: Loux and Gibson (2018)\nGraphic: Ana Bodevan anabodevan.github.io\n#TidyTuesday 2025 - Week 44"

3.2 Plot density

Code
ggplot(flint_all, aes(x = lead, y = source, fill = source)) +

  ggdist::stat_halfeye(
    adjust = 0.8,       
    width = 0.6,        
    justification = -0.2,
    point_colour = NA,    
    interval_alpha = 0,   
    .width = NA           
  ) +

  geom_vline(
    data = thresholds,
    aes(xintercept = value, color = type, linetype = type),
    size = 0.8
  ) +

  scale_color_manual(
    name = "EPA threshold",
    values = c("EPA Acceptable" = "#420612", "EPA Intervention" = "#420612")
  ) +
  scale_linetype_manual(
    name = "EPA threshold",
    values = c("EPA Acceptable" = "dashed", "EPA Intervention" = "solid")
  ) +

  scale_x_continuous(
    trans = scales::pseudo_log_trans(base = 10),
    breaks = c(0, 0.5, 1, 5, 10, 15, 50, 100),
    labels = c("0", "0.5", "1", "5", "10", "15", "50", "100"),
    expand = expansion(mult = c(0.01, 0.02))
  ) +

  scale_fill_manual(values = pal, name = "Source") +

  labs(
    title = title,
    subtitle = subtitle,
    x = "Lead (Part per Billion)",
    y = NULL,
    caption = caption
  ) +


  my_theme(gridline_x = TRUE, gridline_y = FALSE) +

  theme(
    legend.position = "top",
    legend.box = "horizontal",
    legend.title = element_text(size = 12, face = "bold"),
    legend.text = element_text(size = 11),

    plot.background = element_rect(fill = bg_color, color = NA),
    plot.subtitle = element_text(color = "gray20"),

    text = element_text(family = "inter"),
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank(),
    axis.title.y = element_blank(),
    plot.margin = margin(t = 10, r = 10, b = 5, l = 10)
  ) +

  coord_cartesian(ylim = c(1.5, 3))