Penguins EDA

1 Penguin Size and Mass by Sex and Species

Code
library(palmerpenguins) 
library(dplyr)
library(ggplot2)
library(tidyr)
library(DBI)
library(duckdb)
Code
# Previous in-memory data source, kept for reference:
# df <- palmerpenguins::penguins %>% drop_na()

# Connect to the on-disk DuckDB database file and build a dplyr table
# reference to its `penguins` table; later pipelines query through `df`.
con <- DBI::dbConnect(drv = duckdb::duckdb(), dbdir = "my-db.duckdb")
df <- dplyr::tbl(con, "penguins")
Code
  # Mean of every measurement column (names ending in "mm" or "g") for each
  # species/sex combination, rendered as a table.
  df %>%
    group_by(species, sex) %>%
    summarise(
      across(
        ends_with("mm") | ends_with("g"),
        \(x) mean(x, na.rm = TRUE)
      ),
      # return an ungrouped result and avoid the "grouped output" message
      .groups = "drop"
    ) %>%
    # not required, but illustrates that work has been pushed off to duckdb
    dplyr::collect() %>%
    # the database returns rows in no guaranteed order; sort so the table is
    # stable between renders (rows with a missing sex sort last per species)
    arrange(species, sex) %>%
    knitr::kable()
species sex bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
Adelie NA 37.84000 18.32000 185.6000 3540.000
Gentoo female 45.56379 14.23793 212.7069 4679.741
Adelie male 40.39041 19.07260 192.4110 4043.493
Chinstrap male 51.09412 19.25294 199.9118 3938.971
Adelie female 37.25753 17.62192 187.7945 3368.836
Gentoo NA 45.62500 14.55000 215.7500 4587.500
Gentoo male 49.47377 15.71803 221.5410 5484.836
Chinstrap female 46.57353 17.58824 191.7353 3527.206

2 Penguin Size vs Mass by Species

Code
# Scatter plot of body mass against bill length with one linear fit per
# species. Filtering and column selection are written as dplyr verbs so they
# run in duckdb; only the rows and columns needed for the plot are collected
# into R.
#
# Colour is mapped to `species` itself and the palette is a *named* vector, so
# every species is tied to its colour and legend label by name. Mapping colour
# to a column of hex strings and passing unnamed `values`/`labels` pairs them
# with the alphabetically sorted hex codes instead, which swaps the Adelie and
# Gentoo colours and legend labels.
df %>%
  filter(!is.na(species), !is.na(bill_length_mm), !is.na(body_mass_g)) %>%
  select(species, bill_length_mm, body_mass_g) %>%
  dplyr::collect() %>%
  ggplot(aes(x = bill_length_mm, y = body_mass_g, colour = species)) +
  geom_point() +
  # explicit formula keeps the default fit and silences the "using formula"
  # message
  geom_smooth(method = "lm", formula = y ~ x) +
  scale_color_manual(
    values = c(
      "Adelie" = "#ff7400",
      "Chinstrap" = "#c35ccc",
      "Gentoo" = "#057275"
    )
  ) +
  theme_bw()

Code
# Close the connection opened for this report; `shutdown = TRUE` also asks
# duckdb to shut down the database instance behind it.
DBI::dbDisconnect(conn = con, shutdown = TRUE)