Making comparisons between groups and strata
Melbourne’s daily maximum temperature from 1970 to 2020.
What are the strata in temporal data?
# Read the Melbourne temperature file, tidy the column names, and keep only
# the date parts and the temperature (renamed to `temp`).
df9 <- read_csv(here::here("data", "melb_temp.csv")) |>
  janitor::clean_names() |>
  rename(temp = maximum_temperature_degree_c) |>
  # NOTE(review): the condition was truncated to `filter(!` in the source,
  # which does not parse. Dropping rows with a missing temperature is the
  # presumed intent -- confirm against the original chunk.
  filter(!is.na(temp)) |>
  dplyr::select(year, month, day, temp)
Why can we make the comparison across months?
Because it is the same location, and same years, for each month subset.
Is some variation in temperature each month due to changing climate?
How would you check this?
What does scales="free_y" do?
# Load the olives data from classifly and relabel the levels of Region.
data(olives, package = "classifly")
df2 <- mutate(
  olives,
  Region = factor(Region, labels = c("South", "Sardinia", "North"))
)
# Boxplots of palmitic by Area, coloured by Region, with the Area levels
# reordered by fct_reorder() on palmitic.
g1 <- df2 |>
  mutate(Area = fct_reorder(Area, palmitic)) |>
  ggplot(aes(Area, palmitic, color = Region)) +
  geom_boxplot() +
  scale_color_discrete_divergingx(palette = "Zissou 1") +
  # `color = FALSE` is deprecated for suppressing a guide; use "none".
  # NOTE(review): the guide_axis() argument name was lost in the source
  # (`guide_axis( = 2)`); n.dodge = 2 is presumed -- confirm.
  guides(color = "none", x = guide_axis(n.dodge = 2))
# NOTE(review): the source ended this chain with a dangling `+`, so a final
# layer (possibly a theme() call) appears to be missing -- confirm.
# Boxplots of palmitic by Region, with axis text blanked.
g2 <- ggplot(df2, aes(Region, palmitic, color = Region)) +
  geom_boxplot() +
  scale_color_discrete_divergingx(palette = "Zissou 1") +
  # `color = FALSE` is deprecated for suppressing a guide; use "none".
  guides(color = "none") +
  theme(axis.text = element_blank())
# Density curves of palmitic, one per Region, with axis text blanked.
g3 <- ggplot(df2, aes(palmitic, color = Region)) +
  geom_density() +
  scale_color_discrete_divergingx(palette = "Zissou 1") +
  # `color = FALSE` is deprecated for suppressing a guide; use "none".
  guides(color = "none") +
  theme(axis.text = element_blank())
# Empirical cumulative distribution of palmitic, one curve per Region, with
# axis text blanked.
g4 <- ggplot(df2, aes(palmitic, color = Region)) +
  stat_ecdf() +
  scale_color_discrete_divergingx(palette = "Zissou 1") +
  # `color = FALSE` is deprecated for suppressing a guide; use "none".
  guides(color = "none") +
  theme(axis.text = element_blank())
# Put the three Region-level plots in one row of three columns, then place
# that row under g1 in a single column with relative heights 2 and 1.
g5 <- g2 + g3 + g4 +
  plot_layout(ncol = 3)

g1 + g5 +
  plot_layout(
    ncol = 1,
    heights = c(2, 1),
    guides = "collect"
  )
Comparison to all, by putting a shadow of all the data underneath the subset in each cell.
The coplot divides the numerical variable into chunks, and facets by these. Traditionally, the chunks were overlapping.
# Red shaded rectangles spanning the full panel height for the year ranges
# 1701-1714, 1756-1763 and 1775-1780. Adding a list to a ggplot adds each
# element in turn, so the same shading is reused by all three panels instead
# of being written out nine times.
# NOTE(review): the calls that opened each block of rectangle arguments were
# missing from the source; annotate("rect", ...) is presumed -- confirm.
period_shading <- lapply(
  list(c(1701, 1714), c(1756, 1763), c(1775, 1780)),
  function(period) {
    annotate("rect",
      xmin = period[1], xmax = period[2],
      ymin = -Inf, ymax = Inf,
      fill = "red", alpha = 0.3
    )
  }
)

# (A) Exports (green line) and Imports (red line) by Year, with a grey ribbon
# between the two series. The y-axis title is rendered as markdown so the two
# words can be coloured to match the lines.
# `linewidth` replaces the deprecated `size` for line geoms, matching the
# other plots in this file.
g1 <- ggplot(EastIndiesTrade, aes(Year, Exports)) +
  period_shading +
  geom_line(color = "#339933", linewidth = 2) +
  geom_line(aes(Year, Imports), color = "red", linewidth = 2) +
  geom_ribbon(aes(ymin = Exports, ymax = Imports), fill = "gray") +
  labs(y = "<span style='color:#339933'>Export</span>/<span style='color:red'>Import</span>", tag = "(A)") +
  theme(aspect.ratio = 0.7, axis.title.y = ggtext::element_markdown())

# (B) Difference Imports - Exports by Year.
# NOTE(review): the source ended this chain with a dangling `+`; a final layer
# (possibly a theme() call like the one on panel A) appears to be missing.
g2 <- ggplot(EastIndiesTrade, aes(Year, Imports - Exports)) +
  period_shading +
  geom_line(linewidth = 2) +
  labs(tag = "(B)")

# (C) Difference relative to the average of the two series:
# (Imports - Exports) / (Exports + Imports) * 2.
# NOTE(review): the source ended this chain with a dangling `+` as well.
g3 <- ggplot(EastIndiesTrade, aes(Year, (Imports - Exports) / (Exports + Imports) * 2)) +
  period_shading +
  geom_line(color = "#001a66", linewidth = 2) +
  labs(y = "Relative difference", tag = "(C)")

# NOTE(review): g1 appears twice here, as in the source -- confirm whether the
# first panel was meant to be a different plot.
g1 + g1 + g2 + g3 + plot_layout(ncol = 2)
If we were wanting to measure the effect of incorporating a data analytics competition on student learning which is the best design?
What other modifications to the design can you think of?
The goal is to demonstrate that the hexagon tile map is better than the choropleth for communicating disease incidence across Australia.
The choropleth fills geographic regions (LGAs, SA2s, …) with colour corresponding to the thyroid cancer relative difference from the overall mean. The hexagons are also filled this way.
Pairing is done on the data set. Four different data sets were used for each pattern.
Trial 1
Participant 1
Participant 2
Trial 2
Participant 1
Participant 2
Ignore the pairing
The detection rate looks to be about 50-50 for the hexagon tile map, which is better than the almost zero rate for the choropleth map.
Account for the pairing
# For the "three cities" trend, compute the proportion of rows with
# detect == 1 for each type/replicate combination, then plot it by type with
# a line joining the points that share a replicate.
hstudy |>
  filter(trend == "three cities") |>
  # Namespaced like the earlier dplyr::select() call in this file, so the
  # call does not depend on which attached package owns `select`.
  dplyr::select(type, replicate, detect) |>
  group_by(type, replicate) |>
  # mean(detect == 1) equals the original count/length ratio when detect has
  # no missing values. The original `detect[detect == 1]` also kept NA
  # entries, silently counting them as detections; here an NA gives an NA
  # proportion instead. `.groups = "drop"` returns ungrouped output and
  # silences the regrouping message.
  summarise(pdetect = mean(detect == 1), .groups = "drop") |>
  ggplot(aes(x = type, y = pdetect)) +
  geom_point() +
  geom_line(aes(group = replicate)) +
  ylim(c(0, 1)) +
  xlab("") +
  ylab("Proportion detected")
For each data set, the hexagon tile map performed better.
I am 165 cms tall.
Within each stratum, convert values to a z-score.
\[ z = \frac{x-\bar{x}}{s} \]
My z-score is 0.09.
Rob’s height is 170 cms. His z-score is -0.49.
I am relatively TALLER than Rob.
# Post-treatment against pre-treatment weight from MASS::anorexia, one panel
# per treatment, with a grey slope-1 reference line through the origin.
data(anorexia, package = "MASS")
# NOTE(review): the line that opened this plot was missing from the source
# (it began at `aes(`); ggplot(anorexia, ...) is presumed -- confirm.
ggplot(
  anorexia,
  aes(x = Prewt, y = Postwt, colour = Treat)
) +
  coord_equal() +
  xlim(c(70, 110)) +
  ylim(c(70, 110)) +
  xlab("Pre-treatment weight (lbs)") +
  ylab("Post-treatment weight (lbs)") +
  geom_abline(
    intercept = 0, slope = 1,
    colour = "grey80", linewidth = 1.25
  ) +
  geom_density2d() +
  geom_point(size = 3) +
  facet_grid(. ~ Treat) +
  theme(legend.position = "none")
Primary comparison is sex, relative to yearly trend.
Primary comparison is age, relative to yearly trend.
Primary comparison is year trend, separately for age and sex.
For choropleth vs hexagon tiles, sample participants with replacement.
# Rows of hstudy for the "three cities" trend, keeping the participant id,
# plot type, replicate and detection columns.
hstudy_sub <- hstudy |>
  filter(trend == "three cities") |>
  # Namespaced like the earlier dplyr::select() call in this file, so the
  # call does not depend on which attached package owns `select`.
  dplyr::select(id, type, replicate, detect)
# Function to compute proportions
#
# For every type/replicate combination in `df`, computes the proportion of
# rows with detect == 1, then spreads the result into a single row with one
# column per combination (column names are built by pivot_wider() from type
# and replicate).
#
# df: a data frame with columns type, replicate and detect.
# Returns the one-row wide tibble.
#
# NOTE(review): in the source this definition was never closed or returned
# (no closing brace before the next statement); both are restored here.
prop_func <- function(df) {
  df |>
    group_by(type, replicate) |>
    # Equals the original count/length ratio when detect has no missing
    # values; the original `detect[detect == 1]` kept NA entries and so
    # counted them as detections. `.groups = "drop"` replaces the separate
    # ungroup() step.
    summarise(pdetect = mean(detect == 1), .groups = "drop") |>
    pivot_wider(
      names_from = c(type, replicate),
      values_from = pdetect
    )
}
# Bootstrap the detection proportions by resampling participant ids.
# Row "0" holds the proportions for the observed data; rows "1".."nboots"
# hold the proportions for each bootstrap sample.
# No seed is set here, so results differ from run to run.
nboots <- 100
bsamps <- tibble(samp = "0", prop_func(hstudy_sub))

participant_ids <- unique(hstudy_sub$id)
# Collect the per-sample rows in a preallocated list and bind once, rather
# than growing bsamps inside the loop.
boot_rows <- vector("list", nboots)
for (i in seq_len(nboots)) {
  # NOTE(review): the sample() call was truncated in the source; the
  # accompanying text says participants are sampled with replacement, so
  # replace = TRUE is presumed -- confirm.
  samp_id <- sort(sample(participant_ids, replace = TRUE))
  # An id drawn more than once contributes its rows once per draw.
  hs_b <- bind_rows(
    lapply(samp_id, function(j) filter(hstudy_sub, id == j))
  )
  boot_rows[[i]] <- tibble(samp = as.character(i), prop_func(hs_b))
}
bsamps <- bind_rows(bsamps, boot_rows)
# Reshape the bootstrap results to one row per sample and treatment, then
# split the treatment label into its type and replicate parts.
bsamps_long <- bsamps |>
  # NOTE(review): the call that opened these arguments was missing from the
  # source; pivot_longer() over every column except samp is presumed.
  pivot_longer(
    cols = -samp,
    names_to = "treatments",
    values_to = "pdetect"
  ) |>
  separate(treatments, into = c("type", "replicate"))
# Thin grey lines for the bootstrap samples (samp != "0") with a thick line
# for samp == "0" drawn on top, one panel per replicate.
# NOTE(review): the aes() mappings and the opening of the second layer were
# missing from the source. Mapping type to x, pdetect to y and grouping by
# samp in a second geom_line() is presumed -- confirm.
ggplot() +
  geom_line(
    data = filter(bsamps_long, samp != "0"),
    aes(x = type, y = pdetect, group = samp),
    linewidth = 0.5, alpha = 0.6, colour = "grey70"
  ) +
  geom_line(
    data = filter(bsamps_long, samp == "0"),
    aes(x = type, y = pdetect, group = samp),
    linewidth = 2
  ) +
  facet_wrap(~replicate) +
  ylim(c(0, 1)) +
  xlab("") +
  ylab("Proportion detected")
For choropleth vs hexagon tiles, randomise the type of plot each participant received. This breaks any dependence between type and detection rate.
# Build null samples by shuffling the plot type within each participant id.
# Row "0" holds the proportions for the observed data; rows "1".."n_nulls"
# hold the proportions for each shuffled data set.
# No seed is set here, so results differ from run to run.
n_nulls <- 11
lsamps <- tibble(samp = "0", prop_func(hstudy_sub))

# Collect the per-sample rows in a preallocated list and bind once, rather
# than growing lsamps inside the loop.
null_rows <- vector("list", n_nulls)
for (i in seq_len(n_nulls)) {
  hs_b <- hstudy_sub |>
    group_by(id) |>
    mutate(type = sample(type)) |>
    # NOTE(review): the source left a dangling `|>` here and never closed the
    # loop; ungroup() as the final step is presumed -- confirm.
    ungroup()
  null_rows[[i]] <- tibble(samp = as.character(i), prop_func(hs_b))
}
lsamps <- bind_rows(lsamps, null_rows)
# Reshape the null-sample results to one row per sample and treatment, then
# split the treatment label into its type and replicate parts.
lsamps_long <- lsamps |>
  # NOTE(review): the call that opened these arguments was missing from the
  # source; pivot_longer() over every column except samp is presumed.
  pivot_longer(
    cols = -samp,
    names_to = "treatments",
    values_to = "pdetect"
  ) |>
  separate(treatments, into = c("type", "replicate"))
# One panel per sample (four columns), each showing the detection proportion
# by plot type with a line per replicate.
# NOTE(review): the opening of the layer was missing from the source (only
# `group=replicate)) +` survived). A geom_line() mapping type to x and
# pdetect to y is presumed; other layers may also have been lost -- confirm.
lsamps_long |>
  ggplot() +
  geom_line(aes(x = type, y = pdetect, group = replicate)) +
  facet_wrap(~samp, ncol = 4) +
  ylim(c(0, 1)) +
  xlab("") +
  ylab("Proportion detected")
