Making comparisons between groups and strata
Department of Econometrics and Business Statistics
-Edward Tufte
Melbourne’s daily maximum temperature from 1970 to 2020.
What are the strata in temporal data?
df9 <- read_csv(here::here("data", "melb_temp.csv")) |>
janitor::clean_names() |>
rename(temp = maximum_temperature_degree_c) |>
filter(!is.na(temp)) |>
dplyr::select(year, month, day, temp)
skimr::skim(df9)
── Data Summary ────────────────────────
Values
Name df9
Number of rows 18310
Number of columns 4
_______________________
Column type frequency:
character 2
numeric 2
________________________
Group variables None
── Variable type: character ────────────────────────────────
skim_variable n_missing complete_rate min max empty
1 month 0 1 2 2 0
2 day 0 1 2 2 0
n_unique whitespace
1 12 0
2 31 0
── Variable type: numeric ──────────────────────────────────
skim_variable n_missing complete_rate mean sd p0
1 year 0 1 1995. 14.5 1970
2 temp 0 1 19.9 6.48 5.7
p25 p50 p75 p100 hist
1 1983 1995 2008 2020 ▇▇▇▇▇
2 14.8 18.6 23.6 46.8 ▃▇▃▁▁
Why can we make the comparison across months?
Because it is the same location, and same years, for each month subset.
Is some variation in temperature each month due to changing climate?
How would you check this?
What is scales="free_y"
for?
data(olives, package = "classifly")
df2 <- olives |>
mutate(Region = factor(Region, labels = c("South", "Sardinia", "North")))
skimr::skim(df2)
── Data Summary ────────────────────────
Values
Name df2
Number of rows 572
Number of columns 10
_______________________
Column type frequency:
factor 2
numeric 8
________________________
Group variables None
── Variable type: factor ───────────────────────────────────
skim_variable n_missing complete_rate ordered n_unique
1 Region 0 1 FALSE 3
2 Area 0 1 FALSE 9
top_counts
1 Sou: 323, Nor: 151, Sar: 98
2 Sou: 206, Inl: 65, Cal: 56, Umb: 51
── Variable type: numeric ──────────────────────────────────
skim_variable n_missing complete_rate mean sd p0
1 palmitic 0 1 1232. 169. 610
2 palmitoleic 0 1 126. 52.5 15
3 stearic 0 1 229. 36.7 152
4 oleic 0 1 7312. 406. 6300
5 linoleic 0 1 981. 243. 448
6 linolenic 0 1 31.9 13.0 0
7 arachidic 0 1 58.1 22.0 0
8 eicosenoic 0 1 16.3 14.1 1
p25 p50 p75 p100 hist
1 1095 1201 1360 1753 ▁▂▇▆▁
2 87.8 110 169. 280 ▂▇▅▃▁
3 205 223 249 375 ▂▇▃▁▁
4 7000 7302. 7680 8410 ▁▇▇▇▁
5 771. 1030 1181. 1470 ▃▅▃▇▃
6 26 33 40.2 74 ▂▅▇▂▁
7 50 61 70 105 ▂▁▇▇▂
8 2 17 28 58 ▇▃▅▂▁
g1 <-
df2 |>
mutate(Area = fct_reorder(Area, palmitic)) |>
ggplot(aes(Area, palmitic, color = Region)) +
geom_boxplot() +
scale_color_discrete_divergingx(palette="Zissou 1") +
guides(color = FALSE, x = guide_axis(n.dodge = 2)) +
theme(aspect.ratio=0.5)
g2 <- ggplot(df2, aes(Region, palmitic, color = Region)) +
geom_boxplot() +
scale_color_discrete_divergingx(palette="Zissou 1") +
guides(color = FALSE) +
theme(axis.text = element_blank())
g3 <- ggplot(df2, aes(palmitic, color = Region)) +
geom_density() +
scale_color_discrete_divergingx(palette="Zissou 1") +
guides(color = FALSE) +
theme(axis.text = element_blank())
g4 <- ggplot(df2, aes(palmitic, color = Region)) +
stat_ecdf() +
scale_color_discrete_divergingx(palette="Zissou 1") +
guides(color = FALSE) +
theme(axis.text = element_blank())
g5 <- g2 + g3 + g4 + plot_layout(ncol=3)
g1 + g5 + plot_layout(ncol=1, heights=c(2,1),
guides = "collect")
Comparison to all, by putting a shadow of all the data underneath the subset in each cell.
The coplot divides the numerical variable into chunks, and facets by these. The chunks traditionally we overlapping.
Becker, Cleveland and Shyu, (1996); Cleveland (1993)
── Data Summary ────────────────────────
Values
Name EastIndiesTrade
Number of rows 81
Number of columns 3
_______________________
Column type frequency:
numeric 3
________________________
Group variables None
── Variable type: numeric ──────────────────────────────────
skim_variable n_missing complete_rate mean sd p0
1 Year 0 1 1740 23.5 1700
2 Exports 0 1 518. 421. 100
3 Imports 0 1 1005. 320. 460
p25 p50 p75 p100 hist
1 1720 1740 1760 1780 ▇▇▇▇▇
2 145 370 840 1395 ▇▂▃▂▂
3 835 975 1000 1550 ▃▃▇▁▅
g1 <- ggplot(EastIndiesTrade, aes(Year, Exports)) +
annotate("rect",
xmin = 1701, xmax = 1714,
ymin = -Inf, ymax = Inf,
fill = "red", alpha = 0.3
) +
annotate("rect",
xmin = 1756, xmax = 1763,
ymin = -Inf, ymax = Inf,
fill = "red", alpha = 0.3
) +
annotate("rect",
xmin = 1775, xmax = 1780,
ymin = -Inf, ymax = Inf,
fill = "red", alpha = 0.3
) +
geom_line(color = "#339933", size = 2) +
geom_line(aes(Year, Imports), color = "red", size = 2) +
geom_ribbon(aes(ymin = Exports, ymax = Imports), fill = "gray") +
labs(y = "<span style='color:#339933'>Export</span>/<span style='color:red'>Import</span>", tag = "(A)") +
theme(aspect.ratio=0.7, axis.title.y = ggtext::element_markdown())
g2 <- ggplot(EastIndiesTrade, aes(Year, Imports - Exports)) +
annotate("rect",
xmin = 1701, xmax = 1714,
ymin = -Inf, ymax = Inf,
fill = "red", alpha = 0.3
) +
annotate("rect",
xmin = 1756, xmax = 1763,
ymin = -Inf, ymax = Inf,
fill = "red", alpha = 0.3
) +
annotate("rect",
xmin = 1775, xmax = 1780,
ymin = -Inf, ymax = Inf,
fill = "red", alpha = 0.3
) +
geom_line(size = 2) +
labs(tag = "(B)") +
theme(aspect.ratio=0.7)
g3 <- ggplot(EastIndiesTrade, aes(Year, (Imports - Exports) / (Exports + Imports) * 2)) +
annotate("rect",
xmin = 1701, xmax = 1714,
ymin = -Inf, ymax = Inf,
fill = "red", alpha = 0.3
) +
annotate("rect",
xmin = 1756, xmax = 1763,
ymin = -Inf, ymax = Inf,
fill = "red", alpha = 0.3
) +
annotate("rect",
xmin = 1775, xmax = 1780,
ymin = -Inf, ymax = Inf,
fill = "red", alpha = 0.3
) +
geom_line(color = "#001a66", size = 2) +
labs(y = "Relative difference", tag = "(C)") +
theme(aspect.ratio=0.7)
g1 + g1 + g2 + g3 + plot_layout(ncol=2)
If we were wanting to measure the effect of incorporating a data analytics competition on student learning which is the best design?
METHOD A
METHOD B
What other modifications to the design can you think of?
The goal is to demonstrate that the hexagon tile map is better than the choropleth for communicating disease incidence across Australia.
The choropleth fills geographic regions (LGAs, SA2s, …) with colour corresponding to the thyroid cancer relative difference from the overall mean. The hexagons, are also filled this way.
Pairing is done on the data set. Four different data sets used for each pattern.
Trial 1
Participant 1
Participant 2
Trial 2
Participant 1
Participant 2
Ignore the pairing
Looks like detection rate about 50-50 for hexagon tile map, which is better than almost zero for choropleth map.
Account for the pairing
hstudy |>
filter(trend == "three cities") |>
select(type, replicate, detect) |>
group_by(type, replicate) |>
summarise(pdetect = length(detect[detect == 1])/length(detect)) |>
ggplot(aes(x=type, y=pdetect)) +
geom_point() +
geom_line(aes(group=replicate)) +
ylim(c(0,1)) +
xlab("") +
ylab("Proportion detected")
For each data set, the hexagon tile map performed better.
I am 165 cms tall.
Within each strata convert values to a z-score.
\[ z = \frac{x-\bar{x}}{s} \]
My z-score is 0.09.
Rob’s height is 170 cms. His z-score is -0.49.
I am relatively TALLER than Rob.
data(anorexia, package="MASS")
ggplot(data=anorexia,
aes(x=Prewt, y=Postwt,
colour=Treat)) +
coord_equal() +
xlim(c(70, 110)) + ylim(c(70, 110)) +
xlab("Pre-treatment weight (lbs)") +
ylab("Post-treatment weight (lbs)") +
geom_abline(intercept=0, slope=1,
colour="grey80", linewidth=1.25) +
geom_density2d() +
geom_point(size=3) +
facet_grid(.~Treat) +
theme(legend.position = "none")
Primary comparison is sex
, relative to yearly trend.
Primary comparison is age
, relative to yearly trend.
Primary comparison is year
trend, separately for age and sex.
For choropleth vs hexagon tiles, sample participants with replacement.
hstudy_sub <- hstudy |>
filter(trend == "three cities") |>
select(id, type, replicate, detect)
# Function to compute proportions
prop_func <- function(df) {
df_smry <- df |>
group_by(type, replicate) |>
summarise(pdetect = length(detect[detect == 1])/length(detect)) |>
ungroup() |>
pivot_wider(names_from = c(type, replicate),
values_from = pdetect)
df_smry
}
nboots <- 100
set.seed(1023)
bsamps <- tibble(samp="0", prop_func(hstudy_sub))
for (i in 1:nboots) {
samp_id <- sort(sample(unique(hstudy_sub$id),
replace=TRUE))
hs_b <- NULL
for (j in samp_id) {
x <- hstudy_sub |>
filter(id == j)
hs_b <- bind_rows(hs_b, x)
}
bsamps <- bind_rows(bsamps,
tibble(samp=as.character(i), prop_func(hs_b)))
}
bsamps_long <- bsamps |>
pivot_longer(Geography_9:Hexagons_12,
names_to = "treatments",
values_to = "pdetect") |>
separate(treatments, into=c("type", "replicate"))
ggplot() +
geom_line(data=filter(bsamps_long, samp != "0"),
aes(x=type,
y=pdetect,
group=samp),
linewidth=0.5, alpha=0.6, colour="grey70") +
geom_line(data=filter(bsamps_long,
samp == "0"),
aes(x=type,
y=pdetect,
group=samp),
linewidth=2) +
facet_wrap(~replicate) +
ylim(c(0,1)) +
xlab("") +
ylab("Proportion detected")
For choropleth vs hexagon tiles, randomise the type of plot each participant received. This breaks any dependence between type and detection rate.
n_nulls <- 11
set.seed(1110)
lsamps <- tibble(samp="0", prop_func(hstudy_sub))
for (i in 1:n_nulls) {
hs_b <- hstudy_sub |>
group_by(id) |>
mutate(type = sample(type)) |>
ungroup()
lsamps <- bind_rows(lsamps,
tibble(samp=as.character(i), prop_func(hs_b)))
}
lsamps_long <- lsamps |>
pivot_longer(Geography_9:Hexagons_12,
names_to = "treatments",
values_to = "pdetect") |>
separate(treatments, into=c("type", "replicate"))
lsamps_long |> ggplot() +
geom_line(aes(x=type,
y=pdetect,
group=replicate)) +
facet_wrap(~samp, ncol=4) +
ylim(c(0,1)) +
xlab("") +
ylab("Proportion detected")
ETC5521 Lecture 7 | ddde.numbat.space