Show code
library(tidyverse)
library(lme4)
library(lmerTest)
library(performance)
library(knitr)
library(scales)
library(patchwork)
library(here)
source(here::here("R", "helpers.R"))Do school financial variables improve the attainment models?
library(tidyverse)
library(lme4)
library(lmerTest)
library(performance)
library(knitr)
library(scales)
library(patchwork)
library(here)
source(here::here("R", "helpers.R"))This document takes the baseline multilevel models from model_experiments.qmd and systematically tests whether adding school-level financial variables (from CFR/AAR returns) improves model fit.
The baseline model predicts log(ATT8SCR) using 9 fixed effects with nested random intercepts for year, Ofsted rating, and region/LA:
\[\log(\text{ATT8}) \sim \text{9 predictors} + (1|\text{year}) + (1|\text{Ofsted}) + (1|\text{region/LA})\]
We test financial variables one at a time, then in combination, comparing marginal and conditional \(R^2\).
# Load the main panel (has gorard_segregation, OFSTEDRATING_1, fin_* cols, etc.)
panel <- readRDS(here::here("data", "panel_data.rds"))
# Supplement with per-pupil columns not carried by 03a_add_finance.R
finance <- readRDS(here::here("data", "financial_returns.rds"))
finance_extra <- finance %>%
transmute(URN, academic_year,
fin_premises_costs_pp = premises_costs_pp)
panel <- panel %>%
left_join(finance_extra, by = c("URN", "academic_year"))
cat("Panel:", nrow(panel), "rows\n")Panel: 16174 rows
cat("Financial match:", sum(!is.na(panel$fin_total_income_pp)), "/", nrow(panel), "\n")Financial match: 10204 / 16174
The full model dataset requires workforce variables (so drops 2024-25) and financial data (which further reduces sample size).
model_data <- panel %>%
filter(MINORGROUP %in% c("Academy", "Maintained school")) %>%
filter(
!is.na(ATT8SCR), ATT8SCR > 0,
PTFSM6CLA1A > 0, PERCTOT > 0, PNUMEAL > 0,
!is.na(OFSTEDRATING_1), !is.na(gor_name), !is.na(LANAME),
!is.na(remained_in_the_same_school),
!is.na(teachers_on_leadership_pay_range_percent),
average_number_of_days_taken > 0,
!is.na(gorard_segregation)
) %>%
mutate(
OFSTEDRATING_1 = factor(OFSTEDRATING_1,
levels = c("Outstanding", "Good",
"Requires Improvement", "Inadequate"),
ordered = TRUE),
gor_name = factor(gor_name),
LANAME = factor(LANAME),
year_label = factor(year_label)
) %>%
droplevels()
contrasts(model_data$OFSTEDRATING_1) <-
contr.treatment(levels(model_data$OFSTEDRATING_1))
cat("Full model dataset:", nrow(model_data), "rows\n")Full model dataset: 12210 rows
cat("Years:", paste(levels(model_data$year_label), collapse = ", "), "\n")Years: 2021-22, 2022-23, 2023-24, 2024-25
cat("With financial data:", sum(!is.na(model_data$fin_total_income_pp)), "\n")With financial data: 9554
To ensure fair comparison, all models (baseline and extended) are fitted on the same rows — only those with complete financial data.
# Dataset with complete financial data for fair comparison
d_fin <- model_data %>%
filter(
!is.na(fin_total_income_pp), fin_total_income_pp > 0,
!is.na(fin_staff_income_ratio),
!is.na(fin_balance_income_ratio)
) %>%
droplevels()
contrasts(d_fin$OFSTEDRATING_1) <-
contr.treatment(levels(d_fin$OFSTEDRATING_1))
cat("Model dataset with financial data:", nrow(d_fin), "rows\n")Model dataset with financial data: 9548 rows
cat("Years:", paste(sort(unique(as.character(d_fin$year_label))), collapse = ", "), "\n")Years: 2021-22, 2022-23, 2023-24, 2024-25
cat("LAs:", n_distinct(d_fin$LANAME), "\n")LAs: 152
d_fin %>%
summarise(
across(
starts_with("fin_") & where(is.numeric),
list(
n = ~sum(!is.na(.x)),
Median = ~median(.x, na.rm = TRUE),
SD = ~sd(.x, na.rm = TRUE)
),
.names = "{.col}|{.fn}"
)
) %>%
pivot_longer(everything(), names_to = c("Variable", "Stat"), names_sep = "\\|") %>%
pivot_wider(names_from = Stat, values_from = value) %>%
mutate(
Label = var_label(Variable),
Median = round(Median, 2),
SD = round(SD, 2),
n = comma(n)
) %>%
select(Variable, Label, n, Median, SD) %>%
kable()| Variable | Label | n | Median | SD |
|---|---|---|---|---|
| fin_total_income | Total income (£) | 9,548 | 7780000.00 | 2925063.78 |
| fin_total_expenditure | Total expenditure (£) | 9,548 | 7450867.50 | 2873014.47 |
| fin_in_year_balance | In-year balance (£) | 9,548 | 228000.00 | 653071.57 |
| fin_revenue_reserve | Revenue reserve (£) | 9,548 | 505000.04 | 1131603.35 |
| fin_grant_funding | Grant funding (£) | 9,548 | 7562000.00 | 2815213.81 |
| fin_self_generated | Self-generated income (£) | 9,548 | 229509.50 | 424411.14 |
| fin_staff_costs | Total staff costs (£) | 9,548 | 5609000.00 | 2171065.82 |
| fin_teaching_staff | Teaching staff costs (£) | 8,926 | 4102546.00 | 1648305.11 |
| fin_premises_costs | Premises costs (£) | 9,548 | 416514.00 | 394619.97 |
| fin_energy | Energy costs (£) | 8,926 | 159000.00 | 108866.84 |
| fin_pupil_premium | Pupil premium income (£) | 1,685 | 275598.00 | 146758.69 |
| fin_total_income_pp | Total income per pupil (£) | 9,548 | 7208.88 | 1311.94 |
| fin_total_expenditure_pp | Total expenditure per pupil (£) | 9,548 | 6922.25 | 1379.05 |
| fin_staff_costs_pp | Staff costs per pupil (£) | 9,548 | 5210.32 | 979.37 |
| fin_teaching_staff_pp | Teaching staff costs per pupil (£) | 8,926 | 3806.37 | 736.47 |
| fin_grant_funding_pp | Grant funding per pupil (£) | 9,548 | 6985.92 | 1239.12 |
| fin_staff_income_ratio | Staff costs as % of income | 9,548 | 0.73 | 0.15 |
| fin_teaching_expenditure_ratio | Teaching staff as % of expenditure | 8,916 | 0.55 | 0.07 |
| fin_balance_income_ratio | In-year balance as % of income | 9,548 | 0.03 | 0.20 |
| fin_premises_costs_pp | fin_premises_costs_pp | 9,548 | 394.93 | 320.94 |
This exactly replicates Analysis A “All Pupils” from model_experiments.qmd, but fitted on the financial-data-complete subset for fair comparison.
mod_baseline <- lmer(
log(ATT8SCR) ~
log(PTFSM6CLA1A) + log(PERCTOT) + log(PNUMEAL) +
PTPRIORLO + ADMPOL_PT + gorard_segregation +
remained_in_the_same_school +
teachers_on_leadership_pay_range_percent +
log(average_number_of_days_taken) +
(1 | year_label) + (1 | OFSTEDRATING_1) + (1 | gor_name/LANAME),
data = d_fin,
REML = TRUE,
na.action = na.exclude,
control = lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 20000))
)
r2_baseline <- r2(mod_baseline)
cat("Baseline R² (marginal):", round(r2_baseline$R2_marginal, 4), "\n")Baseline R² (marginal): 0.6454
cat("Baseline R² (conditional):", round(r2_baseline$R2_conditional, 4), "\n")Baseline R² (conditional): 0.7904
cat("Singular?", isSingular(mod_baseline), "\n")Singular? FALSE
Each financial variable is added one at a time to the baseline model. We compare the change in marginal and conditional \(R^2\).
# Define candidate additions
# Per-pupil vars are log-transformed; ratios are left linear
candidates <- list(
"log(fin_total_income_pp)" = "Total income pp (log)",
"log(fin_total_expenditure_pp)" = "Total expenditure pp (log)",
"log(fin_staff_costs_pp)" = "Staff costs pp (log)",
"log(fin_teaching_staff_pp)" = "Teaching staff pp (log)",
"log(fin_grant_funding_pp)" = "Grant funding pp (log)",
"log(fin_premises_costs_pp)" = "Premises costs pp (log)",
"fin_staff_income_ratio" = "Staff/income ratio",
"fin_teaching_expenditure_ratio" = "Teaching/expenditure ratio",
"fin_balance_income_ratio" = "Balance/income ratio"
)
base_formula <- log(ATT8SCR) ~
log(PTFSM6CLA1A) + log(PERCTOT) + log(PNUMEAL) +
PTPRIORLO + ADMPOL_PT + gorard_segregation +
remained_in_the_same_school +
teachers_on_leadership_pay_range_percent +
log(average_number_of_days_taken) +
(1 | year_label) + (1 | OFSTEDRATING_1) + (1 | gor_name/LANAME)
results <- tibble(
term = character(),
label = character(),
r2_marginal = numeric(),
r2_conditional = numeric(),
delta_marginal = numeric(),
delta_conditional = numeric(),
coef = numeric(),
se = numeric(),
p_value = numeric(),
singular = logical()
)
# Fit each candidate
for (term in names(candidates)) {
# Build extended formula
ext_formula <- update(base_formula, paste("~ . +", term))
# Filter for valid values of the candidate
# (log-transformed vars need > 0)
d_sub <- d_fin
if (grepl("^log\\(", term)) {
inner_var <- gsub("^log\\((.+)\\)$", "\\1", term)
d_sub <- d_sub %>% filter(!is.na(.data[[inner_var]]), .data[[inner_var]] > 0)
} else {
d_sub <- d_sub %>% filter(!is.na(.data[[term]]), is.finite(.data[[term]]))
}
d_sub <- droplevels(d_sub)
contrasts(d_sub$OFSTEDRATING_1) <- contr.treatment(levels(d_sub$OFSTEDRATING_1))
tryCatch({
mod <- lmer(
ext_formula, data = d_sub, REML = TRUE, na.action = na.exclude,
control = lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 20000))
)
r2_ext <- r2(mod)
coefs <- summary(mod)$coefficients
# Find the row for our added term
term_clean <- term
row_match <- grep(gsub("[()]", "", term), rownames(coefs), fixed = FALSE)
if (length(row_match) == 0) row_match <- nrow(coefs)
results <- bind_rows(results, tibble(
term = term,
label = candidates[[term]],
r2_marginal = r2_ext$R2_marginal,
r2_conditional = r2_ext$R2_conditional,
delta_marginal = r2_ext$R2_marginal - r2_baseline$R2_marginal,
delta_conditional = r2_ext$R2_conditional - r2_baseline$R2_conditional,
coef = coefs[row_match[1], "Estimate"],
se = coefs[row_match[1], "Std. Error"],
p_value = coefs[row_match[1], "Pr(>|t|)"],
singular = isSingular(mod)
))
}, error = function(e) {
message(" Failed: ", term, " — ", e$message)
})
}results %>%
mutate(
`R² (m)` = sprintf("%.4f", r2_marginal),
`R² (c)` = sprintf("%.4f", r2_conditional),
`ΔR² (m)` = sprintf("%+.4f", delta_marginal),
`ΔR² (c)` = sprintf("%+.4f", delta_conditional),
Coef = sprintf("%.4f", coef),
SE = sprintf("%.4f", se),
p = case_when(
p_value < 0.001 ~ "< 0.001",
p_value < 0.01 ~ sprintf("%.3f", p_value),
p_value < 0.05 ~ sprintf("%.3f", p_value),
TRUE ~ sprintf("%.3f", p_value)
),
Sig = case_when(
p_value < 0.001 ~ "***",
p_value < 0.01 ~ "**",
p_value < 0.05 ~ "*",
p_value < 0.1 ~ ".",
TRUE ~ ""
)
) %>%
select(Variable = label, `R² (m)`, `R² (c)`, `ΔR² (m)`, `ΔR² (c)`,
Coef, SE, p, Sig) %>%
kable()| Variable | R² (m) | R² (c) | ΔR² (m) | ΔR² (c) | Coef | SE | p | Sig |
|---|---|---|---|---|---|---|---|---|
| Total income pp (log) | 0.6465 | 0.7896 | +0.0011 | -0.0008 | -0.0199 | 0.0058 | < 0.001 | *** |
| Total expenditure pp (log) | 0.6469 | 0.7895 | +0.0014 | -0.0008 | -0.0163 | 0.0051 | 0.001 | ** |
| Staff costs pp (log) | 0.6462 | 0.7899 | +0.0008 | -0.0005 | -0.0093 | 0.0038 | 0.016 | * |
| Teaching staff pp (log) | 0.6514 | 0.7976 | +0.0060 | +0.0072 | 0.0048 | 0.0050 | 0.330 | |
| Grant funding pp (log) | 0.6467 | 0.7894 | +0.0013 | -0.0010 | -0.0245 | 0.0063 | < 0.001 | *** |
| Premises costs pp (log) | 0.6449 | 0.7895 | -0.0006 | -0.0008 | -0.0011 | 0.0015 | 0.462 | |
| Staff/income ratio | 0.6454 | 0.7904 | -0.0001 | -0.0000 | 0.0031 | 0.0061 | 0.611 | |
| Teaching/expenditure ratio | 0.6539 | 0.7976 | +0.0084 | +0.0072 | 0.0689 | 0.0149 | < 0.001 | *** |
| Balance/income ratio | 0.6454 | 0.7904 | -0.0000 | -0.0000 | -0.0008 | 0.0045 | 0.854 |
results %>%
mutate(label = fct_reorder(label, delta_marginal)) %>%
ggplot(aes(delta_marginal, label)) +
geom_col(aes(fill = delta_marginal > 0), show.legend = FALSE) +
geom_vline(xintercept = 0, linetype = "dashed") +
scale_fill_manual(values = c(`TRUE` = "#2e6260", `FALSE` = "#7b132b")) +
scale_x_continuous(labels = label_number(accuracy = 0.0001)) +
labs(x = expression(Delta * R^2 ~ "(marginal)"),
y = NULL,
title = "Change in marginal R² from adding each financial variable")results %>%
filter(p_value < 0.1) %>%
mutate(
label = fct_reorder(label, coef),
lower = coef - 1.96 * se,
upper = coef + 1.96 * se
) %>%
ggplot(aes(coef, label)) +
geom_vline(xintercept = 0, linetype = "dashed", colour = "grey50") +
geom_pointrange(aes(xmin = lower, xmax = upper), colour = "#2e6260",
linewidth = 0.8, size = 0.8) +
labs(x = "Coefficient estimate (95% CI)", y = NULL,
title = "Significant or marginally significant financial predictors")Based on the single-variable results, we test combinations of the most promising financial variables.
# Select variables that improved marginal R² or were significant
sig_vars <- results %>%
filter(p_value < 0.05, delta_marginal > 0) %>%
arrange(desc(delta_marginal))
cat("Significant improvers (p < 0.05, positive ΔR²):\n")Significant improvers (p < 0.05, positive ΔR²):
for (i in seq_len(nrow(sig_vars))) {
cat(sprintf(" %s: ΔR²(m) = %+.4f, coef = %.4f, p = %.4f\n",
sig_vars$label[i], sig_vars$delta_marginal[i],
sig_vars$coef[i], sig_vars$p_value[i]))
} Teaching/expenditure ratio: ΔR²(m) = +0.0084, coef = 0.0689, p = 0.0000
Total expenditure pp (log): ΔR²(m) = +0.0014, coef = -0.0163, p = 0.0015
Grant funding pp (log): ΔR²(m) = +0.0013, coef = -0.0245, p = 0.0001
Total income pp (log): ΔR²(m) = +0.0011, coef = -0.0199, p = 0.0006
Staff costs pp (log): ΔR²(m) = +0.0008, coef = -0.0093, p = 0.0157
if (nrow(sig_vars) > 0) {
# Build formula with all significant vars
fin_terms <- paste(sig_vars$term, collapse = " + ")
multi_formula <- update(base_formula, paste("~ . +", fin_terms))
# Filter data
d_multi <- d_fin
for (term in sig_vars$term) {
if (grepl("^log\\(", term)) {
inner_var <- gsub("^log\\((.+)\\)$", "\\1", term)
d_multi <- d_multi %>% filter(!is.na(.data[[inner_var]]), .data[[inner_var]] > 0)
} else {
d_multi <- d_multi %>% filter(!is.na(.data[[term]]), is.finite(.data[[term]]))
}
}
d_multi <- droplevels(d_multi)
contrasts(d_multi$OFSTEDRATING_1) <- contr.treatment(levels(d_multi$OFSTEDRATING_1))
mod_multi <- lmer(
multi_formula, data = d_multi, REML = TRUE, na.action = na.exclude,
control = lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 20000))
)
r2_multi <- r2(mod_multi)
cat("Combined model R² (marginal):", round(r2_multi$R2_marginal, 4), "\n")
cat("Combined model R² (conditional):", round(r2_multi$R2_conditional, 4), "\n")
cat("Baseline R² (marginal):", round(r2_baseline$R2_marginal, 4), "\n")
cat("Improvement:", sprintf("%+.4f", r2_multi$R2_marginal - r2_baseline$R2_marginal), "\n")
cat("Singular?", isSingular(mod_multi), "\n")
cat("Observations:", nrow(d_multi), "\n\n")
summary(mod_multi)
} else {
cat("No financial variables significantly improved the baseline model.\n")
}Combined model R² (marginal): 0.6552
Combined model R² (conditional): 0.7972
Baseline R² (marginal): 0.6454
Improvement: +0.0097
Singular? FALSE
Observations: 8916
Linear mixed model fit by REML. t-tests use Satterthwaite's method [
lmerModLmerTest]
Formula: multi_formula
Data: d_multi
Control: lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 20000))
REML criterion at convergence: -17708.4
Scaled residuals:
Min 1Q Median 3Q Max
-48.736 -0.483 0.024 0.542 5.276
Random effects:
Groups Name Variance Std.Dev.
LANAME:gor_name (Intercept) 0.0008404 0.02899
gor_name (Intercept) 0.0002928 0.01711
OFSTEDRATING_1 (Intercept) 0.0022490 0.04742
year_label (Intercept) 0.0019534 0.04420
Residual 0.0076204 0.08729
Number of obs: 8916, groups:
LANAME:gor_name, 152; gor_name, 9; OFSTEDRATING_1, 4; year_label, 4
Fixed effects:
Estimate Std. Error df
(Intercept) 4.730e+00 6.926e-02 1.215e+02
log(PTFSM6CLA1A) -6.569e-02 2.979e-03 8.024e+03
log(PERCTOT) -2.098e-01 5.148e-03 8.888e+03
log(PNUMEAL) 7.277e-03 1.283e-03 6.910e+03
PTPRIORLO -5.879e-03 1.511e-04 8.662e+03
ADMPOL_PTOTHER NON SEL -1.520e-03 7.727e-03 1.430e+03
ADMPOL_PTSEL 1.037e-01 7.215e-03 8.376e+03
gorard_segregation -7.739e-02 4.947e-02 4.592e+02
remained_in_the_same_school 3.892e-04 5.349e-05 8.844e+03
teachers_on_leadership_pay_range_percent -1.314e-03 2.213e-04 8.872e+03
log(average_number_of_days_taken) -1.486e-02 2.571e-03 8.838e+03
fin_teaching_expenditure_ratio 1.014e-01 1.884e-02 8.872e+03
log(fin_total_expenditure_pp) 3.992e-02 1.712e-02 8.855e+03
log(fin_grant_funding_pp) -3.488e-02 1.690e-02 8.869e+03
log(fin_total_income_pp) 6.436e-03 1.654e-02 8.838e+03
log(fin_staff_costs_pp) -2.904e-02 9.929e-03 8.835e+03
t value Pr(>|t|)
(Intercept) 68.294 < 2e-16 ***
log(PTFSM6CLA1A) -22.055 < 2e-16 ***
log(PERCTOT) -40.746 < 2e-16 ***
log(PNUMEAL) 5.673 1.46e-08 ***
PTPRIORLO -38.899 < 2e-16 ***
ADMPOL_PTOTHER NON SEL -0.197 0.84410
ADMPOL_PTSEL 14.377 < 2e-16 ***
gorard_segregation -1.564 0.11845
remained_in_the_same_school 7.277 3.69e-13 ***
teachers_on_leadership_pay_range_percent -5.939 2.98e-09 ***
log(average_number_of_days_taken) -5.781 7.67e-09 ***
fin_teaching_expenditure_ratio 5.384 7.49e-08 ***
log(fin_total_expenditure_pp) 2.332 0.01970 *
log(fin_grant_funding_pp) -2.064 0.03901 *
log(fin_total_income_pp) 0.389 0.69711
log(fin_staff_costs_pp) -2.925 0.00346 **
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Do financial variables help predict outcomes for disadvantaged pupils specifically?
d_disadv <- d_fin %>%
filter(!is.na(ATT8SCR_FSM6CLA1A), ATT8SCR_FSM6CLA1A > 0) %>%
droplevels()
contrasts(d_disadv$OFSTEDRATING_1) <- contr.treatment(levels(d_disadv$OFSTEDRATING_1))
mod_disadv_base <- lmer(
log(ATT8SCR_FSM6CLA1A) ~
log(PTFSM6CLA1A) + log(PERCTOT) + log(PNUMEAL) +
PTPRIORLO + ADMPOL_PT + gorard_segregation +
remained_in_the_same_school +
teachers_on_leadership_pay_range_percent +
log(average_number_of_days_taken) +
(1 | year_label) + (1 | OFSTEDRATING_1) + (1 | gor_name/LANAME),
data = d_disadv, REML = TRUE, na.action = na.exclude,
control = lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 20000))
)
r2_disadv_base <- r2(mod_disadv_base)
cat("Disadvantaged baseline R² (m):", round(r2_disadv_base$R2_marginal, 4), "\n")Disadvantaged baseline R² (m): 0.5461
cat("Disadvantaged baseline R² (c):", round(r2_disadv_base$R2_conditional, 4), "\n")Disadvantaged baseline R² (c): 0.7206
results_disadv <- tibble(
term = character(), label = character(),
r2_marginal = numeric(), r2_conditional = numeric(),
delta_marginal = numeric(), delta_conditional = numeric(),
coef = numeric(), se = numeric(), p_value = numeric()
)
disadv_base_formula <- log(ATT8SCR_FSM6CLA1A) ~
log(PTFSM6CLA1A) + log(PERCTOT) + log(PNUMEAL) +
PTPRIORLO + ADMPOL_PT + gorard_segregation +
remained_in_the_same_school +
teachers_on_leadership_pay_range_percent +
log(average_number_of_days_taken) +
(1 | year_label) + (1 | OFSTEDRATING_1) + (1 | gor_name/LANAME)
for (term in names(candidates)) {
ext_formula <- update(disadv_base_formula, paste("~ . +", term))
d_sub <- d_disadv
if (grepl("^log\\(", term)) {
inner_var <- gsub("^log\\((.+)\\)$", "\\1", term)
d_sub <- d_sub %>% filter(!is.na(.data[[inner_var]]), .data[[inner_var]] > 0)
} else {
d_sub <- d_sub %>% filter(!is.na(.data[[term]]), is.finite(.data[[term]]))
}
d_sub <- droplevels(d_sub)
contrasts(d_sub$OFSTEDRATING_1) <- contr.treatment(levels(d_sub$OFSTEDRATING_1))
tryCatch({
mod <- lmer(
ext_formula, data = d_sub, REML = TRUE, na.action = na.exclude,
control = lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 20000))
)
r2_ext <- r2(mod)
coefs <- summary(mod)$coefficients
row_match <- grep(gsub("[()]", "", term), rownames(coefs), fixed = FALSE)
if (length(row_match) == 0) row_match <- nrow(coefs)
results_disadv <- bind_rows(results_disadv, tibble(
term = term, label = candidates[[term]],
r2_marginal = r2_ext$R2_marginal, r2_conditional = r2_ext$R2_conditional,
delta_marginal = r2_ext$R2_marginal - r2_disadv_base$R2_marginal,
delta_conditional = r2_ext$R2_conditional - r2_disadv_base$R2_conditional,
coef = coefs[row_match[1], "Estimate"],
se = coefs[row_match[1], "Std. Error"],
p_value = coefs[row_match[1], "Pr(>|t|)"]
))
}, error = function(e) message(" Failed: ", term))
}results_disadv %>%
mutate(
`R² (m)` = sprintf("%.4f", r2_marginal),
`ΔR² (m)` = sprintf("%+.4f", delta_marginal),
Coef = sprintf("%.4f", coef),
p = case_when(
p_value < 0.001 ~ "< 0.001",
TRUE ~ sprintf("%.3f", p_value)
),
Sig = case_when(
p_value < 0.001 ~ "***", p_value < 0.01 ~ "**",
p_value < 0.05 ~ "*", p_value < 0.1 ~ ".", TRUE ~ ""
)
) %>%
select(Variable = label, `R² (m)`, `ΔR² (m)`, Coef, p, Sig) %>%
kable()| Variable | R² (m) | ΔR² (m) | Coef | p | Sig |
|---|---|---|---|---|---|
| Total income pp (log) | 0.5461 | -0.0000 | 0.0008 | 0.913 | |
| Total expenditure pp (log) | 0.5463 | +0.0001 | -0.0010 | 0.888 | |
| Staff costs pp (log) | 0.5463 | +0.0001 | -0.0030 | 0.574 | |
| Teaching staff pp (log) | 0.5495 | +0.0033 | 0.0167 | 0.012 | * |
| Grant funding pp (log) | 0.5461 | -0.0000 | -0.0010 | 0.904 | |
| Premises costs pp (log) | 0.5457 | -0.0005 | -0.0015 | 0.432 | |
| Staff/income ratio | 0.5462 | +0.0000 | -0.0016 | 0.839 | |
| Teaching/expenditure ratio | 0.5509 | +0.0047 | 0.0732 | < 0.001 | *** |
| Balance/income ratio | 0.5461 | -0.0001 | -0.0012 | 0.840 |
d_nondisadv <- d_fin %>%
filter(!is.na(ATT8SCR_NFSM6CLA1A), ATT8SCR_NFSM6CLA1A > 0) %>%
droplevels()
contrasts(d_nondisadv$OFSTEDRATING_1) <- contr.treatment(levels(d_nondisadv$OFSTEDRATING_1))
mod_nondisadv_base <- lmer(
log(ATT8SCR_NFSM6CLA1A) ~
log(PTFSM6CLA1A) + log(PERCTOT) + log(PNUMEAL) +
PTPRIORLO + ADMPOL_PT + gorard_segregation +
remained_in_the_same_school +
teachers_on_leadership_pay_range_percent +
log(average_number_of_days_taken) +
(1 | year_label) + (1 | OFSTEDRATING_1) + (1 | gor_name/LANAME),
data = d_nondisadv, REML = TRUE, na.action = na.exclude,
control = lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 20000))
)
r2_nondisadv_base <- r2(mod_nondisadv_base)
cat("Non-disadvantaged baseline R² (m):", round(r2_nondisadv_base$R2_marginal, 4), "\n")Non-disadvantaged baseline R² (m): 0.6131
cat("Non-disadvantaged baseline R² (c):", round(r2_nondisadv_base$R2_conditional, 4), "\n")Non-disadvantaged baseline R² (c): 0.7881
results_nondisadv <- tibble(
term = character(), label = character(),
r2_marginal = numeric(), r2_conditional = numeric(),
delta_marginal = numeric(), delta_conditional = numeric(),
coef = numeric(), se = numeric(), p_value = numeric()
)
nondisadv_base_formula <- log(ATT8SCR_NFSM6CLA1A) ~
log(PTFSM6CLA1A) + log(PERCTOT) + log(PNUMEAL) +
PTPRIORLO + ADMPOL_PT + gorard_segregation +
remained_in_the_same_school +
teachers_on_leadership_pay_range_percent +
log(average_number_of_days_taken) +
(1 | year_label) + (1 | OFSTEDRATING_1) + (1 | gor_name/LANAME)
for (term in names(candidates)) {
ext_formula <- update(nondisadv_base_formula, paste("~ . +", term))
d_sub <- d_nondisadv
if (grepl("^log\\(", term)) {
inner_var <- gsub("^log\\((.+)\\)$", "\\1", term)
d_sub <- d_sub %>% filter(!is.na(.data[[inner_var]]), .data[[inner_var]] > 0)
} else {
d_sub <- d_sub %>% filter(!is.na(.data[[term]]), is.finite(.data[[term]]))
}
d_sub <- droplevels(d_sub)
contrasts(d_sub$OFSTEDRATING_1) <- contr.treatment(levels(d_sub$OFSTEDRATING_1))
tryCatch({
mod <- lmer(
ext_formula, data = d_sub, REML = TRUE, na.action = na.exclude,
control = lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 20000))
)
r2_ext <- r2(mod)
coefs <- summary(mod)$coefficients
row_match <- grep(gsub("[()]", "", term), rownames(coefs), fixed = FALSE)
if (length(row_match) == 0) row_match <- nrow(coefs)
results_nondisadv <- bind_rows(results_nondisadv, tibble(
term = term, label = candidates[[term]],
r2_marginal = r2_ext$R2_marginal, r2_conditional = r2_ext$R2_conditional,
delta_marginal = r2_ext$R2_marginal - r2_nondisadv_base$R2_marginal,
delta_conditional = r2_ext$R2_conditional - r2_nondisadv_base$R2_conditional,
coef = coefs[row_match[1], "Estimate"],
se = coefs[row_match[1], "Std. Error"],
p_value = coefs[row_match[1], "Pr(>|t|)"]
))
}, error = function(e) message(" Failed: ", term))
}results_nondisadv %>%
mutate(
`R² (m)` = sprintf("%.4f", r2_marginal),
`ΔR² (m)` = sprintf("%+.4f", delta_marginal),
Coef = sprintf("%.4f", coef),
p = case_when(
p_value < 0.001 ~ "< 0.001",
TRUE ~ sprintf("%.3f", p_value)
),
Sig = case_when(
p_value < 0.001 ~ "***", p_value < 0.01 ~ "**",
p_value < 0.05 ~ "*", p_value < 0.1 ~ ".", TRUE ~ ""
)
) %>%
select(Variable = label, `R² (m)`, `ΔR² (m)`, Coef, p, Sig) %>%
kable()| Variable | R² (m) | ΔR² (m) | Coef | p | Sig |
|---|---|---|---|---|---|
| Total income pp (log) | 0.6137 | +0.0005 | -0.0131 | 0.007 | ** |
| Total expenditure pp (log) | 0.6142 | +0.0011 | -0.0121 | 0.005 | ** |
| Staff costs pp (log) | 0.6138 | +0.0007 | -0.0074 | 0.028 | * |
| Teaching staff pp (log) | 0.6171 | +0.0039 | 0.0025 | 0.554 | |
| Grant funding pp (log) | 0.6139 | +0.0008 | -0.0177 | < 0.001 | *** |
| Premises costs pp (log) | 0.6133 | +0.0001 | 0.0008 | 0.517 | |
| Staff/income ratio | 0.6131 | -0.0000 | 0.0007 | 0.895 | |
| Teaching/expenditure ratio | 0.6181 | +0.0050 | 0.0369 | 0.004 | ** |
| Balance/income ratio | 0.6131 | +0.0000 | 0.0008 | 0.830 |
all_results <- bind_rows(
results %>% mutate(outcome = "All Pupils"),
results_disadv %>% mutate(outcome = "Disadvantaged"),
results_nondisadv %>% mutate(outcome = "Non-Disadvantaged")
)
all_results %>%
mutate(label = fct_reorder(label, delta_marginal, .fun = mean)) %>%
ggplot(aes(delta_marginal, label, colour = outcome)) +
geom_vline(xintercept = 0, linetype = "dashed", colour = "grey50") +
geom_point(size = 3, position = position_dodge(width = 0.5)) +
scale_colour_manual(values = c("#2e6260", "#4e3c56", "#abc766", "#e16fca", "#f9dd73"), name = "Outcome") +
scale_x_continuous(labels = label_number(accuracy = 0.0001)) +
labs(x = expression(Delta * R^2 ~ "(marginal)"), y = NULL,
title = "Financial variable impact varies by pupil group") +
theme(legend.position = "bottom")all_results %>%
mutate(
sig = case_when(
p_value < 0.001 ~ "***",
p_value < 0.01 ~ "**",
p_value < 0.05 ~ "*",
p_value < 0.1 ~ ".",
TRUE ~ ""
),
cell = paste0(sprintf("%+.4f", delta_marginal), " ", sig)
) %>%
select(Variable = label, outcome, cell) %>%
pivot_wider(names_from = outcome, values_from = cell) %>%
kable()| Variable | All Pupils | Disadvantaged | Non-Disadvantaged |
|---|---|---|---|
| Total income pp (log) | +0.0011 *** | -0.0000 | +0.0005 ** |
| Total expenditure pp (log) | +0.0014 ** | +0.0001 | +0.0011 ** |
| Staff costs pp (log) | +0.0008 * | +0.0001 | +0.0007 * |
| Teaching staff pp (log) | +0.0060 | +0.0033 * | +0.0039 |
| Grant funding pp (log) | +0.0013 *** | -0.0000 | +0.0008 *** |
| Premises costs pp (log) | -0.0006 | -0.0005 | +0.0001 |
| Staff/income ratio | -0.0001 | +0.0000 | -0.0000 |
| Teaching/expenditure ratio | +0.0084 *** | +0.0047 *** | +0.0050 ** |
| Balance/income ratio | -0.0000 | -0.0001 | +0.0000 |
# Collect all significant improvers across all outcomes
all_sig <- all_results %>%
filter(p_value < 0.05, delta_marginal > 0)
if (nrow(all_sig) > 0) {
best_terms <- all_sig %>%
group_by(term, label) %>%
summarise(
mean_delta = mean(delta_marginal),
n_outcomes = n(),
.groups = "drop"
) %>%
arrange(desc(n_outcomes), desc(mean_delta))
cat("Financial variables that significantly improved at least one outcome:\n\n")
for (i in seq_len(nrow(best_terms))) {
cat(sprintf(" %s: improved %d/3 outcomes, mean ΔR²(m) = %+.4f\n",
best_terms$label[i], best_terms$n_outcomes[i],
best_terms$mean_delta[i]))
}
# Fit the best combined model for All Pupils
cat("\n\n--- Best combined model for All Pupils ---\n\n")
best_fin_terms <- paste(best_terms$term, collapse = " + ")
best_formula <- update(base_formula, paste("~ . +", best_fin_terms))
d_best <- d_fin
for (term in best_terms$term) {
if (grepl("^log\\(", term)) {
inner_var <- gsub("^log\\((.+)\\)$", "\\1", term)
d_best <- d_best %>% filter(!is.na(.data[[inner_var]]), .data[[inner_var]] > 0)
} else {
d_best <- d_best %>% filter(!is.na(.data[[term]]), is.finite(.data[[term]]))
}
}
d_best <- droplevels(d_best)
contrasts(d_best$OFSTEDRATING_1) <- contr.treatment(levels(d_best$OFSTEDRATING_1))
mod_best <- lmer(
best_formula, data = d_best, REML = TRUE, na.action = na.exclude,
control = lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 20000))
)
r2_best <- r2(mod_best)
cat("Best model R² (marginal):", round(r2_best$R2_marginal, 4), "\n")
cat("Best model R² (conditional):", round(r2_best$R2_conditional, 4), "\n")
cat("Baseline R² (marginal):", round(r2_baseline$R2_marginal, 4), "\n")
cat("Total improvement:", sprintf("%+.4f", r2_best$R2_marginal - r2_baseline$R2_marginal), "\n")
cat("Observations:", nrow(d_best), "\n")
cat("Singular?", isSingular(mod_best), "\n\n")
summary(mod_best)
} else {
cat("No financial variables significantly improved any of the three baseline models.\n")
cat("This suggests that school-level financial characteristics are largely\n")
cat("orthogonal to attainment outcomes once the existing predictors\n")
cat("(deprivation, absence, prior attainment, workforce, segregation)\n")
cat("are accounted for.\n")
}Financial variables that significantly improved at least one outcome:
Teaching/expenditure ratio: improved 3/3 outcomes, mean ΔR²(m) = +0.0061
Total expenditure pp (log): improved 2/3 outcomes, mean ΔR²(m) = +0.0013
Grant funding pp (log): improved 2/3 outcomes, mean ΔR²(m) = +0.0010
Total income pp (log): improved 2/3 outcomes, mean ΔR²(m) = +0.0008
Staff costs pp (log): improved 2/3 outcomes, mean ΔR²(m) = +0.0008
Teaching staff pp (log): improved 1/3 outcomes, mean ΔR²(m) = +0.0033
--- Best combined model for All Pupils ---
Best model R² (marginal): 0.655
Best model R² (conditional): 0.797
Baseline R² (marginal): 0.6454
Total improvement: +0.0095
Observations: 8904
Singular? FALSE
Linear mixed model fit by REML. t-tests use Satterthwaite's method [
lmerModLmerTest]
Formula: best_formula
Data: d_best
Control: lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 20000))
REML criterion at convergence: -17674.5
Scaled residuals:
Min 1Q Median 3Q Max
-48.712 -0.481 0.024 0.541 5.306
Random effects:
Groups Name Variance Std.Dev.
LANAME:gor_name (Intercept) 0.0008395 0.02897
gor_name (Intercept) 0.0002931 0.01712
OFSTEDRATING_1 (Intercept) 0.0022485 0.04742
year_label (Intercept) 0.0019563 0.04423
Residual 0.0076262 0.08733
Number of obs: 8904, groups:
LANAME:gor_name, 152; gor_name, 9; OFSTEDRATING_1, 4; year_label, 4
Fixed effects:
Estimate Std. Error df
(Intercept) 4.752e+00 8.106e-02 2.255e+02
log(PTFSM6CLA1A) -6.559e-02 2.990e-03 8.032e+03
log(PERCTOT) -2.097e-01 5.152e-03 8.875e+03
log(PNUMEAL) 7.271e-03 1.285e-03 6.881e+03
PTPRIORLO -5.883e-03 1.514e-04 8.643e+03
ADMPOL_PTOTHER NON SEL -1.131e-03 7.734e-03 1.426e+03
ADMPOL_PTSEL 1.037e-01 7.229e-03 8.368e+03
gorard_segregation -7.492e-02 4.950e-02 4.594e+02
remained_in_the_same_school 3.908e-04 5.378e-05 8.826e+03
teachers_on_leadership_pay_range_percent -1.315e-03 2.214e-04 8.859e+03
log(average_number_of_days_taken) -1.485e-02 2.576e-03 8.826e+03
fin_teaching_expenditure_ratio 7.539e-02 4.917e-02 8.826e+03
log(fin_total_expenditure_pp) 3.409e-02 2.848e-02 8.813e+03
log(fin_grant_funding_pp) -3.464e-02 1.741e-02 8.862e+03
log(fin_total_income_pp) 5.112e-03 1.676e-02 8.824e+03
log(fin_staff_costs_pp) -3.909e-02 1.695e-02 8.845e+03
log(fin_teaching_staff_pp) 1.676e-02 2.375e-02 8.793e+03
t value Pr(>|t|)
(Intercept) 58.628 < 2e-16 ***
log(PTFSM6CLA1A) -21.938 < 2e-16 ***
log(PERCTOT) -40.699 < 2e-16 ***
log(PNUMEAL) 5.657 1.60e-08 ***
PTPRIORLO -38.848 < 2e-16 ***
ADMPOL_PTOTHER NON SEL -0.146 0.8837
ADMPOL_PTSEL 14.351 < 2e-16 ***
gorard_segregation -1.513 0.1308
remained_in_the_same_school 7.268 3.97e-13 ***
teachers_on_leadership_pay_range_percent -5.937 3.01e-09 ***
log(average_number_of_days_taken) -5.766 8.37e-09 ***
fin_teaching_expenditure_ratio 1.533 0.1252
log(fin_total_expenditure_pp) 1.197 0.2314
log(fin_grant_funding_pp) -1.989 0.0467 *
log(fin_total_income_pp) 0.305 0.7604
log(fin_staff_costs_pp) -2.306 0.0211 *
log(fin_teaching_staff_pp) 0.705 0.4805
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary_rows <- tibble(
Model = c("Baseline (no finance)", "Best financial model"),
`R² marginal` = c(
round(r2_baseline$R2_marginal, 4),
if (exists("mod_best")) round(r2_best$R2_marginal, 4) else NA
),
`R² conditional` = c(
round(r2_baseline$R2_conditional, 4),
if (exists("mod_best")) round(r2_best$R2_conditional, 4) else NA
)
)
if (exists("mod_multi") && nrow(sig_vars) > 0) {
summary_rows <- bind_rows(
summary_rows[1, ],
tibble(
Model = "All significant financial vars",
`R² marginal` = round(r2_multi$R2_marginal, 4),
`R² conditional` = round(r2_multi$R2_conditional, 4)
),
summary_rows[2, ]
)
}
summary_rows %>%
filter(!is.na(`R² marginal`)) %>%
kable()| Model | R² marginal | R² conditional |
|---|---|---|
| Baseline (no finance) | 0.6454 | 0.7904 |
| All significant financial vars | 0.6552 | 0.7972 |
| Best financial model | 0.6550 | 0.7970 |
Do special educational needs (SEN) and Education, Health and Care Plan (EHCP) variables add predictive power beyond the 9-variable baseline?
The panel data includes four school-level SEN variables from the school census:
| Variable | Description |
|---|---|
PSENELSE |
% pupils with an EHC plan |
TSENELSE |
Number of pupils with an EHC plan |
PSENELK |
% pupils with SEN support (but no EHC plan) |
TSENELK |
Number of pupils with SEN support |
We test each variable as a single addition to the baseline model, following the same methodology as the financial experiments above. The baseline here uses average_number_of_days_taken (raw, not log-transformed) to align with the main model specification.
sen_vars <- c("PSENELSE", "TSENELSE", "PSENELK", "TSENELK")
model_data %>%
summarise(
across(
all_of(sen_vars),
list(
n_valid = ~sum(!is.na(.x)),
pct_valid = ~round(mean(!is.na(.x)) * 100, 1),
Median = ~median(.x, na.rm = TRUE),
SD = ~sd(.x, na.rm = TRUE)
),
.names = "{.col}|{.fn}"
)
) %>%
pivot_longer(everything(), names_to = c("Variable", "Stat"), names_sep = "\\|") %>%
pivot_wider(names_from = Stat, values_from = value) %>%
mutate(
Label = var_label(Variable),
Median = round(Median, 2),
SD = round(SD, 2),
n_valid = comma(n_valid)
) %>%
select(Variable, Label, n_valid, pct_valid, Median, SD) %>%
kable()| Variable | Label | n_valid | pct_valid | Median | SD |
|---|---|---|---|---|---|
| PSENELSE | % pupils with EHC plan | 12,210 | 100 | 2.3 | 1.71 |
| TSENELSE | No. pupils with EHC plan | 12,210 | 100 | 24.0 | 18.27 |
| PSENELK | % pupils with SEN support | 12,210 | 100 | 12.8 | 5.61 |
| TSENELK | No. pupils with SEN support | 12,210 | 100 | 131.0 | 66.27 |
We create a dataset with complete SEN data for fair comparison. The baseline is re-fitted on this same subset using the raw (not log-transformed) average_number_of_days_taken.
d_sen <- model_data %>%
filter(
!is.na(PSENELSE), !is.na(PSENELK),
!is.na(TSENELSE), !is.na(TSENELK)
) %>%
droplevels()
contrasts(d_sen$OFSTEDRATING_1) <-
contr.treatment(levels(d_sen$OFSTEDRATING_1))
cat("SEN-complete dataset:", nrow(d_sen), "rows\n")SEN-complete dataset: 12210 rows
cat("Years:", paste(sort(unique(as.character(d_sen$year_label))), collapse = ", "), "\n")Years: 2021-22, 2022-23, 2023-24, 2024-25
cat("LAs:", n_distinct(d_sen$LANAME), "\n")LAs: 152
cat("Schools:", n_distinct(d_sen$URN), "\n")Schools: 3311
mod_sen_baseline <- lmer(
log(ATT8SCR) ~
log(PTFSM6CLA1A) + log(PERCTOT) + log(PNUMEAL) +
PTPRIORLO + ADMPOL_PT + gorard_segregation +
remained_in_the_same_school +
teachers_on_leadership_pay_range_percent +
average_number_of_days_taken +
(1 | year_label) + (1 | OFSTEDRATING_1) + (1 | gor_name/LANAME),
data = d_sen,
REML = TRUE,
na.action = na.exclude,
control = lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 20000))
)
r2_sen_baseline <- r2(mod_sen_baseline)
cat("SEN baseline R² (marginal):", round(r2_sen_baseline$R2_marginal, 4), "\n")SEN baseline R² (marginal): 0.6317
cat("SEN baseline R² (conditional):", round(r2_sen_baseline$R2_conditional, 4), "\n")SEN baseline R² (conditional): 0.7712
cat("Singular?", isSingular(mod_sen_baseline), "\n")Singular? FALSE
sen_candidates <- c("PSENELSE", "TSENELSE", "PSENELK", "TSENELK")
base_formula_sen <- log(ATT8SCR) ~
log(PTFSM6CLA1A) + log(PERCTOT) + log(PNUMEAL) +
PTPRIORLO + ADMPOL_PT + gorard_segregation +
remained_in_the_same_school +
teachers_on_leadership_pay_range_percent +
average_number_of_days_taken +
(1 | year_label) + (1 | OFSTEDRATING_1) + (1 | gor_name/LANAME)
sen_results <- map_dfr(sen_candidates, function(term) {
fml <- update(base_formula_sen, paste("~ . +", term))
mod <- tryCatch(
lmer(fml, data = d_sen, REML = TRUE, na.action = na.exclude,
control = lmerControl(optimizer = "bobyqa",
optCtrl = list(maxfun = 20000))),
error = function(e) NULL
)
if (is.null(mod)) {
return(tibble(term = term, label = var_label(term),
coef = NA, se = NA, t_value = NA, p_value = NA,
r2_marginal = NA, r2_conditional = NA,
delta_marginal = NA, delta_conditional = NA,
singular = NA))
}
r2_ext <- r2(mod)
coefs <- summary(mod)$coefficients
row <- which(rownames(coefs) == term)
tibble(
term = term,
label = var_label(term),
coef = coefs[row, "Estimate"],
se = coefs[row, "Std. Error"],
t_value = coefs[row, "t value"],
p_value = coefs[row, "Pr(>|t|)"],
r2_marginal = r2_ext$R2_marginal,
r2_conditional = r2_ext$R2_conditional,
delta_marginal = r2_ext$R2_marginal - r2_sen_baseline$R2_marginal,
delta_conditional = r2_ext$R2_conditional - r2_sen_baseline$R2_conditional,
singular = isSingular(mod)
)
})sen_results %>%
mutate(
coef = round(coef, 5),
se = round(se, 5),
t_value = round(t_value, 2),
p_value = round(p_value, 4),
r2_marginal = round(r2_marginal, 4),
r2_conditional = round(r2_conditional, 4),
delta_marginal = round(delta_marginal, 5),
delta_conditional = round(delta_conditional, 5),
sig = case_when(
p_value < 0.001 ~ "***",
p_value < 0.01 ~ "**",
p_value < 0.05 ~ "*",
p_value < 0.1 ~ ".",
TRUE ~ ""
)
) %>%
select(Variable = term, Label = label, Coef = coef, SE = se,
`t` = t_value, `p` = p_value, Sig = sig,
`R² marg.` = r2_marginal, `ΔR² marg.` = delta_marginal) %>%
kable()| Variable | Label | Coef | SE | t | p | Sig | R² marg. | ΔR² marg. |
|---|---|---|---|---|---|---|---|---|
| PSENELSE | % pupils with EHC plan | -0.00249 | 0.00060 | -4.12 | 0.0000 | *** | 0.6307 | -0.00101 |
| TSENELSE | No. pupils with EHC plan | -0.00012 | 0.00006 | -1.93 | 0.0538 | . | 0.6315 | -0.00023 |
| PSENELK | % pupils with SEN support | -0.00196 | 0.00019 | -10.05 | 0.0000 | *** | 0.6343 | 0.00259 |
| TSENELK | No. pupils with SEN support | -0.00009 | 0.00002 | -5.16 | 0.0000 | *** | 0.6339 | 0.00224 |
sen_results %>%
filter(!is.na(delta_marginal)) %>%
mutate(label = fct_reorder(label, delta_marginal)) %>%
ggplot(aes(delta_marginal, label)) +
geom_col(aes(fill = delta_marginal > 0), show.legend = FALSE) +
geom_vline(xintercept = 0, linetype = "dashed") +
scale_fill_manual(values = c(`TRUE` = "#2e6260", `FALSE` = "#7b132b")) +
scale_x_continuous(labels = label_number(accuracy = 0.0001)) +
labs(x = expression(Delta * R^2 ~ "(marginal)"),
y = NULL,
title = "Change in marginal R² from adding each SEN/EHCP variable")sen_results %>%
filter(!is.na(coef)) %>%
mutate(
label = fct_reorder(label, coef),
lower = coef - 1.96 * se,
upper = coef + 1.96 * se
) %>%
ggplot(aes(coef, label)) +
geom_vline(xintercept = 0, linetype = "dashed", colour = "grey50") +
geom_pointrange(aes(xmin = lower, xmax = upper), colour = "#2e6260",
linewidth = 0.8, size = 0.8) +
labs(x = "Coefficient estimate (95% CI)", y = NULL,
title = "SEN/EHCP variable coefficients")If any SEN variables are significant (p < 0.05) and improve marginal R², we fit them together.
sen_sig <- sen_results %>%
filter(p_value < 0.05, delta_marginal > 0)
cat("Significant SEN improvers:", nrow(sen_sig), "\n")Significant SEN improvers: 2
if (nrow(sen_sig) > 0) {
cat("Variables:", paste(sen_sig$term, collapse = ", "), "\n")
}Variables: PSENELK, TSENELK
if (nrow(sen_sig) > 1) {
multi_terms <- paste(sen_sig$term, collapse = " + ")
fml_multi <- update(base_formula_sen, paste("~ . +", multi_terms))
mod_sen_multi <- lmer(
fml_multi, data = d_sen, REML = TRUE, na.action = na.exclude,
control = lmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 20000))
)
r2_sen_multi <- r2(mod_sen_multi)
cat("Multi-SEN model R² (marginal):",
round(r2_sen_multi$R2_marginal, 4), "\n")
cat("Multi-SEN model R² (conditional):",
round(r2_sen_multi$R2_conditional, 4), "\n")
cat("ΔR² marginal:",
round(r2_sen_multi$R2_marginal - r2_sen_baseline$R2_marginal, 5), "\n")
cat("\nCoefficients:\n")
print(summary(mod_sen_multi)$coefficients[sen_sig$term, , drop = FALSE])
} else if (nrow(sen_sig) == 1) {
cat("\nOnly one significant SEN variable — no multi-variable model needed.\n")
cat("The single-variable result above is the best SEN model.\n")
} else {
cat("\nNo SEN variables significantly improve the baseline model.\n")
}Multi-SEN model R² (marginal): 0.6324
Multi-SEN model R² (conditional): 0.7731
ΔR² marginal: 0.00068
Coefficients:
Estimate Std. Error df t value Pr(>|t|)
PSENELK -0.0032831261 3.297113e-04 12159.61 -9.957578 2.868785e-23
TSENELK 0.0001482161 2.979569e-05 12174.08 4.974415 6.633826e-07
sen_summary <- tibble(
Model = "Baseline (no SEN)",
`R² marginal` = round(r2_sen_baseline$R2_marginal, 4),
`R² conditional` = round(r2_sen_baseline$R2_conditional, 4)
)
# Add best single-variable result
best_single <- sen_results %>% filter(!is.na(delta_marginal)) %>%
slice_max(delta_marginal, n = 1)
if (nrow(best_single) > 0 && best_single$delta_marginal > 0) {
sen_summary <- bind_rows(
sen_summary,
tibble(
Model = paste0("+ ", best_single$label, " (best single)"),
`R² marginal` = round(best_single$r2_marginal, 4),
`R² conditional` = round(best_single$r2_conditional, 4)
)
)
}
if (exists("mod_sen_multi") && nrow(sen_sig) > 1) {
sen_summary <- bind_rows(
sen_summary,
tibble(
Model = paste0("+ all significant SEN vars (",
paste(sen_sig$term, collapse = " + "), ")"),
`R² marginal` = round(r2_sen_multi$R2_marginal, 4),
`R² conditional` = round(r2_sen_multi$R2_conditional, 4)
)
)
}
sen_summary %>% kable()| Model | R² marginal | R² conditional |
|---|---|---|
| Baseline (no SEN) | 0.6317 | 0.7712 |
| + % pupils with SEN support (best single) | 0.6343 | 0.7724 |
| + all significant SEN vars (PSENELK + TSENELK) | 0.6324 | 0.7731 |
PSENELSE, PSENELK) are more interpretable than counts because they adjust for school size.PTFSM6CLA1A), so these variables may not add much beyond what FSM% already captures.PSENELSE (% with EHCP) would suggest that schools with more EHCP pupils have lower attainment even after controlling for deprivation, absence, prior attainment, and other predictors — potentially reflecting the severity of need rather than school effectiveness.TSENELSE, TSENELK) may partly proxy for school size and should be interpreted with caution.