1. Load and Validate Data

data <- read.csv("data.csv")
stopifnot(nrow(data) == 24)
stopifnot(all(data$y == rowSums(data[, paste0("c", 1:6)])))

# Factor labels for plotting
data$prompt_label  <- factor(ifelse(data$A == -1, "Direct", "CoT"),
                             levels = c("Direct", "CoT"))
data$context_label <- factor(ifelse(data$B == -1, "No Example", "Example"),
                             levels = c("No Example", "Example"))
data$model_label   <- factor(ifelse(data$C == -1, "Flash-Lite", "Pro"),
                             levels = c("Flash-Lite", "Pro"))

cat("Data loaded:", nrow(data), "runs\n")
## Data loaded: 24 runs
print(data[order(data$run_order),
           c("run_order", "A", "B", "C", "replicate", "c1", "c2", "c3", "c4", "c5", "c6", "y")])
##    run_order  A  B  C replicate c1 c2 c3 c4 c5 c6 y
## 1          1 -1  1 -1         3  1  1  0  1  1  1 5
## 2          2  1 -1 -1         2  1  1  0  1  1  1 5
## 3          3 -1  1  1         3  1  1  1  1  1  1 6
## 4          4  1  1 -1         1  1  1  1  1  1  1 6
## 5          5  1  1 -1         3  1  1  0  1  1  1 5
## 6          6 -1  1 -1         2  1  1  1  1  1  1 6
## 7          7  1 -1  1         2  1  1  1  1  1  1 6
## 8          8 -1 -1 -1         1  1  1  0  1  1  1 5
## 9          9  1  1 -1         2  1  1  1  1  1  1 6
## 10        10 -1 -1  1         2  1  1  1  1  1  1 6
## 11        11 -1 -1 -1         2  1  1  0  1  1  1 5
## 12        12 -1  1  1         2  1  1  1  1  1  1 6
## 13        13  1  1  1         2  1  1  1  1  1  1 6
## 14        14  1 -1  1         1  1  1  1  1  1  1 6
## 15        15 -1 -1  1         3  1  1  1  1  1  1 6
## 16        16  1  1  1         1  1  1  1  1  1  1 6
## 17        17 -1  1 -1         1  1  1  1  1  1  1 6
## 18        18 -1  1  1         1  1  1  1  1  1  1 6
## 19        19 -1 -1 -1         3  1  1  0  1  1  1 5
## 20        20 -1 -1  1         1  1  1  1  1  1  1 6
## 21        21  1  1  1         3  1  1  1  1  1  1 6
## 22        22  1 -1 -1         3  1  1  0  1  1  1 5
## 23        23  1 -1 -1         1  1  1  0  1  1  1 5
## 24        24  1 -1  1         3  1  1  1  1  1  1 6

2. Treatment Combination Means

combo_means <- aggregate(y ~ A + B + C, data = data, FUN = mean)
combo_vars  <- aggregate(y ~ A + B + C, data = data, FUN = var)
combo_summary <- merge(combo_means, combo_vars, by = c("A", "B", "C"),
                       suffixes = c("_mean", "_var"))

# Add readable labels
combo_summary$Prompt  <- ifelse(combo_summary$A == -1, "Direct", "CoT")
combo_summary$Context <- ifelse(combo_summary$B == -1, "No Example", "Example")
combo_summary$Model   <- ifelse(combo_summary$C == -1, "Flash-Lite", "Pro")

print(combo_summary[, c("Prompt", "Context", "Model", "y_mean", "y_var")])
##   Prompt    Context      Model   y_mean     y_var
## 1 Direct No Example Flash-Lite 5.000000 0.0000000
## 2 Direct No Example        Pro 6.000000 0.0000000
## 3 Direct    Example Flash-Lite 5.666667 0.3333333
## 4 Direct    Example        Pro 6.000000 0.0000000
## 5    CoT No Example Flash-Lite 5.000000 0.0000000
## 6    CoT No Example        Pro 6.000000 0.0000000
## 7    CoT    Example Flash-Lite 5.666667 0.3333333
## 8    CoT    Example        Pro 6.000000 0.0000000

3. Factorial Effects (Manual Computation)

n_reps <- 3
k <- 3
N <- 2^k

# Grand mean
grand_mean <- mean(data$y)

# Main effects
eff_A <- mean(data$y[data$A == 1]) - mean(data$y[data$A == -1])
eff_B <- mean(data$y[data$B == 1]) - mean(data$y[data$B == -1])
eff_C <- mean(data$y[data$C == 1]) - mean(data$y[data$C == -1])

# Two-factor interactions
eff_AB <- 0.5 * (
  (mean(data$y[data$A ==  1 & data$B ==  1]) + mean(data$y[data$A == -1 & data$B == -1])) -
  (mean(data$y[data$A ==  1 & data$B == -1]) + mean(data$y[data$A == -1 & data$B ==  1]))
)
eff_AC <- 0.5 * (
  (mean(data$y[data$A ==  1 & data$C ==  1]) + mean(data$y[data$A == -1 & data$C == -1])) -
  (mean(data$y[data$A ==  1 & data$C == -1]) + mean(data$y[data$A == -1 & data$C ==  1]))
)
eff_BC <- 0.5 * (
  (mean(data$y[data$B ==  1 & data$C ==  1]) + mean(data$y[data$B == -1 & data$C == -1])) -
  (mean(data$y[data$B ==  1 & data$C == -1]) + mean(data$y[data$B == -1 & data$C ==  1]))
)

# Three-factor interaction
eff_ABC <- 0.5 * (
  (mean(data$y[data$A ==  1 & data$B ==  1 & data$C ==  1]) +
   mean(data$y[data$A ==  1 & data$B == -1 & data$C == -1]) +
   mean(data$y[data$A == -1 & data$B ==  1 & data$C == -1]) +
   mean(data$y[data$A == -1 & data$B == -1 & data$C ==  1])) -
  (mean(data$y[data$A == -1 & data$B == -1 & data$C == -1]) +
   mean(data$y[data$A == -1 & data$B ==  1 & data$C ==  1]) +
   mean(data$y[data$A ==  1 & data$B == -1 & data$C ==  1]) +
   mean(data$y[data$A ==  1 & data$B ==  1 & data$C == -1]))
)

effects <- c(A = eff_A, B = eff_B, C = eff_C,
             AB = eff_AB, AC = eff_AC, BC = eff_BC, ABC = eff_ABC)

# Pooled error variance
SSE <- sum(aggregate(y ~ A + B + C, data = data,
                     FUN = function(x) sum((x - mean(x))^2))$y)
df_error <- N * (n_reps - 1)  # 8 * 2 = 16
MSE <- SSE / df_error

# Standard error of effects
SE_effect <- sqrt(4 * MSE / (N * n_reps))

# t-statistics and p-values
t_stats  <- effects / SE_effect
p_values <- 2 * pt(abs(t_stats), df = df_error, lower.tail = FALSE)

# 95% confidence intervals
t_crit   <- qt(0.975, df = df_error)
ci_lower <- effects - t_crit * SE_effect
ci_upper <- effects + t_crit * SE_effect

effects_table <- data.frame(
  Effect   = names(effects),
  Estimate = round(effects, 4),
  SE       = round(rep(SE_effect, length(effects)), 4),
  t_stat   = round(t_stats, 4),
  p_value  = round(p_values, 6),
  CI_lower = round(ci_lower, 4),
  CI_upper = round(ci_upper, 4),
  Signif   = ifelse(p_values < 0.001, "***",
             ifelse(p_values < 0.01,  "**",
             ifelse(p_values < 0.05,  "*", "")))
)
rownames(effects_table) <- NULL

cat("Grand Mean:", round(grand_mean, 4), "\n")
## Grand Mean: 5.6667
cat("Pooled MSE:", round(MSE, 4), "  df_error:", df_error, "\n")
## Pooled MSE: 0.0833   df_error: 16
cat("SE of effects:", round(SE_effect, 4), "\n\n")
## SE of effects: 0.1179
print(effects_table)
##   Effect Estimate     SE  t_stat  p_value CI_lower CI_upper Signif
## 1      A   0.0000 0.1179  0.0000 1.000000  -0.2498   0.2498       
## 2      B   0.3333 0.1179  2.8284 0.012109   0.0835   0.5832      *
## 3      C   0.6667 0.1179  5.6569 0.000036   0.4168   0.9165    ***
## 4     AB   0.0000 0.1179  0.0000 1.000000  -0.2498   0.2498       
## 5     AC   0.0000 0.1179  0.0000 1.000000  -0.2498   0.2498       
## 6     BC  -0.3333 0.1179 -2.8284 0.012109  -0.5832  -0.0835      *
## 7    ABC   0.0000 0.1179  0.0000 1.000000  -0.2498   0.2498

4. ANOVA Table

fit <- lm(y ~ A * B * C, data = data)
cat("=== ANOVA Table ===\n")
## === ANOVA Table ===
print(anova(fit))
## Analysis of Variance Table
## 
## Response: y
##           Df  Sum Sq Mean Sq F value    Pr(>F)    
## A          1 0.00000 0.00000       0   1.00000    
## B          1 0.66667 0.66667       8   0.01211 *  
## C          1 2.66667 2.66667      32 3.571e-05 ***
## A:B        1 0.00000 0.00000       0   1.00000    
## A:C        1 0.00000 0.00000       0   1.00000    
## B:C        1 0.66667 0.66667       8   0.01211 *  
## A:B:C      1 0.00000 0.00000       0   1.00000    
## Residuals 16 1.33333 0.08333                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
cat("\n=== Model Summary ===\n")
## 
## === Model Summary ===
print(summary(fit))
## 
## Call:
## lm(formula = y ~ A * B * C, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.6667  0.0000  0.0000  0.0000  0.3333 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.667e+00  5.893e-02  96.167  < 2e-16 ***
## A           -4.221e-16  5.893e-02   0.000   1.0000    
## B            1.667e-01  5.893e-02   2.828   0.0121 *  
## C            3.333e-01  5.893e-02   5.657 3.57e-05 ***
## A:B         -4.931e-16  5.893e-02   0.000   1.0000    
## A:C          4.700e-16  5.893e-02   0.000   1.0000    
## B:C         -1.667e-01  5.893e-02  -2.828   0.0121 *  
## A:B:C        3.985e-16  5.893e-02   0.000   1.0000    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2887 on 16 degrees of freedom
## Multiple R-squared:   0.75,  Adjusted R-squared:  0.6406 
## F-statistic: 6.857 on 7 and 16 DF,  p-value: 0.0007243

5. Main Effects Plot

par(mfrow = c(1, 3), mar = c(5, 4, 3, 1))

# A: Prompt Strategy
means_A <- tapply(data$y, data$prompt_label, mean)
barplot(means_A, main = "Factor A: Prompt Strategy",
        ylab = "Mean Constraints Satisfied",
        col = c("#4A90D9", "#D9764A"), border = NA,
        ylim = c(0, 6.5))
text(x = c(0.7, 1.9), y = means_A + 0.15, labels = round(means_A, 2), cex = 0.9)

# B: Context
means_B <- tapply(data$y, data$context_label, mean)
barplot(means_B, main = "Factor B: Context",
        ylab = "Mean Constraints Satisfied",
        col = c("#4A90D9", "#D9764A"), border = NA,
        ylim = c(0, 6.5))
text(x = c(0.7, 1.9), y = means_B + 0.15, labels = round(means_B, 2), cex = 0.9)

# C: Model
means_C <- tapply(data$y, data$model_label, mean)
barplot(means_C, main = "Factor C: Model",
        ylab = "Mean Constraints Satisfied",
        col = c("#4A90D9", "#D9764A"), border = NA,
        ylim = c(0, 6.5))
text(x = c(0.7, 1.9), y = means_C + 0.15, labels = round(means_C, 2), cex = 0.9)

6. Interaction Plots

par(mfrow = c(1, 3), mar = c(5, 4, 3, 2))

# A x B
means_AB <- aggregate(y ~ prompt_label + context_label, data = data, FUN = mean)
interaction.plot(
  x.factor = means_AB$prompt_label,
  trace.factor = means_AB$context_label,
  response = means_AB$y,
  main = "A x B Interaction\n(Prompt x Context)",
  xlab = "Prompt Strategy", ylab = "Mean Constraints Satisfied",
  trace.label = "Context", col = c("#D94A4A", "#4A90D9"), lwd = 2,
  ylim = c(4.5, 6.5), type = "b", pch = c(16, 17)
)

# A x C
means_AC <- aggregate(y ~ prompt_label + model_label, data = data, FUN = mean)
interaction.plot(
  x.factor = means_AC$prompt_label,
  trace.factor = means_AC$model_label,
  response = means_AC$y,
  main = "A x C Interaction\n(Prompt x Model)",
  xlab = "Prompt Strategy", ylab = "Mean Constraints Satisfied",
  trace.label = "Model", col = c("#D94A4A", "#4A90D9"), lwd = 2,
  ylim = c(4.5, 6.5), type = "b", pch = c(16, 17)
)

# B x C
means_BC <- aggregate(y ~ context_label + model_label, data = data, FUN = mean)
interaction.plot(
  x.factor = means_BC$context_label,
  trace.factor = means_BC$model_label,
  response = means_BC$y,
  main = "B x C Interaction\n(Context x Model)",
  xlab = "Context", ylab = "Mean Constraints Satisfied",
  trace.label = "Model", col = c("#D94A4A", "#4A90D9"), lwd = 2,
  ylim = c(4.5, 6.5), type = "b", pch = c(16, 17)
)

7. Diagnostic Plots

par(mfrow = c(1, 2), mar = c(5, 4, 3, 1))

# Residuals vs Fitted
plot(fitted(fit), resid(fit),
     main = "Residuals vs Fitted Values",
     xlab = "Fitted Values", ylab = "Residuals",
     pch = 19, col = rgb(0, 0, 0, 0.6), cex = 1.2)
abline(h = 0, lty = 2, col = "red", lwd = 1.5)

# Normal Q-Q
qqnorm(resid(fit), main = "Normal Q-Q Plot of Residuals",
       pch = 19, col = rgb(0, 0, 0, 0.6), cex = 1.2)
qqline(resid(fit), col = "red", lwd = 1.5)

cat("=== Diagnostic Notes ===\n")
## === Diagnostic Notes ===
cat("Unique fitted values:", length(unique(fitted(fit))), "\n")
## Unique fitted values: 6
cat("Unique residual values:", length(unique(round(resid(fit), 6))), "\n")
## Unique residual values: 3
cat("Residual range:", range(resid(fit)), "\n")
## Residual range: -0.6666667 0.3333333
cat("\nNote: The discrete nature of the response (only values 5 and 6 observed)\n")
## 
## Note: The discrete nature of the response (only values 5 and 6 observed)
cat("and zero variance in 6 of 8 treatment groups creates non-standard residual\n")
## and zero variance in 6 of 8 treatment groups creates non-standard residual
cat("patterns. Homogeneity of variance assumption is violated.\n")
## patterns. Homogeneity of variance assumption is violated.

8. Per-Constraint Breakdown

cat("=== PER-CONSTRAINT PASS RATES ===\n\n")
## === PER-CONSTRAINT PASS RATES ===
constraint_names <- c(
  c1 = "Word Ban", c2 = "Sentence Cap", c3 = "5 Bullets",
  c4 = "Opening Word", c5 = "No Questions", c6 = "Sign-off"
)

pass_rates <- data.frame(
  Constraint = character(),
  Overall = numeric(),
  Direct = numeric(), CoT = numeric(),
  NoExample = numeric(), Example = numeric(),
  FlashLite = numeric(), Pro = numeric(),
  stringsAsFactors = FALSE
)

for (cname in paste0("c", 1:6)) {
  row <- data.frame(
    Constraint = constraint_names[cname],
    Overall    = round(mean(data[[cname]]), 3),
    Direct     = round(mean(data[[cname]][data$A == -1]), 3),
    CoT        = round(mean(data[[cname]][data$A ==  1]), 3),
    NoExample  = round(mean(data[[cname]][data$B == -1]), 3),
    Example    = round(mean(data[[cname]][data$B ==  1]), 3),
    FlashLite  = round(mean(data[[cname]][data$C == -1]), 3),
    Pro        = round(mean(data[[cname]][data$C ==  1]), 3)
  )
  pass_rates <- rbind(pass_rates, row)
}
print(pass_rates)
##      Constraint Overall Direct   CoT NoExample Example FlashLite Pro
## c1     Word Ban   1.000  1.000 1.000       1.0   1.000     1.000   1
## c2 Sentence Cap   1.000  1.000 1.000       1.0   1.000     1.000   1
## c3    5 Bullets   0.667  0.667 0.667       0.5   0.833     0.333   1
## c4 Opening Word   1.000  1.000 1.000       1.0   1.000     1.000   1
## c5 No Questions   1.000  1.000 1.000       1.0   1.000     1.000   1
## c6     Sign-off   1.000  1.000 1.000       1.0   1.000     1.000   1
# Heatmap-style per-constraint pass rates by treatment combination
constraint_by_combo <- matrix(NA, nrow = 6, ncol = 8)
rownames(constraint_by_combo) <- names(constraint_names)
combos <- expand.grid(A = c(-1, 1), B = c(-1, 1), C = c(-1, 1))
combo_labels <- paste0(
  ifelse(combos$A == -1, "Dir", "CoT"), "/",
  ifelse(combos$B == -1, "NoEx", "Ex"), "/",
  ifelse(combos$C == -1, "FL", "Pro")
)
colnames(constraint_by_combo) <- combo_labels

for (i in 1:8) {
  sub <- data[data$A == combos$A[i] & data$B == combos$B[i] & data$C == combos$C[i], ]
  for (j in 1:6) {
    constraint_by_combo[j, i] <- mean(sub[[paste0("c", j)]])
  }
}

par(mar = c(6, 8, 3, 2))
image(1:8, 1:6, t(constraint_by_combo), col = c("#D94A4A", "#FFD700", "#4AD97A"),
      breaks = c(-0.01, 0.4, 0.8, 1.01),
      xlab = "", ylab = "", axes = FALSE,
      main = "Pass Rate by Constraint and Treatment Combination")
axis(1, at = 1:8, labels = combo_labels, las = 2, cex.axis = 0.75)
axis(2, at = 1:6, labels = unname(constraint_names), las = 1, cex.axis = 0.85)
# Add text values
for (i in 1:8) {
  for (j in 1:6) {
    text(i, j, round(constraint_by_combo[j, i], 2), cex = 0.8)
  }
}

9. Detailed c3 (Bullet Count) Analysis

cat("=== c3 (Exactly 5 Bullets) — The Only Variable Constraint ===\n\n")
## === c3 (Exactly 5 Bullets) — The Only Variable Constraint ===
cat("Overall pass rate:", mean(data$c3), "\n\n")
## Overall pass rate: 0.6666667
cat("By Model:\n")
## By Model:
cat("  Flash-Lite:", mean(data$c3[data$C == -1]), "\n")
##   Flash-Lite: 0.3333333
cat("  Pro:       ", mean(data$c3[data$C ==  1]), "\n\n")
##   Pro:        1
cat("Within Flash-Lite, by Context:\n")
## Within Flash-Lite, by Context:
fl <- data[data$C == -1, ]
cat("  No Example:", mean(fl$c3[fl$B == -1]), "(", sum(fl$c3[fl$B == -1]), "/", sum(fl$B == -1), ")\n")
##   No Example: 0 ( 0 / 6 )
cat("  Example:   ", mean(fl$c3[fl$B ==  1]), "(", sum(fl$c3[fl$B ==  1]), "/", sum(fl$B ==  1), ")\n\n")
##   Example:    0.6666667 ( 4 / 6 )
cat("Within Flash-Lite, by Prompt:\n")
## Within Flash-Lite, by Prompt:
cat("  Direct:", mean(fl$c3[fl$A == -1]), "(", sum(fl$c3[fl$A == -1]), "/", sum(fl$A == -1), ")\n")
##   Direct: 0.3333333 ( 2 / 6 )
cat("  CoT:   ", mean(fl$c3[fl$A ==  1]), "(", sum(fl$c3[fl$A ==  1]), "/", sum(fl$A ==  1), ")\n\n")
##   CoT:    0.3333333 ( 2 / 6 )
cat("Within Flash-Lite, by A x B:\n")
## Within Flash-Lite, by A x B:
cat("  Direct/NoEx:", mean(fl$c3[fl$A == -1 & fl$B == -1]),"\n")
##   Direct/NoEx: 0
cat("  Direct/Ex:  ", mean(fl$c3[fl$A == -1 & fl$B ==  1]),"\n")
##   Direct/Ex:   0.6666667
cat("  CoT/NoEx:   ", mean(fl$c3[fl$A ==  1 & fl$B == -1]),"\n")
##   CoT/NoEx:    0
cat("  CoT/Ex:     ", mean(fl$c3[fl$A ==  1 & fl$B ==  1]),"\n")
##   CoT/Ex:      0.6666667

10. Thinking Tokens Summary

cat("=== Thinking Tokens (Pro Model Only) ===\n\n")
## === Thinking Tokens (Pro Model Only) ===
pro <- data[data$C == 1, ]
cat("Mean thinking tokens:", round(mean(pro$thinking_tokens), 1), "\n")
## Mean thinking tokens: 1724.8
cat("Range:", min(pro$thinking_tokens), "-", max(pro$thinking_tokens), "\n")
## Range: 860 - 3312
cat("SD:", round(sd(pro$thinking_tokens), 1), "\n\n")
## SD: 775.6
cat("By Prompt Strategy:\n")
## By Prompt Strategy:
cat("  Direct:", round(mean(pro$thinking_tokens[pro$A == -1]), 1), "\n")
##   Direct: 1920.8
cat("  CoT:   ", round(mean(pro$thinking_tokens[pro$A ==  1]), 1), "\n\n")
##   CoT:    1528.7
cat("By Context:\n")
## By Context:
cat("  No Example:", round(mean(pro$thinking_tokens[pro$B == -1]), 1), "\n")
##   No Example: 1594.3
cat("  Example:   ", round(mean(pro$thinking_tokens[pro$B ==  1]), 1), "\n")
##   Example:    1855.2

11. Effect Confidence Intervals Plot

par(mar = c(5, 6, 3, 2))
n_effects <- length(effects)
y_pos <- n_effects:1

plot(effects_table$Estimate, y_pos,
     xlim = c(min(effects_table$CI_lower) - 0.1, max(effects_table$CI_upper) + 0.1),
     yaxt = "n", xlab = "Effect Estimate", ylab = "",
     main = "95% Confidence Intervals for Factorial Effects",
     pch = 19, cex = 1.3)
axis(2, at = y_pos, labels = effects_table$Effect, las = 1)
abline(v = 0, lty = 2, col = "gray50")

for (i in 1:n_effects) {
  col_i <- ifelse(effects_table$p_value[i] < 0.05, "#D94A4A", "gray50")
  segments(effects_table$CI_lower[i], y_pos[i],
           effects_table$CI_upper[i], y_pos[i],
           col = col_i, lwd = 2.5)
  points(effects_table$Estimate[i], y_pos[i], pch = 19, col = col_i, cex = 1.3)
}
legend("bottomright", legend = c("Significant (p<0.05)", "Not significant"),
       col = c("#D94A4A", "gray50"), lwd = 2.5, pch = 19, cex = 0.85)

12. Summary Statistics

cat("============================================================\n")
## ============================================================
cat("EXPERIMENT SUMMARY\n")
## EXPERIMENT SUMMARY
cat("============================================================\n\n")
## ============================================================
cat("Design: 2^3 replicated factorial (3 replicates, 24 runs)\n")
## Design: 2^3 replicated factorial (3 replicates, 24 runs)
cat("Models: gemini-2.5-pro (thinking) vs gemini-2.5-flash-lite (no thinking)\n\n")
## Models: gemini-2.5-pro (thinking) vs gemini-2.5-flash-lite (no thinking)
cat("Grand Mean:", round(grand_mean, 4), "\n")
## Grand Mean: 5.6667
cat("Pooled MSE:", round(MSE, 4), "\n")
## Pooled MSE: 0.0833
cat("df_error:", df_error, "\n")
## df_error: 16
cat("SE of effects:", round(SE_effect, 4), "\n\n")
## SE of effects: 0.1179
cat("Significant effects (p < 0.05):\n")
## Significant effects (p < 0.05):
sig <- effects_table[effects_table$p_value < 0.05, ]
if (nrow(sig) > 0) {
  for (i in 1:nrow(sig)) {
    cat(sprintf("  %s: %.4f (p = %.6f) [%.4f, %.4f]\n",
                sig$Effect[i], sig$Estimate[i], sig$p_value[i],
                sig$CI_lower[i], sig$CI_upper[i]))
  }
} else {
  cat("  None\n")
}
##   B: 0.3333 (p = 0.012109) [0.0835, 0.5832]
##   C: 0.6667 (p = 0.000036) [0.4168, 0.9165]
##   BC: -0.3333 (p = 0.012109) [-0.5832, -0.0835]
cat("\nNon-significant effects:\n")
## 
## Non-significant effects:
nsig <- effects_table[effects_table$p_value >= 0.05, ]
if (nrow(nsig) > 0) {
  for (i in 1:nrow(nsig)) {
    cat(sprintf("  %s: %.4f (p = %.6f)\n",
                nsig$Effect[i], nsig$Estimate[i], nsig$p_value[i]))
  }
}
##   A: 0.0000 (p = 1.000000)
##   AB: 0.0000 (p = 1.000000)
##   AC: 0.0000 (p = 1.000000)
##   ABC: 0.0000 (p = 1.000000)
cat("\n============================================================\n")
## 
## ============================================================
cat("KEY FINDINGS\n")
## KEY FINDINGS
cat("============================================================\n")
## ============================================================
cat("1. Pro model (thinking) achieved perfect score (6/6) on all 12 runs.\n")
## 1. Pro model (thinking) achieved perfect score (6/6) on all 12 runs.
cat("2. Flash-Lite model varied, with c3 (bullet count) as the sole failure.\n")
## 2. Flash-Lite model varied, with c3 (bullet count) as the sole failure.
cat("3. Within Flash-Lite, providing an example (B=+1) improved c3 pass rate.\n")
## 3. Within Flash-Lite, providing an example (B=+1) improved c3 pass rate.
cat("4. Prompt strategy (A) had no detectable effect on compliance.\n")
## 4. Prompt strategy (A) had no detectable effect on compliance.
cat("5. Ceiling effect in Pro group violates variance homogeneity assumption.\n")
## 5. Ceiling effect in Pro group violates variance homogeneity assumption.