1. Load and Validate Data

# Read the experiment log: one row per run, with coded factors A, B, C
# (-1 / +1), six 0/1 constraint indicators c1..c6 and the total score y.
data <- read.csv("data.csv")

# Validate the structure before any analysis so that a malformed file fails
# loudly here rather than producing silently mislabelled results later.
stopifnot(
  nrow(data) == 24,
  all(c("A", "B", "C", "y", paste0("c", 1:6)) %in% names(data)),
  # Every factor code must be exactly -1 or +1 (NA or 0 would otherwise be
  # mislabelled as the high level).
  all(unlist(data[c("A", "B", "C")]) %in% c(-1, 1)),
  # Balanced design: each of the 8 treatment combinations appears 3 times.
  all(table(data$A, data$B, data$C) == 3),
  # The response is the number of constraints satisfied.
  all(data$y == rowSums(data[, paste0("c", 1:6)]))
)

# Factor labels for plotting (low level first so it is drawn first)
data$prompt_label  <- factor(data$A, levels = c(-1, 1),
                             labels = c("Direct", "CoT"))
data$context_label <- factor(data$B, levels = c(-1, 1),
                             labels = c("No Example", "Example"))
data$model_label   <- factor(data$C, levels = c(-1, 1),
                             labels = c("Flash-Lite", "Pro"))

cat("Data loaded:", nrow(data), "runs\n")

## Data loaded: 24 runs

# List every run in execution order, showing the design columns, the six
# constraint indicators and the total score.
shown_cols <- c("run_order", "A", "B", "C", "replicate", paste0("c", 1:6), "y")
by_run_order <- order(data$run_order)
print(data[by_run_order, shown_cols])

##    run_order  A  B  C replicate c1 c2 c3 c4 c5 c6 y
## 1          1 -1  1 -1         3  1  1  0  1  1  1 5
## 2          2  1 -1 -1         2  1  1  0  1  1  1 5
## 3          3 -1  1  1         3  1  1  1  1  1  1 6
## 4          4  1  1 -1         1  1  1  1  1  1  1 6
## 5          5  1  1 -1         3  1  1  0  1  1  1 5
## 6          6 -1  1 -1         2  1  1  1  1  1  1 6
## 7          7  1 -1  1         2  1  1  1  1  1  1 6
## 8          8 -1 -1 -1         1  1  1  0  1  1  1 5
## 9          9  1  1 -1         2  1  1  1  1  1  1 6
## 10        10 -1 -1  1         2  1  1  1  1  1  1 6
## 11        11 -1 -1 -1         2  1  1  0  1  1  1 5
## 12        12 -1  1  1         2  1  1  1  1  1  1 6
## 13        13  1  1  1         2  1  1  1  1  1  1 6
## 14        14  1 -1  1         1  1  1  1  1  1  1 6
## 15        15 -1 -1  1         3  1  1  1  1  1  1 6
## 16        16  1  1  1         1  1  1  1  1  1  1 6
## 17        17 -1  1 -1         1  1  1  1  1  1  1 6
## 18        18 -1  1  1         1  1  1  1  1  1  1 6
## 19        19 -1 -1 -1         3  1  1  0  1  1  1 5
## 20        20 -1 -1  1         1  1  1  1  1  1  1 6
## 21        21  1  1  1         3  1  1  1  1  1  1 6
## 22        22  1 -1 -1         3  1  1  0  1  1  1 5
## 23        23  1 -1 -1         1  1  1  0  1  1  1 5
## 24        24  1 -1  1         3  1  1  1  1  1  1 6

2. Treatment Combination Means

# Mean and sample variance of y within each of the A x B x C cells.
combo_means <- aggregate(y ~ A + B + C, data = data, FUN = mean)
combo_vars  <- aggregate(y ~ A + B + C, data = data, FUN = var)

# Join the two summaries on the design columns; the shared `y` column is
# disambiguated into y_mean / y_var by the suffixes.
design_keys <- c("A", "B", "C")
combo_summary <- merge(
  x = combo_means,
  y = combo_vars,
  by = design_keys,
  suffixes = c("_mean", "_var")
)

# Translate a -1 / +1 code vector into its readable level name.
level_name <- function(code, low, high) ifelse(code == -1, low, high)

combo_summary$Prompt  <- level_name(combo_summary$A, "Direct", "CoT")
combo_summary$Context <- level_name(combo_summary$B, "No Example", "Example")
combo_summary$Model   <- level_name(combo_summary$C, "Flash-Lite", "Pro")

report_cols <- c("Prompt", "Context", "Model", "y_mean", "y_var")
print(combo_summary[, report_cols])

##   Prompt    Context      Model   y_mean     y_var
## 1 Direct No Example Flash-Lite 5.000000 0.0000000
## 2 Direct No Example        Pro 6.000000 0.0000000
## 3 Direct    Example Flash-Lite 5.666667 0.3333333
## 4 Direct    Example        Pro 6.000000 0.0000000
## 5    CoT No Example Flash-Lite 5.000000 0.0000000
## 6    CoT No Example        Pro 6.000000 0.0000000
## 7    CoT    Example Flash-Lite 5.666667 0.3333333
## 8    CoT    Example        Pro 6.000000 0.0000000

3. Factorial Effects (Manual Computation)

# Design constants: k two-level factors give N = 2^k treatment combinations,
# each of which is run n_reps times.
n_reps <- 3
k <- 3
N <- 2^k

# Grand mean
grand_mean <- mean(data$y)

# Mean response over the runs selected by the logical mask `keep`.
mean_where <- function(keep) mean(data$y[keep])

# Main effects: mean response at the high (+1) level minus the mean response
# at the low (-1) level.
eff_A <- mean_where(data$A == 1) - mean_where(data$A == -1)
eff_B <- mean_where(data$B == 1) - mean_where(data$B == -1)
eff_C <- mean_where(data$C == 1) - mean_where(data$C == -1)

# Two-factor interactions: the average of the two means whose sign product is
# +1 minus the average of the two whose sign product is -1. Two means sit on
# each side of the contrast, hence the factor 1/2.
eff_AB <- 0.5 * (
  (mean_where(data$A ==  1 & data$B ==  1) + mean_where(data$A == -1 & data$B == -1)) -
  (mean_where(data$A ==  1 & data$B == -1) + mean_where(data$A == -1 & data$B ==  1))
)
eff_AC <- 0.5 * (
  (mean_where(data$A ==  1 & data$C ==  1) + mean_where(data$A == -1 & data$C == -1)) -
  (mean_where(data$A ==  1 & data$C == -1) + mean_where(data$A == -1 & data$C ==  1))
)
eff_BC <- 0.5 * (
  (mean_where(data$B ==  1 & data$C ==  1) + mean_where(data$B == -1 & data$C == -1)) -
  (mean_where(data$B ==  1 & data$C == -1) + mean_where(data$B == -1 & data$C ==  1))
)

# Three-factor interaction: the average of the four cell means with
# A * B * C = +1 minus the average of the four with A * B * C = -1. Four cell
# means sit on each side, so the factor is 1/4; using 1/2 here would report
# the effect at twice its size, on a different scale from the other effects
# and from the shared standard error below.
eff_ABC <- 0.25 * (
  (mean_where(data$A ==  1 & data$B ==  1 & data$C ==  1) +
   mean_where(data$A ==  1 & data$B == -1 & data$C == -1) +
   mean_where(data$A == -1 & data$B ==  1 & data$C == -1) +
   mean_where(data$A == -1 & data$B == -1 & data$C ==  1)) -
  (mean_where(data$A == -1 & data$B == -1 & data$C == -1) +
   mean_where(data$A == -1 & data$B ==  1 & data$C ==  1) +
   mean_where(data$A ==  1 & data$B == -1 & data$C ==  1) +
   mean_where(data$A ==  1 & data$B ==  1 & data$C == -1))
)

effects <- c(A = eff_A, B = eff_B, C = eff_C,
             AB = eff_AB, AC = eff_AC, BC = eff_BC, ABC = eff_ABC)

# Pooled error variance: within-cell sums of squares added over all cells,
# divided by the pooled within-cell degrees of freedom.
SSE <- sum(aggregate(y ~ A + B + C, data = data,
                     FUN = function(x) sum((x - mean(x))^2))$y)
df_error <- N * (n_reps - 1)  # 8 * 2 = 16
MSE <- SSE / df_error

# Standard error of effects. Each effect is a difference between two averages
# of N * n_reps / 2 runs, giving Var(effect) = 4 * sigma^2 / (N * n_reps).
# NOTE: this (and the effect formulas above) assumes a balanced design.
SE_effect <- sqrt(4 * MSE / (N * n_reps))

# t-statistics and two-sided p-values on the pooled error df
t_stats  <- effects / SE_effect
p_values <- 2 * pt(abs(t_stats), df = df_error, lower.tail = FALSE)

# 95% confidence intervals
t_crit   <- qt(0.975, df = df_error)
ci_lower <- effects - t_crit * SE_effect
ci_upper <- effects + t_crit * SE_effect

# One row per effect; the significance stars use the unrounded p-values.
effects_table <- data.frame(
  Effect   = names(effects),
  Estimate = round(effects, 4),
  SE       = round(rep(SE_effect, length(effects)), 4),
  t_stat   = round(t_stats, 4),
  p_value  = round(p_values, 6),
  CI_lower = round(ci_lower, 4),
  CI_upper = round(ci_upper, 4),
  Signif   = ifelse(p_values < 0.001, "***",
             ifelse(p_values < 0.01,  "**",
             ifelse(p_values < 0.05,  "*", "")))
)
rownames(effects_table) <- NULL

cat("Grand Mean:", round(grand_mean, 4), "\n")

## Grand Mean: 5.6667

cat("Pooled MSE:", round(MSE, 4), "  df_error:", df_error, "\n")

## Pooled MSE: 0.0833   df_error: 16

cat("SE of effects:", round(SE_effect, 4), "\n\n")

## SE of effects: 0.1179

print(effects_table)

##   Effect Estimate     SE  t_stat  p_value CI_lower CI_upper Signif
## 1      A   0.0000 0.1179  0.0000 1.000000  -0.2498   0.2498       
## 2      B   0.3333 0.1179  2.8284 0.012109   0.0835   0.5832      *
## 3      C   0.6667 0.1179  5.6569 0.000036   0.4168   0.9165    ***
## 4     AB   0.0000 0.1179  0.0000 1.000000  -0.2498   0.2498       
## 5     AC   0.0000 0.1179  0.0000 1.000000  -0.2498   0.2498       
## 6     BC  -0.3333 0.1179 -2.8284 0.012109  -0.5832  -0.0835      *
## 7    ABC   0.0000 0.1179  0.0000 1.000000  -0.2498   0.2498

4. ANOVA Table

# Fit the full factorial linear model: `A * B * C` expands to all three main
# effects, all two-factor interactions and the three-factor interaction.
# The factors enter as their numeric -1 / +1 codes, so each coefficient is
# half of the corresponding factorial effect (compare the summary output
# with the manual effects table).
fit <- lm(y ~ A * B * C, data = data)
cat("=== ANOVA Table ===\n")

## === ANOVA Table ===

# One single-df row per model term plus the residual row.
print(anova(fit))

## Analysis of Variance Table
## 
## Response: y
##           Df  Sum Sq Mean Sq F value    Pr(>F)    
## A          1 0.00000 0.00000       0   1.00000    
## B          1 0.66667 0.66667       8   0.01211 *  
## C          1 2.66667 2.66667      32 3.571e-05 ***
## A:B        1 0.00000 0.00000       0   1.00000    
## A:C        1 0.00000 0.00000       0   1.00000    
## B:C        1 0.66667 0.66667       8   0.01211 *  
## A:B:C      1 0.00000 0.00000       0   1.00000    
## Residuals 16 1.33333 0.08333                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# Coefficient table, residual standard error and R-squared for the fitted
# factorial model `fit`.
cat("\n=== Model Summary ===\n")

## 
## === Model Summary ===

print(summary(fit))

## 
## Call:
## lm(formula = y ~ A * B * C, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.6667  0.0000  0.0000  0.0000  0.3333 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.667e+00  5.893e-02  96.167  < 2e-16 ***
## A           -4.221e-16  5.893e-02   0.000   1.0000    
## B            1.667e-01  5.893e-02   2.828   0.0121 *  
## C            3.333e-01  5.893e-02   5.657 3.57e-05 ***
## A:B         -4.931e-16  5.893e-02   0.000   1.0000    
## A:C          4.700e-16  5.893e-02   0.000   1.0000    
## B:C         -1.667e-01  5.893e-02  -2.828   0.0121 *  
## A:B:C        3.985e-16  5.893e-02   0.000   1.0000    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2887 on 16 degrees of freedom
## Multiple R-squared:   0.75,  Adjusted R-squared:  0.6406 
## F-statistic: 6.857 on 7 and 16 DF,  p-value: 0.0007243

5. Main Effects Plot

# Three side-by-side panels, one per factor.
par(mfrow = c(1, 3), mar = c(5, 4, 3, 1))

# Draw one main-effect panel: a bar per factor level with the rounded mean
# printed just above it.
#   level_means: named vector of mean responses, one entry per level
#   title:       panel title
# Returns the bar midpoints invisibly.
plot_main_effect <- function(level_means, title) {
  # barplot() returns the x-coordinates of the bar centres; using them keeps
  # the labels aligned with the bars instead of relying on hard-coded
  # positions that only match barplot's default width and spacing.
  bar_mid <- barplot(level_means, main = title,
                     ylab = "Mean Constraints Satisfied",
                     col = c("#4A90D9", "#D9764A"), border = NA,
                     ylim = c(0, 6.5))
  text(x = bar_mid, y = level_means + 0.15,
       labels = round(level_means, 2), cex = 0.9)
  invisible(bar_mid)
}

# A: Prompt Strategy
means_A <- tapply(data$y, data$prompt_label, mean)
plot_main_effect(means_A, "Factor A: Prompt Strategy")

# B: Context
means_B <- tapply(data$y, data$context_label, mean)
plot_main_effect(means_B, "Factor B: Context")

# C: Model
means_C <- tapply(data$y, data$model_label, mean)
plot_main_effect(means_C, "Factor C: Model")

6. Interaction Plots

# Three side-by-side panels, one per pair of factors.
par(mfrow = c(1, 3), mar = c(5, 4, 3, 2))

# Draw one two-factor interaction panel from a table of cell means.
#   x_levels / trace_levels: the factors on the x-axis and in the legend
#   cell_means:              mean response for each level combination
#   title, x_title, trace_title: panel title, x-axis label, legend title
draw_interaction <- function(x_levels, trace_levels, cell_means,
                             title, x_title, trace_title) {
  interaction.plot(
    x.factor = x_levels,
    trace.factor = trace_levels,
    response = cell_means,
    main = title,
    xlab = x_title, ylab = "Mean Constraints Satisfied",
    trace.label = trace_title, col = c("#D94A4A", "#4A90D9"), lwd = 2,
    ylim = c(4.5, 6.5), type = "b", pch = c(16, 17)
  )
}

# A x B
means_AB <- aggregate(y ~ prompt_label + context_label, data = data, FUN = mean)
with(means_AB, draw_interaction(
  prompt_label, context_label, y,
  title = "A x B Interaction\n(Prompt x Context)",
  x_title = "Prompt Strategy", trace_title = "Context"
))

# A x C
means_AC <- aggregate(y ~ prompt_label + model_label, data = data, FUN = mean)
with(means_AC, draw_interaction(
  prompt_label, model_label, y,
  title = "A x C Interaction\n(Prompt x Model)",
  x_title = "Prompt Strategy", trace_title = "Model"
))

# B x C
means_BC <- aggregate(y ~ context_label + model_label, data = data, FUN = mean)
with(means_BC, draw_interaction(
  context_label, model_label, y,
  title = "B x C Interaction\n(Context x Model)",
  x_title = "Context", trace_title = "Model"
))

7. Diagnostic Plots

# Two panels: residuals against fitted values, and a normal Q-Q plot.
par(mfrow = c(1, 2), mar = c(5, 4, 3, 1))

model_resid <- resid(fit)
dot_col <- rgb(0, 0, 0, 0.6)  # semi-transparent black so overlaps show

# Residuals vs Fitted, with a dashed zero reference line
plot(
  x = fitted(fit), y = model_resid,
  main = "Residuals vs Fitted Values",
  xlab = "Fitted Values", ylab = "Residuals",
  pch = 19, col = dot_col, cex = 1.2
)
abline(h = 0, lty = 2, col = "red", lwd = 1.5)

# Normal Q-Q, with the reference line through the quartiles
qqnorm(
  model_resid,
  main = "Normal Q-Q Plot of Residuals",
  pch = 19, col = dot_col, cex = 1.2
)
qqline(model_resid, col = "red", lwd = 1.5)

cat("=== Diagnostic Notes ===\n")

## === Diagnostic Notes ===

# Count distinct fitted values after rounding, exactly as is done for the
# residuals below. The fitted values of this model are the cell means, which
# take only three distinct values here (5, 5.667, 6); without rounding,
# floating-point noise in the least-squares fit makes equal cell means
# compare as different and inflates the count.
cat("Unique fitted values:", length(unique(round(fitted(fit), 6))), "\n")

## Unique fitted values: 3

cat("Unique residual values:", length(unique(round(resid(fit), 6))), "\n")

## Unique residual values: 3

# Smallest and largest residual.
cat("Residual range:", range(resid(fit)), "\n")

## Residual range: -0.6666667 0.3333333

# Fixed interpretive note, printed as three consecutive lines. The counts it
# quotes (responses of 5 and 6 only; zero variance in 6 of 8 cells) are
# hard-coded text matching the run listing and the treatment-combination
# variance table; they are not recomputed here.
cat("\nNote: The discrete nature of the response (only values 5 and 6 observed)\n")

## 
## Note: The discrete nature of the response (only values 5 and 6 observed)

cat("and zero variance in 6 of 8 treatment groups creates non-standard residual\n")

## and zero variance in 6 of 8 treatment groups creates non-standard residual

cat("patterns. Homogeneity of variance assumption is violated.\n")

## patterns. Homogeneity of variance assumption is violated.

8. Per-Constraint Breakdown

cat("=== PER-CONSTRAINT PASS RATES ===\n\n")

## === PER-CONSTRAINT PASS RATES ===

# Readable name for each constraint indicator column; the vector names are
# the column names in `data`.
constraint_names <- c(
  c1 = "Word Ban", c2 = "Sentence Cap", c3 = "5 Bullets",
  c4 = "Opening Word", c5 = "No Questions", c6 = "Sign-off"
)

# Pass rate of every constraint over the runs selected by the logical mask
# `keep`, rounded to 3 decimals. The indicators are 0/1, so the column mean
# is the proportion of runs that passed.
pass_rate <- function(keep) {
  round(colMeans(data[keep, names(constraint_names), drop = FALSE]), 3)
}

# One row per constraint, built in a single vectorised step rather than by
# growing a data frame with rbind() inside a loop. Row names (c1..c6) come
# from the names of `constraint_names`.
pass_rates <- data.frame(
  Constraint = constraint_names,
  Overall    = pass_rate(rep(TRUE, nrow(data))),
  Direct     = pass_rate(data$A == -1),
  CoT        = pass_rate(data$A ==  1),
  NoExample  = pass_rate(data$B == -1),
  Example    = pass_rate(data$B ==  1),
  FlashLite  = pass_rate(data$C == -1),
  Pro        = pass_rate(data$C ==  1),
  stringsAsFactors = FALSE
)
print(pass_rates)

##      Constraint Overall Direct   CoT NoExample Example FlashLite Pro
## c1     Word Ban   1.000  1.000 1.000       1.0   1.000     1.000   1
## c2 Sentence Cap   1.000  1.000 1.000       1.0   1.000     1.000   1
## c3    5 Bullets   0.667  0.667 0.667       0.5   0.833     0.333   1
## c4 Opening Word   1.000  1.000 1.000       1.0   1.000     1.000   1
## c5 No Questions   1.000  1.000 1.000       1.0   1.000     1.000   1
## c6     Sign-off   1.000  1.000 1.000       1.0   1.000     1.000   1

# Heatmap of per-constraint pass rates: one row per constraint, one column
# per treatment combination.
combos <- expand.grid(A = c(-1, 1), B = c(-1, 1), C = c(-1, 1))
combo_labels <- paste(
  ifelse(combos$A == -1, "Dir", "CoT"),
  ifelse(combos$B == -1, "NoEx", "Ex"),
  ifelse(combos$C == -1, "FL", "Pro"),
  sep = "/"
)

# Column i holds the pass rate of every constraint within combination i.
heat_rows <- names(constraint_names)
constraint_by_combo <- vapply(
  seq_len(nrow(combos)),
  function(cell) {
    in_cell <- data$A == combos$A[cell] &
      data$B == combos$B[cell] &
      data$C == combos$C[cell]
    colMeans(data[in_cell, heat_rows, drop = FALSE])
  },
  numeric(length(heat_rows))
)
dimnames(constraint_by_combo) <- list(heat_rows, combo_labels)

# Three colour bands: below 0.4 red, 0.4-0.8 yellow, above 0.8 green.
par(mar = c(6, 8, 3, 2))
n_combo <- ncol(constraint_by_combo)
n_constraint <- nrow(constraint_by_combo)
image(seq_len(n_combo), seq_len(n_constraint), t(constraint_by_combo),
      col = c("#D94A4A", "#FFD700", "#4AD97A"),
      breaks = c(-0.01, 0.4, 0.8, 1.01),
      xlab = "", ylab = "", axes = FALSE,
      main = "Pass Rate by Constraint and Treatment Combination")
axis(1, at = seq_len(n_combo), labels = combo_labels, las = 2, cex.axis = 0.75)
axis(2, at = seq_len(n_constraint), labels = unname(constraint_names),
     las = 1, cex.axis = 0.85)

# Print the rounded pass rate in the centre of every cell.
text(x = col(constraint_by_combo), y = row(constraint_by_combo),
     labels = round(constraint_by_combo, 2), cex = 0.8)

9. Detailed c3 (Bullet Count) Analysis

# Drill-down on constraint c3, the only constraint whose pass rate is below 1
# in the per-constraint table. c3 is a 0/1 indicator in the run listing, so
# mean() gives the pass rate and sum() the number of passing runs.
cat("=== c3 (Exactly 5 Bullets) — The Only Variable Constraint ===\n\n")

## === c3 (Exactly 5 Bullets) — The Only Variable Constraint ===

cat("Overall pass rate:", mean(data$c3), "\n\n")

## Overall pass rate: 0.6666667

# Split by model (factor C: -1 = Flash-Lite, +1 = Pro).
cat("By Model:\n")

## By Model:

cat("  Flash-Lite:", mean(data$c3[data$C == -1]), "\n")

##   Flash-Lite: 0.3333333

cat("  Pro:       ", mean(data$c3[data$C ==  1]), "\n\n")

##   Pro:        1

cat("Within Flash-Lite, by Context:\n")

## Within Flash-Lite, by Context:

# Everything below is restricted to the Flash-Lite runs (C == -1).
fl <- data[data$C == -1, ]
# Each line prints: pass rate ( passes / runs in that subgroup ).
cat("  No Example:", mean(fl$c3[fl$B == -1]), "(", sum(fl$c3[fl$B == -1]), "/", sum(fl$B == -1), ")\n")

##   No Example: 0 ( 0 / 6 )

cat("  Example:   ", mean(fl$c3[fl$B ==  1]), "(", sum(fl$c3[fl$B ==  1]), "/", sum(fl$B ==  1), ")\n\n")

##   Example:    0.6666667 ( 4 / 6 )

cat("Within Flash-Lite, by Prompt:\n")

## Within Flash-Lite, by Prompt:

cat("  Direct:", mean(fl$c3[fl$A == -1]), "(", sum(fl$c3[fl$A == -1]), "/", sum(fl$A == -1), ")\n")

##   Direct: 0.3333333 ( 2 / 6 )

cat("  CoT:   ", mean(fl$c3[fl$A ==  1]), "(", sum(fl$c3[fl$A ==  1]), "/", sum(fl$A ==  1), ")\n\n")

##   CoT:    0.3333333 ( 2 / 6 )

# Pass rate in each prompt x context cell of the Flash-Lite runs.
cat("Within Flash-Lite, by A x B:\n")

## Within Flash-Lite, by A x B:

cat("  Direct/NoEx:", mean(fl$c3[fl$A == -1 & fl$B == -1]),"\n")

##   Direct/NoEx: 0

cat("  Direct/Ex:  ", mean(fl$c3[fl$A == -1 & fl$B ==  1]),"\n")

##   Direct/Ex:   0.6666667

cat("  CoT/NoEx:   ", mean(fl$c3[fl$A ==  1 & fl$B == -1]),"\n")

##   CoT/NoEx:    0

cat("  CoT/Ex:     ", mean(fl$c3[fl$A ==  1 & fl$B ==  1]),"\n")

##   CoT/Ex:      0.6666667

10. Thinking Tokens Summary

# Summary of the `thinking_tokens` column over the Pro runs only.
# NOTE(review): the column's contents for Flash-Lite runs are not shown in
# this file; presumably they are empty or zero because that model does not
# think. Confirm against data.csv.
cat("=== Thinking Tokens (Pro Model Only) ===\n\n")

## === Thinking Tokens (Pro Model Only) ===

# Keep only the runs with C == +1 (Pro).
pro <- data[data$C == 1, ]
cat("Mean thinking tokens:", round(mean(pro$thinking_tokens), 1), "\n")

## Mean thinking tokens: 1724.8

cat("Range:", min(pro$thinking_tokens), "-", max(pro$thinking_tokens), "\n")

## Range: 860 - 3312

cat("SD:", round(sd(pro$thinking_tokens), 1), "\n\n")

## SD: 775.6

# Mean token count at each level of factor A (prompt strategy).
cat("By Prompt Strategy:\n")

## By Prompt Strategy:

cat("  Direct:", round(mean(pro$thinking_tokens[pro$A == -1]), 1), "\n")

##   Direct: 1920.8

cat("  CoT:   ", round(mean(pro$thinking_tokens[pro$A ==  1]), 1), "\n\n")

##   CoT:    1528.7

# Mean token count at each level of factor B (context).
cat("By Context:\n")

## By Context:

cat("  No Example:", round(mean(pro$thinking_tokens[pro$B == -1]), 1), "\n")

##   No Example: 1594.3

cat("  Example:   ", round(mean(pro$thinking_tokens[pro$B ==  1]), 1), "\n")

##   Example:    1855.2

11. Effect Confidence Intervals Plot

# Forest-style plot: one horizontal 95% interval per factorial effect, with
# the first effect in the table at the top.
par(mar = c(5, 6, 3, 2))
n_effects <- length(effects)
y_pos <- rev(seq_len(n_effects))

# Red for effects with p < 0.05, grey otherwise. Computed once for all
# effects instead of calling ifelse() on a scalar inside a loop.
effect_col <- ifelse(effects_table$p_value < 0.05, "#D94A4A", "gray50")

plot(effects_table$Estimate, y_pos,
     xlim = c(min(effects_table$CI_lower) - 0.1, max(effects_table$CI_upper) + 0.1),
     yaxt = "n", xlab = "Effect Estimate", ylab = "",
     main = "95% Confidence Intervals for Factorial Effects",
     pch = 19, cex = 1.3)
axis(2, at = y_pos, labels = effects_table$Effect, las = 1)
abline(v = 0, lty = 2, col = "gray50")  # zero-effect reference line

# segments() and points() are vectorised, so every interval and its point
# estimate are drawn in one call each; points go last so they sit on top.
segments(effects_table$CI_lower, y_pos,
         effects_table$CI_upper, y_pos,
         col = effect_col, lwd = 2.5)
points(effects_table$Estimate, y_pos, pch = 19, col = effect_col, cex = 1.3)
legend("bottomright", legend = c("Significant (p<0.05)", "Not significant"),
       col = c("#D94A4A", "gray50"), lwd = 2.5, pch = 19, cex = 0.85)

12. Summary Statistics

# Closing summary. The design and model lines are fixed text; the numeric
# lines re-print grand_mean, MSE, df_error and SE_effect as computed in the
# factorial-effects section.
cat("============================================================\n")

## ============================================================

cat("EXPERIMENT SUMMARY\n")

## EXPERIMENT SUMMARY

cat("============================================================\n\n")

## ============================================================

cat("Design: 2^3 replicated factorial (3 replicates, 24 runs)\n")

## Design: 2^3 replicated factorial (3 replicates, 24 runs)

cat("Models: gemini-2.5-pro (thinking) vs gemini-2.5-flash-lite (no thinking)\n\n")

## Models: gemini-2.5-pro (thinking) vs gemini-2.5-flash-lite (no thinking)

cat("Grand Mean:", round(grand_mean, 4), "\n")

## Grand Mean: 5.6667

cat("Pooled MSE:", round(MSE, 4), "\n")

## Pooled MSE: 0.0833

cat("df_error:", df_error, "\n")

## df_error: 16

cat("SE of effects:", round(SE_effect, 4), "\n\n")

## SE of effects: 0.1179

# Heading for the list of significant effects that is printed next.
cat("Significant effects (p < 0.05):\n")

## Significant effects (p < 0.05):

# Effects whose tabulated p-value is below 0.05: one line each giving the
# estimate, p-value and 95% interval, or "None" when there are no such rows.
sig <- effects_table[effects_table$p_value < 0.05, ]
if (nrow(sig) == 0) {
  cat("  None\n")
} else {
  # sprintf() is vectorised over the rows; each element already ends in "\n".
  sig_lines <- sprintf("  %s: %.4f (p = %.6f) [%.4f, %.4f]\n",
                       sig$Effect, sig$Estimate, sig$p_value,
                       sig$CI_lower, sig$CI_upper)
  cat(sig_lines, sep = "")
}

##   B: 0.3333 (p = 0.012109) [0.0835, 0.5832]
##   C: 0.6667 (p = 0.000036) [0.4168, 0.9165]
##   BC: -0.3333 (p = 0.012109) [-0.5832, -0.0835]

cat("\nNon-significant effects:\n")

## 
## Non-significant effects:

# Effects whose tabulated p-value is 0.05 or larger: one line each giving the
# estimate and p-value. When every effect is significant, print "None" so the
# heading is never left with an empty body (same convention as the
# significant-effects list).
nsig <- effects_table[effects_table$p_value >= 0.05, ]
if (nrow(nsig) > 0) {
  # sprintf() is vectorised over the rows; each element already ends in "\n".
  cat(sprintf("  %s: %.4f (p = %.6f)\n",
              nsig$Effect, nsig$Estimate, nsig$p_value),
      sep = "")
} else {
  cat("  None\n")
}

##   A: 0.0000 (p = 1.000000)
##   AB: 0.0000 (p = 1.000000)
##   AC: 0.0000 (p = 1.000000)
##   ABC: 0.0000 (p = 1.000000)

# Key findings. Every line below is fixed text: the counts and conclusions
# are not recomputed from `data`, so they must be revised by hand if the
# data file changes.
cat("\n============================================================\n")

## 
## ============================================================

cat("KEY FINDINGS\n")

## KEY FINDINGS

cat("============================================================\n")

## ============================================================

cat("1. Pro model (thinking) achieved perfect score (6/6) on all 12 runs.\n")

## 1. Pro model (thinking) achieved perfect score (6/6) on all 12 runs.

cat("2. Flash-Lite model varied, with c3 (bullet count) as the sole failure.\n")

## 2. Flash-Lite model varied, with c3 (bullet count) as the sole failure.

cat("3. Within Flash-Lite, providing an example (B=+1) improved c3 pass rate.\n")

## 3. Within Flash-Lite, providing an example (B=+1) improved c3 pass rate.

cat("4. Prompt strategy (A) had no detectable effect on compliance.\n")

## 4. Prompt strategy (A) had no detectable effect on compliance.

cat("5. Ceiling effect in Pro group violates variance homogeneity assumption.\n")

## 5. Ceiling effect in Pro group violates variance homogeneity assumption.

STA305 Assignment 2 — Analysis

LLM Instruction-Following Compliance: 2³ Replicated Factorial Design

1. Load and Validate Data

2. Treatment Combination Means

3. Factorial Effects (Manual Computation)

4. ANOVA Table

5. Main Effects Plot

6. Interaction Plots

7. Diagnostic Plots

8. Per-Constraint Breakdown

9. Detailed c3 (Bullet Count) Analysis

10. Thinking Tokens Summary

11. Effect Confidence Intervals Plot

12. Summary Statistics