1. Load and Validate Data
data <- read.csv("data.csv")
stopifnot(nrow(data) == 24)
stopifnot(all(data$y == rowSums(data[, paste0("c", 1:6)])))
# Factor labels for plotting
data$prompt_label <- factor(ifelse(data$A == -1, "Direct", "CoT"),
levels = c("Direct", "CoT"))
data$context_label <- factor(ifelse(data$B == -1, "No Example", "Example"),
levels = c("No Example", "Example"))
data$model_label <- factor(ifelse(data$C == -1, "Flash-Lite", "Pro"),
levels = c("Flash-Lite", "Pro"))
cat("Data loaded:", nrow(data), "runs\n")
## Data loaded: 24 runs
print(data[order(data$run_order),
c("run_order", "A", "B", "C", "replicate", "c1", "c2", "c3", "c4", "c5", "c6", "y")])
## run_order A B C replicate c1 c2 c3 c4 c5 c6 y
## 1 1 -1 1 -1 3 1 1 0 1 1 1 5
## 2 2 1 -1 -1 2 1 1 0 1 1 1 5
## 3 3 -1 1 1 3 1 1 1 1 1 1 6
## 4 4 1 1 -1 1 1 1 1 1 1 1 6
## 5 5 1 1 -1 3 1 1 0 1 1 1 5
## 6 6 -1 1 -1 2 1 1 1 1 1 1 6
## 7 7 1 -1 1 2 1 1 1 1 1 1 6
## 8 8 -1 -1 -1 1 1 1 0 1 1 1 5
## 9 9 1 1 -1 2 1 1 1 1 1 1 6
## 10 10 -1 -1 1 2 1 1 1 1 1 1 6
## 11 11 -1 -1 -1 2 1 1 0 1 1 1 5
## 12 12 -1 1 1 2 1 1 1 1 1 1 6
## 13 13 1 1 1 2 1 1 1 1 1 1 6
## 14 14 1 -1 1 1 1 1 1 1 1 1 6
## 15 15 -1 -1 1 3 1 1 1 1 1 1 6
## 16 16 1 1 1 1 1 1 1 1 1 1 6
## 17 17 -1 1 -1 1 1 1 1 1 1 1 6
## 18 18 -1 1 1 1 1 1 1 1 1 1 6
## 19 19 -1 -1 -1 3 1 1 0 1 1 1 5
## 20 20 -1 -1 1 1 1 1 1 1 1 1 6
## 21 21 1 1 1 3 1 1 1 1 1 1 6
## 22 22 1 -1 -1 3 1 1 0 1 1 1 5
## 23 23 1 -1 -1 1 1 1 0 1 1 1 5
## 24 24 1 -1 1 3 1 1 1 1 1 1 6
2. Treatment Combination Means
combo_means <- aggregate(y ~ A + B + C, data = data, FUN = mean)
combo_vars <- aggregate(y ~ A + B + C, data = data, FUN = var)
combo_summary <- merge(combo_means, combo_vars, by = c("A", "B", "C"),
suffixes = c("_mean", "_var"))
# Add readable labels
combo_summary$Prompt <- ifelse(combo_summary$A == -1, "Direct", "CoT")
combo_summary$Context <- ifelse(combo_summary$B == -1, "No Example", "Example")
combo_summary$Model <- ifelse(combo_summary$C == -1, "Flash-Lite", "Pro")
print(combo_summary[, c("Prompt", "Context", "Model", "y_mean", "y_var")])
## Prompt Context Model y_mean y_var
## 1 Direct No Example Flash-Lite 5.000000 0.0000000
## 2 Direct No Example Pro 6.000000 0.0000000
## 3 Direct Example Flash-Lite 5.666667 0.3333333
## 4 Direct Example Pro 6.000000 0.0000000
## 5 CoT No Example Flash-Lite 5.000000 0.0000000
## 6 CoT No Example Pro 6.000000 0.0000000
## 7 CoT Example Flash-Lite 5.666667 0.3333333
## 8 CoT Example Pro 6.000000 0.0000000
3. Factorial Effects (Manual Computation)
n_reps <- 3
k <- 3
N <- 2^k
# Grand mean
grand_mean <- mean(data$y)
# Main effects
eff_A <- mean(data$y[data$A == 1]) - mean(data$y[data$A == -1])
eff_B <- mean(data$y[data$B == 1]) - mean(data$y[data$B == -1])
eff_C <- mean(data$y[data$C == 1]) - mean(data$y[data$C == -1])
# Two-factor interactions
eff_AB <- 0.5 * (
(mean(data$y[data$A == 1 & data$B == 1]) + mean(data$y[data$A == -1 & data$B == -1])) -
(mean(data$y[data$A == 1 & data$B == -1]) + mean(data$y[data$A == -1 & data$B == 1]))
)
eff_AC <- 0.5 * (
(mean(data$y[data$A == 1 & data$C == 1]) + mean(data$y[data$A == -1 & data$C == -1])) -
(mean(data$y[data$A == 1 & data$C == -1]) + mean(data$y[data$A == -1 & data$C == 1]))
)
eff_BC <- 0.5 * (
(mean(data$y[data$B == 1 & data$C == 1]) + mean(data$y[data$B == -1 & data$C == -1])) -
(mean(data$y[data$B == 1 & data$C == -1]) + mean(data$y[data$B == -1 & data$C == 1]))
)
# Three-factor interaction
eff_ABC <- 0.5 * (
(mean(data$y[data$A == 1 & data$B == 1 & data$C == 1]) +
mean(data$y[data$A == 1 & data$B == -1 & data$C == -1]) +
mean(data$y[data$A == -1 & data$B == 1 & data$C == -1]) +
mean(data$y[data$A == -1 & data$B == -1 & data$C == 1])) -
(mean(data$y[data$A == -1 & data$B == -1 & data$C == -1]) +
mean(data$y[data$A == -1 & data$B == 1 & data$C == 1]) +
mean(data$y[data$A == 1 & data$B == -1 & data$C == 1]) +
mean(data$y[data$A == 1 & data$B == 1 & data$C == -1]))
)
effects <- c(A = eff_A, B = eff_B, C = eff_C,
AB = eff_AB, AC = eff_AC, BC = eff_BC, ABC = eff_ABC)
# Pooled error variance
SSE <- sum(aggregate(y ~ A + B + C, data = data,
FUN = function(x) sum((x - mean(x))^2))$y)
df_error <- N * (n_reps - 1) # 8 * 2 = 16
MSE <- SSE / df_error
# Standard error of effects
SE_effect <- sqrt(4 * MSE / (N * n_reps))
# t-statistics and p-values
t_stats <- effects / SE_effect
p_values <- 2 * pt(abs(t_stats), df = df_error, lower.tail = FALSE)
# 95% confidence intervals
t_crit <- qt(0.975, df = df_error)
ci_lower <- effects - t_crit * SE_effect
ci_upper <- effects + t_crit * SE_effect
effects_table <- data.frame(
Effect = names(effects),
Estimate = round(effects, 4),
SE = round(rep(SE_effect, length(effects)), 4),
t_stat = round(t_stats, 4),
p_value = round(p_values, 6),
CI_lower = round(ci_lower, 4),
CI_upper = round(ci_upper, 4),
Signif = ifelse(p_values < 0.001, "***",
ifelse(p_values < 0.01, "**",
ifelse(p_values < 0.05, "*", "")))
)
rownames(effects_table) <- NULL
cat("Grand Mean:", round(grand_mean, 4), "\n")
## Grand Mean: 5.6667
cat("Pooled MSE:", round(MSE, 4), " df_error:", df_error, "\n")
## Pooled MSE: 0.0833 df_error: 16
cat("SE of effects:", round(SE_effect, 4), "\n\n")
## SE of effects: 0.1179
print(effects_table)
## Effect Estimate SE t_stat p_value CI_lower CI_upper Signif
## 1 A 0.0000 0.1179 0.0000 1.000000 -0.2498 0.2498
## 2 B 0.3333 0.1179 2.8284 0.012109 0.0835 0.5832 *
## 3 C 0.6667 0.1179 5.6569 0.000036 0.4168 0.9165 ***
## 4 AB 0.0000 0.1179 0.0000 1.000000 -0.2498 0.2498
## 5 AC 0.0000 0.1179 0.0000 1.000000 -0.2498 0.2498
## 6 BC -0.3333 0.1179 -2.8284 0.012109 -0.5832 -0.0835 *
## 7 ABC 0.0000 0.1179 0.0000 1.000000 -0.2498 0.2498
4. ANOVA Table
fit <- lm(y ~ A * B * C, data = data)
cat("=== ANOVA Table ===\n")
## === ANOVA Table ===
print(anova(fit))
## Analysis of Variance Table
##
## Response: y
## Df Sum Sq Mean Sq F value Pr(>F)
## A 1 0.00000 0.00000 0 1.00000
## B 1 0.66667 0.66667 8 0.01211 *
## C 1 2.66667 2.66667 32 3.571e-05 ***
## A:B 1 0.00000 0.00000 0 1.00000
## A:C 1 0.00000 0.00000 0 1.00000
## B:C 1 0.66667 0.66667 8 0.01211 *
## A:B:C 1 0.00000 0.00000 0 1.00000
## Residuals 16 1.33333 0.08333
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
cat("\n=== Model Summary ===\n")
##
## === Model Summary ===
print(summary(fit))
##
## Call:
## lm(formula = y ~ A * B * C, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.6667 0.0000 0.0000 0.0000 0.3333
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.667e+00 5.893e-02 96.167 < 2e-16 ***
## A -4.221e-16 5.893e-02 0.000 1.0000
## B 1.667e-01 5.893e-02 2.828 0.0121 *
## C 3.333e-01 5.893e-02 5.657 3.57e-05 ***
## A:B -4.931e-16 5.893e-02 0.000 1.0000
## A:C 4.700e-16 5.893e-02 0.000 1.0000
## B:C -1.667e-01 5.893e-02 -2.828 0.0121 *
## A:B:C 3.985e-16 5.893e-02 0.000 1.0000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2887 on 16 degrees of freedom
## Multiple R-squared: 0.75, Adjusted R-squared: 0.6406
## F-statistic: 6.857 on 7 and 16 DF, p-value: 0.0007243
5. Main Effects Plot
par(mfrow = c(1, 3), mar = c(5, 4, 3, 1))
# A: Prompt Strategy
means_A <- tapply(data$y, data$prompt_label, mean)
barplot(means_A, main = "Factor A: Prompt Strategy",
ylab = "Mean Constraints Satisfied",
col = c("#4A90D9", "#D9764A"), border = NA,
ylim = c(0, 6.5))
text(x = c(0.7, 1.9), y = means_A + 0.15, labels = round(means_A, 2), cex = 0.9)
# B: Context
means_B <- tapply(data$y, data$context_label, mean)
barplot(means_B, main = "Factor B: Context",
ylab = "Mean Constraints Satisfied",
col = c("#4A90D9", "#D9764A"), border = NA,
ylim = c(0, 6.5))
text(x = c(0.7, 1.9), y = means_B + 0.15, labels = round(means_B, 2), cex = 0.9)
# C: Model
means_C <- tapply(data$y, data$model_label, mean)
barplot(means_C, main = "Factor C: Model",
ylab = "Mean Constraints Satisfied",
col = c("#4A90D9", "#D9764A"), border = NA,
ylim = c(0, 6.5))
text(x = c(0.7, 1.9), y = means_C + 0.15, labels = round(means_C, 2), cex = 0.9)

6. Interaction Plots
par(mfrow = c(1, 3), mar = c(5, 4, 3, 2))
# A x B
means_AB <- aggregate(y ~ prompt_label + context_label, data = data, FUN = mean)
interaction.plot(
x.factor = means_AB$prompt_label,
trace.factor = means_AB$context_label,
response = means_AB$y,
main = "A x B Interaction\n(Prompt x Context)",
xlab = "Prompt Strategy", ylab = "Mean Constraints Satisfied",
trace.label = "Context", col = c("#D94A4A", "#4A90D9"), lwd = 2,
ylim = c(4.5, 6.5), type = "b", pch = c(16, 17)
)
# A x C
means_AC <- aggregate(y ~ prompt_label + model_label, data = data, FUN = mean)
interaction.plot(
x.factor = means_AC$prompt_label,
trace.factor = means_AC$model_label,
response = means_AC$y,
main = "A x C Interaction\n(Prompt x Model)",
xlab = "Prompt Strategy", ylab = "Mean Constraints Satisfied",
trace.label = "Model", col = c("#D94A4A", "#4A90D9"), lwd = 2,
ylim = c(4.5, 6.5), type = "b", pch = c(16, 17)
)
# B x C
means_BC <- aggregate(y ~ context_label + model_label, data = data, FUN = mean)
interaction.plot(
x.factor = means_BC$context_label,
trace.factor = means_BC$model_label,
response = means_BC$y,
main = "B x C Interaction\n(Context x Model)",
xlab = "Context", ylab = "Mean Constraints Satisfied",
trace.label = "Model", col = c("#D94A4A", "#4A90D9"), lwd = 2,
ylim = c(4.5, 6.5), type = "b", pch = c(16, 17)
)

7. Diagnostic Plots
par(mfrow = c(1, 2), mar = c(5, 4, 3, 1))
# Residuals vs Fitted
plot(fitted(fit), resid(fit),
main = "Residuals vs Fitted Values",
xlab = "Fitted Values", ylab = "Residuals",
pch = 19, col = rgb(0, 0, 0, 0.6), cex = 1.2)
abline(h = 0, lty = 2, col = "red", lwd = 1.5)
# Normal Q-Q
qqnorm(resid(fit), main = "Normal Q-Q Plot of Residuals",
pch = 19, col = rgb(0, 0, 0, 0.6), cex = 1.2)
qqline(resid(fit), col = "red", lwd = 1.5)

cat("=== Diagnostic Notes ===\n")
## === Diagnostic Notes ===
cat("Unique fitted values:", length(unique(fitted(fit))), "\n")
## Unique fitted values: 6
cat("Unique residual values:", length(unique(round(resid(fit), 6))), "\n")
## Unique residual values: 3
cat("Residual range:", range(resid(fit)), "\n")
## Residual range: -0.6666667 0.3333333
cat("\nNote: The discrete nature of the response (only values 5 and 6 observed)\n")
##
## Note: The discrete nature of the response (only values 5 and 6 observed)
cat("and zero variance in 6 of 8 treatment groups creates non-standard residual\n")
## and zero variance in 6 of 8 treatment groups creates non-standard residual
cat("patterns. Homogeneity of variance assumption is violated.\n")
## patterns. Homogeneity of variance assumption is violated.
8. Per-Constraint Breakdown
cat("=== PER-CONSTRAINT PASS RATES ===\n\n")
## === PER-CONSTRAINT PASS RATES ===
constraint_names <- c(
c1 = "Word Ban", c2 = "Sentence Cap", c3 = "5 Bullets",
c4 = "Opening Word", c5 = "No Questions", c6 = "Sign-off"
)
pass_rates <- data.frame(
Constraint = character(),
Overall = numeric(),
Direct = numeric(), CoT = numeric(),
NoExample = numeric(), Example = numeric(),
FlashLite = numeric(), Pro = numeric(),
stringsAsFactors = FALSE
)
for (cname in paste0("c", 1:6)) {
row <- data.frame(
Constraint = constraint_names[cname],
Overall = round(mean(data[[cname]]), 3),
Direct = round(mean(data[[cname]][data$A == -1]), 3),
CoT = round(mean(data[[cname]][data$A == 1]), 3),
NoExample = round(mean(data[[cname]][data$B == -1]), 3),
Example = round(mean(data[[cname]][data$B == 1]), 3),
FlashLite = round(mean(data[[cname]][data$C == -1]), 3),
Pro = round(mean(data[[cname]][data$C == 1]), 3)
)
pass_rates <- rbind(pass_rates, row)
}
print(pass_rates)
## Constraint Overall Direct CoT NoExample Example FlashLite Pro
## c1 Word Ban 1.000 1.000 1.000 1.0 1.000 1.000 1
## c2 Sentence Cap 1.000 1.000 1.000 1.0 1.000 1.000 1
## c3 5 Bullets 0.667 0.667 0.667 0.5 0.833 0.333 1
## c4 Opening Word 1.000 1.000 1.000 1.0 1.000 1.000 1
## c5 No Questions 1.000 1.000 1.000 1.0 1.000 1.000 1
## c6 Sign-off 1.000 1.000 1.000 1.0 1.000 1.000 1
# Heatmap-style per-constraint pass rates by treatment combination
constraint_by_combo <- matrix(NA, nrow = 6, ncol = 8)
rownames(constraint_by_combo) <- names(constraint_names)
combos <- expand.grid(A = c(-1, 1), B = c(-1, 1), C = c(-1, 1))
combo_labels <- paste0(
ifelse(combos$A == -1, "Dir", "CoT"), "/",
ifelse(combos$B == -1, "NoEx", "Ex"), "/",
ifelse(combos$C == -1, "FL", "Pro")
)
colnames(constraint_by_combo) <- combo_labels
for (i in 1:8) {
sub <- data[data$A == combos$A[i] & data$B == combos$B[i] & data$C == combos$C[i], ]
for (j in 1:6) {
constraint_by_combo[j, i] <- mean(sub[[paste0("c", j)]])
}
}
par(mar = c(6, 8, 3, 2))
image(1:8, 1:6, t(constraint_by_combo), col = c("#D94A4A", "#FFD700", "#4AD97A"),
breaks = c(-0.01, 0.4, 0.8, 1.01),
xlab = "", ylab = "", axes = FALSE,
main = "Pass Rate by Constraint and Treatment Combination")
axis(1, at = 1:8, labels = combo_labels, las = 2, cex.axis = 0.75)
axis(2, at = 1:6, labels = unname(constraint_names), las = 1, cex.axis = 0.85)
# Add text values
for (i in 1:8) {
for (j in 1:6) {
text(i, j, round(constraint_by_combo[j, i], 2), cex = 0.8)
}
}

9. Detailed c3 (Bullet Count) Analysis
cat("=== c3 (Exactly 5 Bullets) — The Only Variable Constraint ===\n\n")
## === c3 (Exactly 5 Bullets) — The Only Variable Constraint ===
cat("Overall pass rate:", mean(data$c3), "\n\n")
## Overall pass rate: 0.6666667
cat("By Model:\n")
## By Model:
cat(" Flash-Lite:", mean(data$c3[data$C == -1]), "\n")
## Flash-Lite: 0.3333333
cat(" Pro: ", mean(data$c3[data$C == 1]), "\n\n")
## Pro: 1
cat("Within Flash-Lite, by Context:\n")
## Within Flash-Lite, by Context:
fl <- data[data$C == -1, ]
cat(" No Example:", mean(fl$c3[fl$B == -1]), "(", sum(fl$c3[fl$B == -1]), "/", sum(fl$B == -1), ")\n")
## No Example: 0 ( 0 / 6 )
cat(" Example: ", mean(fl$c3[fl$B == 1]), "(", sum(fl$c3[fl$B == 1]), "/", sum(fl$B == 1), ")\n\n")
## Example: 0.6666667 ( 4 / 6 )
cat("Within Flash-Lite, by Prompt:\n")
## Within Flash-Lite, by Prompt:
cat(" Direct:", mean(fl$c3[fl$A == -1]), "(", sum(fl$c3[fl$A == -1]), "/", sum(fl$A == -1), ")\n")
## Direct: 0.3333333 ( 2 / 6 )
cat(" CoT: ", mean(fl$c3[fl$A == 1]), "(", sum(fl$c3[fl$A == 1]), "/", sum(fl$A == 1), ")\n\n")
## CoT: 0.3333333 ( 2 / 6 )
cat("Within Flash-Lite, by A x B:\n")
## Within Flash-Lite, by A x B:
cat(" Direct/NoEx:", mean(fl$c3[fl$A == -1 & fl$B == -1]),"\n")
## Direct/NoEx: 0
cat(" Direct/Ex: ", mean(fl$c3[fl$A == -1 & fl$B == 1]),"\n")
## Direct/Ex: 0.6666667
cat(" CoT/NoEx: ", mean(fl$c3[fl$A == 1 & fl$B == -1]),"\n")
## CoT/NoEx: 0
cat(" CoT/Ex: ", mean(fl$c3[fl$A == 1 & fl$B == 1]),"\n")
## CoT/Ex: 0.6666667
10. Thinking Tokens Summary
cat("=== Thinking Tokens (Pro Model Only) ===\n\n")
## === Thinking Tokens (Pro Model Only) ===
pro <- data[data$C == 1, ]
cat("Mean thinking tokens:", round(mean(pro$thinking_tokens), 1), "\n")
## Mean thinking tokens: 1724.8
cat("Range:", min(pro$thinking_tokens), "-", max(pro$thinking_tokens), "\n")
## Range: 860 - 3312
cat("SD:", round(sd(pro$thinking_tokens), 1), "\n\n")
## SD: 775.6
cat("By Prompt Strategy:\n")
## By Prompt Strategy:
cat(" Direct:", round(mean(pro$thinking_tokens[pro$A == -1]), 1), "\n")
## Direct: 1920.8
cat(" CoT: ", round(mean(pro$thinking_tokens[pro$A == 1]), 1), "\n\n")
## CoT: 1528.7
cat("By Context:\n")
## By Context:
cat(" No Example:", round(mean(pro$thinking_tokens[pro$B == -1]), 1), "\n")
## No Example: 1594.3
cat(" Example: ", round(mean(pro$thinking_tokens[pro$B == 1]), 1), "\n")
## Example: 1855.2
11. Effect Confidence Intervals Plot
par(mar = c(5, 6, 3, 2))
n_effects <- length(effects)
y_pos <- n_effects:1
plot(effects_table$Estimate, y_pos,
xlim = c(min(effects_table$CI_lower) - 0.1, max(effects_table$CI_upper) + 0.1),
yaxt = "n", xlab = "Effect Estimate", ylab = "",
main = "95% Confidence Intervals for Factorial Effects",
pch = 19, cex = 1.3)
axis(2, at = y_pos, labels = effects_table$Effect, las = 1)
abline(v = 0, lty = 2, col = "gray50")
for (i in 1:n_effects) {
col_i <- ifelse(effects_table$p_value[i] < 0.05, "#D94A4A", "gray50")
segments(effects_table$CI_lower[i], y_pos[i],
effects_table$CI_upper[i], y_pos[i],
col = col_i, lwd = 2.5)
points(effects_table$Estimate[i], y_pos[i], pch = 19, col = col_i, cex = 1.3)
}
legend("bottomright", legend = c("Significant (p<0.05)", "Not significant"),
col = c("#D94A4A", "gray50"), lwd = 2.5, pch = 19, cex = 0.85)

12. Summary Statistics
cat("============================================================\n")
## ============================================================
cat("EXPERIMENT SUMMARY\n")
## EXPERIMENT SUMMARY
cat("============================================================\n\n")
## ============================================================
cat("Design: 2^3 replicated factorial (3 replicates, 24 runs)\n")
## Design: 2^3 replicated factorial (3 replicates, 24 runs)
cat("Models: gemini-2.5-pro (thinking) vs gemini-2.5-flash-lite (no thinking)\n\n")
## Models: gemini-2.5-pro (thinking) vs gemini-2.5-flash-lite (no thinking)
cat("Grand Mean:", round(grand_mean, 4), "\n")
## Grand Mean: 5.6667
cat("Pooled MSE:", round(MSE, 4), "\n")
## Pooled MSE: 0.0833
cat("df_error:", df_error, "\n")
## df_error: 16
cat("SE of effects:", round(SE_effect, 4), "\n\n")
## SE of effects: 0.1179
cat("Significant effects (p < 0.05):\n")
## Significant effects (p < 0.05):
sig <- effects_table[effects_table$p_value < 0.05, ]
if (nrow(sig) > 0) {
for (i in 1:nrow(sig)) {
cat(sprintf(" %s: %.4f (p = %.6f) [%.4f, %.4f]\n",
sig$Effect[i], sig$Estimate[i], sig$p_value[i],
sig$CI_lower[i], sig$CI_upper[i]))
}
} else {
cat(" None\n")
}
## B: 0.3333 (p = 0.012109) [0.0835, 0.5832]
## C: 0.6667 (p = 0.000036) [0.4168, 0.9165]
## BC: -0.3333 (p = 0.012109) [-0.5832, -0.0835]
cat("\nNon-significant effects:\n")
##
## Non-significant effects:
nsig <- effects_table[effects_table$p_value >= 0.05, ]
if (nrow(nsig) > 0) {
for (i in 1:nrow(nsig)) {
cat(sprintf(" %s: %.4f (p = %.6f)\n",
nsig$Effect[i], nsig$Estimate[i], nsig$p_value[i]))
}
}
## A: 0.0000 (p = 1.000000)
## AB: 0.0000 (p = 1.000000)
## AC: 0.0000 (p = 1.000000)
## ABC: 0.0000 (p = 1.000000)
cat("\n============================================================\n")
##
## ============================================================
cat("KEY FINDINGS\n")
## KEY FINDINGS
cat("============================================================\n")
## ============================================================
cat("1. Pro model (thinking) achieved perfect score (6/6) on all 12 runs.\n")
## 1. Pro model (thinking) achieved perfect score (6/6) on all 12 runs.
cat("2. Flash-Lite model varied, with c3 (bullet count) as the sole failure.\n")
## 2. Flash-Lite model varied, with c3 (bullet count) as the sole failure.
cat("3. Within Flash-Lite, providing an example (B=+1) improved c3 pass rate.\n")
## 3. Within Flash-Lite, providing an example (B=+1) improved c3 pass rate.
cat("4. Prompt strategy (A) had no detectable effect on compliance.\n")
## 4. Prompt strategy (A) had no detectable effect on compliance.
cat("5. Ceiling effect in Pro group violates variance homogeneity assumption.\n")
## 5. Ceiling effect in Pro group violates variance homogeneity assumption.