# ELIA: user_study/scripts/perform_statistical_tests.py
import os

import pandas as pd
from scipy.stats import kruskal, mannwhitneyu, spearmanr
def load_and_preprocess_data(filepath='user_study/data/user_data.csv'):
    """Load and preprocess the user study data."""
    if not os.path.exists(filepath):
        raise FileNotFoundError(f"The data file was not found at {filepath}")
    df = pd.read_csv(filepath)
    # Reverse-code cognitive load (5-point scale) into an ease-of-use score.
    if 'attr_q_cognitive_load' in df.columns:
        df['attr_q_ease_of_use'] = 6 - df['attr_q_cognitive_load']
    return df
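
# A minimal sketch (not part of the original script) of a synthetic frame for
# smoke-testing the pipeline without the real CSV. Column names mirror the ones
# the tests below read; values are made up, and 'q1_correct' is a hypothetical
# correctness column standing in for whatever the real data uses.
def make_synthetic_data(n=12, seed=0):
    """Build a small random DataFrame with the columns the tests below expect."""
    import numpy as np
    rng = np.random.default_rng(seed)
    rating_cols = [
        'attr_q_visual_clarity', 'attr_q_cognitive_load', 'attr_q_influencer_plausibility',
        'fv_q_pca_clarity', 'fv_q_type_attribution_clarity', 'fv_q_layer_evolution_plausibility',
        'ct_q_main_graph_clarity', 'ct_q_feature_explorer_usefulness', 'ct_q_subnetwork_clarity',
    ]
    df = pd.DataFrame(rng.integers(1, 6, size=(n, len(rating_cols))), columns=rating_cols)
    df['attr_q_ease_of_use'] = 6 - df['attr_q_cognitive_load']
    df['language'] = rng.choice(['en', 'de'], size=n)
    df['llm_experience'] = rng.choice(['novice', 'intermediate', 'expert'], size=n)
    df['q1_correct'] = rng.integers(0, 2, size=n)  # hypothetical correctness column
    return df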
def run_ux_ratings_test(df):
    """Compare per-participant mean UX ratings across the three pages."""
    print("\n--- 1. UX Ratings Comparison Across Pages ---")
    # Average each participant's ratings within a page to get one score per page.
    page_ratings = {
        'Attribution': df[['attr_q_visual_clarity', 'attr_q_ease_of_use', 'attr_q_influencer_plausibility']].mean(axis=1),
        'Function Vectors': df[['fv_q_pca_clarity', 'fv_q_type_attribution_clarity', 'fv_q_layer_evolution_plausibility']].mean(axis=1),
        'Circuit Trace': df[['ct_q_main_graph_clarity', 'ct_q_feature_explorer_usefulness', 'ct_q_subnetwork_clarity']].mean(axis=1),
    }
    attr_scores = page_ratings['Attribution'].dropna()
    fv_scores = page_ratings['Function Vectors'].dropna()
    ct_scores = page_ratings['Circuit Trace'].dropna()
    if len(attr_scores) > 0 and len(fv_scores) > 0 and len(ct_scores) > 0:
        stat, p = kruskal(attr_scores, fv_scores, ct_scores)
        print("Kruskal-Wallis test for overall UX ratings across the three pages:")
        print(f"H-statistic: {stat:.4f}, p-value: {p:.4f}")
        if p < 0.05:
            print("Result: There is a statistically significant difference in UX ratings between the pages.")
        else:
            print("Result: There is no statistically significant difference in UX ratings between the pages.")
    else:
        print("Could not perform Kruskal-Wallis test due to insufficient data.")
def run_language_comparison_test(df):
    """Compare Attribution-page ease-of-use ratings between English and German speakers."""
    print("\n--- 2. Language Comparison for Attribution Page Ease of Use ---")
    en_df = df[df['language'] == 'en']
    de_df = df[df['language'] == 'de']
    en_scores = en_df['attr_q_ease_of_use'].dropna()
    de_scores = de_df['attr_q_ease_of_use'].dropna()
    if len(en_scores) > 0 and len(de_scores) > 0:
        stat, p = mannwhitneyu(en_scores, de_scores, alternative='two-sided')
        print("Mann-Whitney U test for 'Ease of Use' on Attribution page (English vs. German):")
        print(f"U-statistic: {stat:.4f}, p-value: {p:.4f}")
        if p < 0.05:
            print("Result: There is a statistically significant difference between the language groups.")
        else:
            print("Result: There is no statistically significant difference between the language groups.")
    else:
        print("Could not perform Mann-Whitney U test due to insufficient data.")
def run_experience_correctness_test(df):
    """Test for a correlation between self-reported LLM experience and comprehension correctness."""
    print("\n--- 3. Correlation between LLM Experience and Comprehension Correctness ---")
    # Map the ordinal experience levels onto integers for a rank correlation.
    experience_map = {'novice': 1, 'intermediate': 2, 'expert': 3}
    df['llm_experience_ordinal'] = df['llm_experience'].map(experience_map)
    # Average all comprehension-question correctness columns into one score;
    # the derived column is excluded so the function is safe to call twice.
    correct_cols = [col for col in df.columns if 'correct' in col and col != 'overall_correctness']
    df['overall_correctness'] = df[correct_cols].mean(axis=1)
    corr_df = df[['llm_experience_ordinal', 'overall_correctness']].dropna()
    if not corr_df.empty:
        corr, p = spearmanr(corr_df['llm_experience_ordinal'], corr_df['overall_correctness'])
        print("Spearman correlation between LLM experience and overall comprehension correctness:")
        print(f"Rho: {corr:.4f}, p-value: {p:.4f}")
        if p < 0.05:
            print("Result: There is a statistically significant correlation.")
        else:
            print("Result: There is no statistically significant correlation.")
    else:
        print("Could not perform Spearman correlation due to insufficient data.")
if __name__ == '__main__':
    try:
        # Path is relative to this script's location in user_study/scripts/.
        data = load_and_preprocess_data('../../user_study/data/user_data.csv')
        run_ux_ratings_test(data)
        run_language_comparison_test(data)
        run_experience_correctness_test(data)
    except Exception as e:
        print(f"An error occurred: {e}")