"""Statistical tests for the user study data.

Runs three analyses on the study CSV and prints the results:

1. Kruskal-Wallis test comparing mean UX ratings across the three pages.
2. Mann-Whitney U test comparing Attribution-page ease of use between the
   'en' and 'de' language groups.
3. Spearman correlation between LLM experience and comprehension correctness.
"""

import math
import os
import sys

import pandas as pd
from scipy.stats import kruskal, mannwhitneyu, spearmanr

# Significance threshold shared by all three tests.
_ALPHA = 0.05


def _print_verdict(p, significant_msg, not_significant_msg):
    """Print the result line matching the p-value.

    A NaN p-value (returned by scipy when the statistic is undefined, e.g.
    for constant input) is reported as inconclusive rather than silently
    falling into the "not significant" branch, because ``nan < 0.05`` is
    simply False.
    """
    if math.isnan(p):
        print("Result: The test statistic is undefined (p-value is NaN); "
              "no conclusion can be drawn.")
    elif p < _ALPHA:
        print(significant_msg)
    else:
        print(not_significant_msg)


def load_and_preprocess_data(filepath='user_study/data/user_data.csv'):
    """Load the user study CSV and derive the ease-of-use column.

    Args:
        filepath: Path to the CSV file. A relative path is resolved against
            the current working directory.

    Returns:
        A DataFrame with the CSV contents. When the column
        'attr_q_cognitive_load' is present, a column 'attr_q_ease_of_use'
        is added (or overwritten) with ``6 - attr_q_cognitive_load``.

    Raises:
        FileNotFoundError: If nothing exists at ``filepath``.
    """
    if not os.path.exists(filepath):
        raise FileNotFoundError(f"The data file was not found at {filepath}")

    df = pd.read_csv(filepath)

    # Reverse-code cognitive load so that higher means easier to use.
    # NOTE(review): 6 - x assumes a 1-5 rating scale -- confirm.
    if 'attr_q_cognitive_load' in df.columns:
        df['attr_q_ease_of_use'] = 6 - df['attr_q_cognitive_load']

    return df


def run_ux_ratings_test(df):
    """Compare per-participant mean UX ratings across the three pages.

    For each page, the row-wise mean of its three rating columns is taken
    (missing items are skipped by pandas' default ``skipna``); rows with no
    rating at all for a page are dropped from that page's sample. The three
    samples are compared with a Kruskal-Wallis H-test and the outcome is
    printed.

    Args:
        df: DataFrame containing the nine rating columns listed below.

    Raises:
        KeyError: If any of the rating columns is missing from ``df``.
    """
    print("\n--- 1. UX Ratings Comparison Across Pages ---")

    page_items = {
        'Attribution': [
            'attr_q_visual_clarity',
            'attr_q_ease_of_use',
            'attr_q_influencer_plausibility',
        ],
        'Function Vectors': [
            'fv_q_pca_clarity',
            'fv_q_type_attribution_clarity',
            'fv_q_layer_evolution_plausibility',
        ],
        'Circuit Trace': [
            'ct_q_main_graph_clarity',
            'ct_q_feature_explorer_usefulness',
            'ct_q_subnetwork_clarity',
        ],
    }
    samples = [
        df[columns].mean(axis=1).dropna() for columns in page_items.values()
    ]

    if not all(len(sample) > 0 for sample in samples):
        print("Could not perform Kruskal-Wallis test due to insufficient data.")
        return

    # NOTE(review): Kruskal-Wallis assumes independent samples, but all three
    # samples are derived from the same rows of df. If each row is one
    # participant rating every page, a repeated-measures test (e.g. Friedman)
    # may be more appropriate -- confirm with the study design.
    stat, p = kruskal(*samples)
    print("Kruskal-Wallis test for overall UX ratings across the three pages:")
    print(f"H-statistic: {stat:.4f}, p-value: {p:.4f}")
    _print_verdict(
        p,
        "Result: There is a statistically significant difference in UX ratings between the pages.",
        "Result: There is no statistically significant difference in UX ratings between the pages.",
    )


def run_language_comparison_test(df):
    """Compare Attribution-page ease of use between 'en' and 'de' rows.

    Rows are grouped by the 'language' column; missing ease-of-use values
    are dropped and the two groups are compared with a two-sided
    Mann-Whitney U test. The outcome is printed.

    Args:
        df: DataFrame with 'language' and 'attr_q_ease_of_use' columns.

    Raises:
        KeyError: If either column is missing from ``df``.
    """
    print("\n--- 2. Language Comparison for Attribution Page Ease of Use ---")

    en_scores = df.loc[df['language'] == 'en', 'attr_q_ease_of_use'].dropna()
    de_scores = df.loc[df['language'] == 'de', 'attr_q_ease_of_use'].dropna()

    if len(en_scores) == 0 or len(de_scores) == 0:
        print("Could not perform Mann-Whitney U test due to insufficient data.")
        return

    stat, p = mannwhitneyu(en_scores, de_scores, alternative='two-sided')
    print("Mann-Whitney U test for 'Ease of Use' on Attribution page (English vs. German):")
    print(f"U-statistic: {stat:.4f}, p-value: {p:.4f}")
    _print_verdict(
        p,
        "Result: There is a statistically significant difference between the language groups.",
        "Result: There is no statistically significant difference between the language groups.",
    )


def run_experience_correctness_test(df):
    """Test for a correlation between LLM experience and correctness.

    'llm_experience' is mapped to an ordinal (novice=1, intermediate=2,
    expert=3; any other value becomes NaN). Overall correctness is the
    row-wise mean of every column whose name contains 'correct'. Rows
    missing either value are dropped, and the Spearman rank correlation of
    the remaining pairs is printed.

    The input DataFrame is not modified; the derived columns live on an
    internal copy only.

    Args:
        df: DataFrame with an 'llm_experience' column and zero or more
            columns whose names contain 'correct'.

    Raises:
        KeyError: If 'llm_experience' is missing from ``df``.
    """
    print("\n--- 3. Correlation between LLM Experience and Comprehension Correctness ---")

    experience_map = {'novice': 1, 'intermediate': 2, 'expert': 3}
    correct_cols = [col for col in df.columns if 'correct' in str(col)]

    # assign() returns a new frame, so the caller's df keeps its columns.
    derived = df.assign(
        llm_experience_ordinal=df['llm_experience'].map(experience_map),
        overall_correctness=df[correct_cols].mean(axis=1),
    )
    corr_df = derived[['llm_experience_ordinal', 'overall_correctness']].dropna()

    # A rank correlation needs at least two paired observations; with fewer
    # the coefficient is undefined.
    if len(corr_df) < 2:
        print("Could not perform Spearman correlation due to insufficient data.")
        return

    corr, p = spearmanr(
        corr_df['llm_experience_ordinal'], corr_df['overall_correctness']
    )
    print("Spearman correlation between LLM experience and overall comprehension correctness:")
    print(f"Rho: {corr:.4f}, p-value: {p:.4f}")
    _print_verdict(
        p,
        "Result: There is a statistically significant correlation.",
        "Result: There is no statistically significant correlation.",
    )


def main():
    """Run all three analyses and return a process exit status.

    Returns:
        0 on success, 1 if loading the data or any analysis raised.
    """
    try:
        # Relative to the current working directory, not to this file.
        data = load_and_preprocess_data('../../user_study/data/user_data.csv')
        run_ux_ratings_test(data)
        run_language_comparison_test(data)
        run_experience_correctness_test(data)
    except Exception as e:
        # Top-level boundary: report on stderr and signal failure to the
        # caller through the exit status instead of exiting with 0.
        print(f"An error occurred: {e}", file=sys.stderr)
        return 1
    return 0


if __name__ == '__main__':
    sys.exit(main())