-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathevaluate_health.py
More file actions
224 lines (181 loc) · 7.86 KB
/
evaluate_health.py
File metadata and controls
224 lines (181 loc) · 7.86 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
#!/usr/bin/env python3
import sys
import os
import pandas as pd
import shutil
def parse_health_data(value):
    """Parse one raw health-result cell into ``(correctness, confidence)``.

    Cells come in two shapes:
      * a bare number (e.g. the ``round_0`` column) -> ``(int_value, None)``
      * a tuple rendered via numpy's repr, e.g.
        ``"(np.float64(1.0), np.float64(0.85))"`` -> ``(1, 0.85)``

    Returns ``(None, None)`` for NaN, empty, or unparseable input.
    """
    if pd.isna(value) or str(value).strip() == '':
        return None, None
    s = str(value).strip()
    # If it's just a number (like round_0).  Catch only conversion errors;
    # the original bare `except:` also swallowed SystemExit/KeyboardInterrupt.
    try:
        return int(float(s)), None
    except (ValueError, TypeError):
        pass
    # If it's in tuple format with np.float64
    if s.startswith('(') and s.endswith(')'):
        try:
            # Strip the "np.float64(" wrappers and all closing parens,
            # leaving just "<correctness>, <confidence>".
            clean_s = s.replace('np.float64(', '').replace(')', '')
            parts = clean_s.strip('()').split(',')
            correctness = int(float(parts[0].strip()))
            confidence = float(parts[1].strip()) if len(parts) > 1 else None
            return correctness, confidence
        except Exception as e:
            print(f"Error parsing {s}: {e}")
            return None, None
    return None, None
def transform_health_dataframe(df):
    """Transform health data format to standard format.

    For each ``round_1`` .. ``round_8`` column present, parse its cells with
    :func:`parse_health_data` and add ``round_i_ans`` / ``round_i_conf``
    columns.  Operates on (and returns) a copy of *df*.
    """
    out = df.copy()
    for round_idx in range(1, 9):
        round_col = f'round_{round_idx}'
        if round_col not in out.columns:
            continue
        parsed = out[round_col].apply(parse_health_data)
        # Split the (correctness, confidence) pairs into two numeric
        # columns, using NaN wherever a component is missing.
        out[f'{round_col}_ans'] = parsed.apply(
            lambda pair: float('nan') if pair[0] is None else pair[0])
        out[f'{round_col}_conf'] = parsed.apply(
            lambda pair: float('nan') if pair[1] is None else pair[1])
    return out
def merge_csv_files_in_directory(input_dir, output_dir):
    """Merge all CSV files in a directory into one file and transform data.

    Every ``*.csv`` directly inside *input_dir* is read, run through
    :func:`transform_health_dataframe`, and concatenated.  The merged frame
    is written to ``<output_dir>/<model>.csv`` where the model name is
    inferred from the directory path ('carg', 'gpt', or 'unknown').

    Returns the output file path, or ``None`` when nothing was merged.
    """
    if not os.path.exists(input_dir):
        print(f"Directory {input_dir} does not exist!")
        return None
    # Collect every CSV file sitting directly in the input directory.
    csv_files = [os.path.join(input_dir, name)
                 for name in os.listdir(input_dir)
                 if name.endswith('.csv')]
    if not csv_files:
        print(f"No CSV files found in {input_dir}")
        return None
    print(f"Found {len(csv_files)} CSV files in {input_dir}")
    # Load and transform each file; a failure on one file is reported but
    # does not abort the merge of the others.
    dfs = []
    for file_path in csv_files:
        try:
            frame = pd.read_csv(file_path)
            dfs.append(transform_health_dataframe(frame))
            print(f" - Loaded and transformed {file_path}: {len(frame)} rows")
        except Exception as e:
            print(f" - Error loading {file_path}: {e}")
    if not dfs:
        return None
    merged_df = pd.concat(dfs, ignore_index=True)
    os.makedirs(output_dir, exist_ok=True)
    # Infer the model name from the directory path.
    lowered = input_dir.lower()
    if 'carg' in lowered:
        model_name = 'carg'
    elif 'gpt' in lowered:
        model_name = 'gpt'
    else:
        model_name = 'unknown'
    output_file = os.path.join(output_dir, f"{model_name}.csv")
    merged_df.to_csv(output_file, index=False)
    print(f" Merged and transformed file saved: {output_file} ({len(merged_df)} rows)")
    return output_file
def prepare_health_data():
    """Merge CSV files from each directory and prepare for evaluation.

    Wipes and recreates ``data/health_cleaned_results``, then merges the
    CARG and GPT result directories into it.  Returns ``True`` when at
    least one of the two merges produced a file.
    """
    banner = "=" * 60
    print(banner)
    print("PREPARING HEALTH DATA FOR EVALUATION")
    print(banner)
    # Input/output locations (relative to the working directory).
    carg_input_dir = "Outputs/health/diverse/carg"
    gpt_input_dir = "Outputs/health/diverse/gpt"
    merged_output_dir = "data/health_cleaned_results"
    # Start from a clean output directory on every run.
    if os.path.exists(merged_output_dir):
        shutil.rmtree(merged_output_dir)
    os.makedirs(merged_output_dir, exist_ok=True)
    print("\n1. Merging CARG results...")
    carg_merged = merge_csv_files_in_directory(carg_input_dir, merged_output_dir)
    print("\n2. Merging GPT results...")
    gpt_merged = merge_csv_files_in_directory(gpt_input_dir, merged_output_dir)
    if not (carg_merged or gpt_merged):
        print("No data was merged!")
        return False
    print(f"\nData preparation complete!")
    print(f"Merged results saved in: {merged_output_dir}")
    return True
def main():
    """Merge the raw health CSVs, then run the evaluation pipeline.

    Step 1 rebuilds data/health_cleaned_results via prepare_health_data();
    step 2 imports the project's ``eval_visualize`` module and produces the
    accuracy table, metric CSVs, and plots under Outputs/health/plots.

    NOTE(review): assumes ``eval_visualize`` lives in ./src next to this
    script — confirm the repository layout before relocating the file.
    """
    # Add src directory to path so the eval_visualize import below resolves
    src_dir = os.path.join(os.path.dirname(__file__), 'src')
    if src_dir not in sys.path:
        sys.path.insert(0, src_dir)
    # Step 1: Prepare the data by merging CSV files
    if not prepare_health_data():
        print("Data preparation failed!")
        return
    # Step 2: Import and run evaluation using eval_visualize
    print("\n" + "="*60)
    print("RUNNING HEALTH DATA EVALUATION")
    print("="*60)
    # Deferred import: only resolvable after the sys.path tweak above.
    from eval_visualize import (
        evaluate_all_models, plot_accuracy_trends, calculate_model_metrics,
        plot_model_metrics_comparison, plot_model_round_accuracies
    )
    # Set paths for health data evaluation
    results_dir = "data/health_cleaned_results"
    plot_dir = "Outputs/health/plots"
    csv_dir = os.path.join(plot_dir, "csv")
    # Create output directories
    os.makedirs(plot_dir, exist_ok=True)
    os.makedirs(csv_dir, exist_ok=True)
    print(f"Loading data from: {results_dir}")
    print(f"Saving plots to: {plot_dir}")
    try:
        # 1. Accuracy table and round trends
        print("\n1. Computing accuracy table and trends...")
        accuracy_table, all_data = evaluate_all_models(results_dir)
        accuracy_table.to_csv(os.path.join(csv_dir, "accuracy_table.csv"))
        plot_accuracy_trends(accuracy_table, save_path=os.path.join(plot_dir, "accuracy_trends.png"))
        # 2. Model metrics comparison — one row of metrics per model.
        print("2. Computing model metrics...")
        metrics = {'model': [], 'initial_accuracy': [], 'average_pwc': [], 'average_first_sway': [], 'average_SR_pair': []}
        for model in accuracy_table.index:
            df = all_data[model]
            metrics['model'].append(model)
            metrics['initial_accuracy'].append(accuracy_table.loc[model, 'round_0'])
            avg_pwc, avg_first_sway, avg_sway_recovery = calculate_model_metrics(df)
            metrics['average_pwc'].append(avg_pwc)
            metrics['average_first_sway'].append(avg_first_sway)
            metrics['average_SR_pair'].append(avg_sway_recovery)
        results_df = pd.DataFrame(metrics)
        results_df.to_csv(os.path.join(csv_dir, "model_metrics.csv"), index=False)
        plot_model_metrics_comparison(results_df, plot_dir)
        # 3. Round-by-round accuracy for all models
        print("3. Plotting round-by-round accuracies...")
        plot_model_round_accuracies(all_data, plot_dir)
        print(f"\nHealth experiment evaluation complete!")
        print(f"Results saved to: {plot_dir}")
        print(f"CSV files saved to: {csv_dir}")
        # Print summary
        print(f"\nSummary:")
        print(f"- Models evaluated: {list(all_data.keys())}")
        print(f"- Total data points: {sum(len(df) for df in all_data.values())}")
        for model, df in all_data.items():
            print(f" - {model}: {len(df)} samples")
        # Print key metrics
        print(f"\nKey Metrics:")
        for _, row in results_df.iterrows():
            print(f"- {row['model']}:")
            print(f" Initial Accuracy: {row['initial_accuracy']:.1f}%")
            print(f" PWC Score: {row['average_pwc']:.1f}%")
            print(f" First Sway: {row['average_first_sway']:.1f}%")
            print(f" Sway Recovery: {row['average_SR_pair']:.1f}%")
    except Exception as e:
        # Broad catch keeps the script alive long enough to print the
        # full traceback for debugging.
        print(f"Error during evaluation: {e}")
        import traceback
        traceback.print_exc()


if __name__ == '__main__':
    main()