added predicted win rates reporting to bradley terry subj eval methods with an option to switch between win rates and elo ratings (#1815)
acylam authored Jan 10, 2025
1 parent 121d482 commit 7f2aeef
Showing 3 changed files with 76 additions and 1 deletion.
1 change: 1 addition & 0 deletions configs/eval_compassarena_subjectivebench_bradleyterry.py
@@ -121,6 +121,7 @@
summarizer = dict(
    type=CompassArenaBradleyTerrySummarizer,
    rating_system='bradleyterry',
    report_pred_win_rates=True,
    num_bootstrap=100,
    num_cpu=None,
    with_control_vars=True,
1 change: 1 addition & 0 deletions configs/eval_subjective_bradleyterry.py
@@ -122,6 +122,7 @@
summarizer = dict(
    type=CompassArenaBradleyTerrySummarizer,
    rating_system='bradleyterry',
    report_pred_win_rates=True,
    num_bootstrap=100,
    num_cpu=None,
    with_control_vars=True,
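Both example configs enable the new option with report_pred_win_rates=True. As a rough sketch of the relevant summarizer block only (all other config sections such as models, datasets, and judges are assumed to be defined as in the existing examples), setting the flag to False keeps the previous behavior of reporting the Bradley-Terry ratings:

# Sketch of the summarizer block only; everything else in the config is unchanged.
summarizer = dict(
    type=CompassArenaBradleyTerrySummarizer,
    rating_system='bradleyterry',
    # True  -> report predicted win rates against the baseline model
    # False -> report the Bradley-Terry (Elo-scale) ratings, as before
    report_pred_win_rates=True,
    num_bootstrap=100,
    num_cpu=None,
    with_control_vars=True,
)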
@@ -6,6 +6,7 @@
import multiprocessing as mp
import os
import os.path as osp
from collections import defaultdict
from datetime import datetime
from functools import partial
from typing import Any, Dict, List, Optional, Tuple
@@ -607,6 +608,7 @@ class CompassArenaBradleyTerrySummarizer(DefaultSubjectiveSummarizer):
summary_groups (List, optional): Passed to DefaultSubjectiveSummarizer. Not used for this class. Defaults to None.
prompt_db (_type_, optional): Legacy parameter kept for backward compatibility. Defaults to None.
rating_system (str, optional): Rating system used. Currently only supports "bradleyterry". Defaults to "bradleyterry".
report_pred_win_rates (bool, optional): Whether to report the predicted win rates (against the baseline model) instead of the arena ratings. Defaults to True.
num_bootstrap (int, optional): The number of bootstraps for estimating the confidence intervals. Defaults to 300.
num_cpu (int, optional): The number of CPUs to use for the BT bootstrapping process. Defaults to None.
with_control_vars (bool, optional): Whether to include additional covariates (including style features and group variables) when fitting the BT model. Defaults to True.
@@ -622,6 +624,7 @@ def __init__(
        summary_groups: List = None,
        prompt_db=None,
        rating_system: str = 'bradleyterry',
        report_pred_win_rates: bool = True,
        num_bootstrap: int = 300,
        num_cpu: int = None,
        with_control_vars: bool = True,
@@ -634,6 +637,7 @@

        self.summarizer_cfg = self.cfg['summarizer']
        self.rating_system = 'bradleyterry'  # Only bradleyterry supported
        self.report_pred_win_rates = report_pred_win_rates
        self.num_bootstrap = num_bootstrap
        self.num_cpu = num_cpu
        self.with_control_vars = with_control_vars
@@ -897,6 +901,7 @@ def _output_to_file(
'ranking',
'ranking_ub',
'model_name',
'predicted_win_rate',
'rating',
'rating_q975',
'rating_q025',
@@ -942,6 +947,55 @@ def flip_dict_levels(self, original_dict: Dict):

        return flipped_dict

    def predict_win_rate(
        self,
        ratings_df: pd.DataFrame,
        baseline_model: str,
        base: float = 10.0,
        scaling_factor: float = 400.0,
        round_win_rate: int = None,
    ) -> pd.DataFrame:
        """Predict each model's win rate against the baseline model from its
        Elo rating.

        Args:
            ratings_df (pd.DataFrame): DataFrame containing model ratings with model names as index.
            baseline_model (str): Name of the baseline model to use as reference.
            base (float): Base of the Elo formula. Defaults to 10.0.
            scaling_factor (float): Scaling factor for rating differences. Defaults to 400.0.
            round_win_rate (int, optional): Number of decimal places to round the predicted win rates to. Defaults to None (no rounding).

        Returns:
            pd.DataFrame: Copy of ratings_df with an additional column 'predicted_win_rate' containing the predicted win rate against the baseline model.
        """
        if baseline_model not in ratings_df.index:
            raise ValueError(
                f'Baseline model {baseline_model} not found in ratings')

        # Create a copy of the ratings dataframe to avoid modifying the original
        result_df = ratings_df.copy()

        # Initialize the column to 0.5; only the baseline model keeps this
        # value (a model is expected to beat itself half the time)
        result_df['predicted_win_rate'] = 0.5

        # Get the baseline model's rating
        baseline_rating = ratings_df.loc[baseline_model, 'rating']

        # Calculate win probabilities for all other models against the baseline
        for model, row in ratings_df.iterrows():
            if model != baseline_model:
                model_rating = row['rating']
                # Elo win probability formula
                win_rate = 1 / (1 + base**(
                    (baseline_rating - model_rating) / scaling_factor))
                result_df.loc[model, 'predicted_win_rate'] = win_rate

        if round_win_rate is not None:
            result_df['predicted_win_rate'] = result_df[
                'predicted_win_rate'].round(round_win_rate)

        return result_df

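For intuition on predict_win_rate, here is a small standalone sketch of the same Elo-style formula; the model names and ratings are made up for illustration. With base 10 and scaling factor 400, a model rated 100 points above the baseline is predicted to win roughly 64% of comparisons.

# Standalone sketch of the win-rate formula used by predict_win_rate above.
# Ratings and model names below are illustrative, not real benchmark output.
import pandas as pd

ratings_df = pd.DataFrame(
    {'rating': [1000.0, 1100.0, 950.0]},
    index=['baseline-model', 'model-a', 'model-b'],
)
base, scaling_factor = 10.0, 400.0
baseline_rating = ratings_df.loc['baseline-model', 'rating']

# P(model beats baseline) = 1 / (1 + base ** ((R_baseline - R_model) / scaling_factor))
ratings_df['predicted_win_rate'] = 1 / (
    1 + base ** ((baseline_rating - ratings_df['rating']) / scaling_factor))

print(ratings_df.round(4))
# baseline-model: 0.5, model-a: ~0.6401, model-b: ~0.4285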
    def summarize(
        self,
        output_path: str = None,
@@ -981,6 +1035,13 @@ def summarize(
groups=self.groups,
)

# Calculate predicted win_rate
cur_table_df = self.predict_win_rate(
    ratings_df=cur_table_df,
    baseline_model=base_model_abbr,
    round_win_rate=4,
)

control_coefficients[dataset_abbr][
    base_model_abbr] = cur_ctrl_coefs
leaderboard_tables[dataset_abbr][
@@ -1011,12 +1072,24 @@
base_model=list(base_models)[0],
groups=self.groups,
))
# Calculate predicted win_rate
cur_judge_all_scores_df = self.predict_win_rate(
    ratings_df=cur_judge_all_scores_df,
    baseline_model=list(base_models)[0],
    round_win_rate=4,
)
cur_judge_all_scores_df['judge'] = judge_abbr

all_scores_df_list.append(cur_judge_all_scores_df)

# Report predicted win rates or ratings
if self.report_pred_win_rates:
    _scores = cur_judge_all_scores_df['predicted_win_rate']
else:
    _scores = cur_judge_all_scores_df['rating']

all_scores[judge_abbr] = pd.Series(
-    cur_judge_all_scores_df['rating'],
+    _scores,
    index=cur_judge_all_scores_df['model_name'],
).to_dict()

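The effect of the flag on the reported per-judge scores, as a minimal sketch with made-up numbers (the column names follow the leaderboard table in this diff; everything else is illustrative and not the actual summarizer code path):

# Sketch of how report_pred_win_rates switches the reported scores.
# The dataframe mimics cur_judge_all_scores_df; the values are made up.
import pandas as pd

report_pred_win_rates = True

cur_judge_all_scores_df = pd.DataFrame({
    'model_name': ['model-a', 'model-b', 'baseline-model'],
    'rating': [1100.0, 950.0, 1000.0],
    'predicted_win_rate': [0.6401, 0.4285, 0.5],
})

if report_pred_win_rates:
    _scores = cur_judge_all_scores_df['predicted_win_rate']
else:
    _scores = cur_judge_all_scores_df['rating']

scores_by_model = pd.Series(
    _scores.values,
    index=cur_judge_all_scores_df['model_name'],
).to_dict()
print(scores_by_model)
# {'model-a': 0.6401, 'model-b': 0.4285, 'baseline-model': 0.5}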
