-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
b21f9f4
commit ccb99c8
Showing
18 changed files
with
2,800 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,2 +1,37 @@ | ||
# P-GED | ||
P_GED | ||
# Language Model Preference Evaluation with Multiple Weak Evaluators | ||
This paper introduces GED (Preference Graph Ensemble and Denoise), a method that improves the evaluation of large language model (LLM) outputs by ensembling multiple weak evaluators and applying denoising techniques to resolve cyclic inconsistencies in the resulting preference graphs, yielding more reliable, non-contradictory preference evaluations. | ||
|
||
|
||
<h1 style="text-align:left"> | ||
<img style="vertical-align:middle; width:50%" src="./images/demo.png" /> | ||
</h1> | ||
|
||
|
||
|
||
## Setup | ||
|
||
Install all required dependencies to ensure all scripts function correctly. | ||
|
||
```bash | ||
pip install -r requirements.txt | ||
``` | ||
|
||
## Rank result generation | ||
|
||
```bash | ||
python rank_gen.py \ | ||
--eval_model $eval_model \ | ||
--answer_model $answer_model \ | ||
--task_name $task_name \ | ||
--w_type $w_type \ | ||
--rank_type $rank_type | ||
``` | ||
|
||
|
||
- `--eval_model`: The model used for evaluation (e.g., 'llama3-8b'). | ||
- `--answer_model`: The model that generates the answers (e.g., 'qwen1.5-32b'). | ||
- `--task_name`: The evaluation task (e.g., '10k-ultra'). | ||
- `--rank_type`: The ranking method (e.g., 'pairwise_majority'). | ||
- `--ensemble_type`: The ensemble method (e.g., 'graph_ensemble'). | ||
|
||
This script generates updated rankings, denoising conflicting evaluations from the weak evaluators to produce reliable results. |
Empty file.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
from abc import ABC, abstractmethod | ||
|
||
class AbstractRankingLF(ABC):
    """Interface for ranking labeling functions.

    Implementations receive ``lst_feature_map``, a list of dictionaries
    that each map feature_name -> feature_value, and return a ranking over
    the indices 0 .. len(lst_feature_map) - 1.
    """

    @abstractmethod
    def apply(self, lst_feature_map):
        """Rank the items in ``lst_feature_map``; subclasses must override."""
|
||
|
||
class AbstractRegressionLF(ABC):
    """Interface for regression labeling functions (LFs)."""

    @abstractmethod
    def apply(self, df):
        """Produce weak labels from ``df``; subclasses must override."""
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
from .base_lf import * | ||
from ranking_utils import * | ||
class FeatureRankingLF(AbstractRankingLF):
    """Ranking labeling function that orders items by one feature's value."""

    def __init__(self, rank_on_feature, d, highest_first=True):
        """Store the ranking configuration.

        rank_on_feature: key used to look up the value each item is ranked on.
        d: forwarded unchanged to ``RankingUtils``.
        highest_first: when True the largest feature value is ranked first.
        """
        self.rank_on_feature = rank_on_feature
        self.highest_first = highest_first
        self.r_utils = RankingUtils(d)

    def _rank_items(self, items, k):
        """Return a ``Ranking`` of indices 0..k-1 ordered by the feature value."""
        feature = self.rank_on_feature
        # sorted() is stable (also with reverse=True), so items with equal
        # feature values keep their original relative order.
        order = sorted(
            range(k),
            key=lambda idx: items[idx][feature],
            reverse=self.highest_first,
        )
        return Ranking(order, self.r_utils)

    def apply(self, lst_feature_map):
        """Rank the entries of a single list of feature maps."""
        return self._rank_items(lst_feature_map, len(lst_feature_map))

    def apply_mat(self, X):
        """Rank every row of ``X`` and return the rankings as a list.

        ``X.shape[0]`` is the number of rows and ``X.shape[1]`` the number
        of items ranked within each row.
        """
        k = X.shape[1]
        return [self._rank_items(X[row], k) for row in range(X.shape[0])]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,54 @@ | ||
from .base_lf import AbstractRegressionLF | ||
import pandas as pd | ||
import numpy as np | ||
|
||
class ContinuousLF(AbstractRegressionLF):
    """Regression labeling function that rescales continuous feature columns.

    Each configured column is turned into a weak label by dividing it by its
    0.98 quantile, multiplying by 10 and clipping to ``[y_min, y_max]``.
    """

    def __init__(self, feature_names):
        """Store the column name(s) to convert into weak labels.

        feature_names: a single column name or an iterable of column names.
        """
        # isinstance (not an exact type comparison) so that str subclasses
        # are also wrapped, instead of being iterated character by character.
        if isinstance(feature_names, str):
            feature_names = [feature_names]
        self.feature_names = feature_names

    def apply(self, df, logarithmic=False, default_val=6, y_min=0, y_max=10):
        """Build a DataFrame of weak labels, one column per known feature.

        df: DataFrame holding the feature columns.
        logarithmic: when True, apply ``log(x + 1)`` before scaling.
        default_val: value written wherever the scaled label is exactly 0.
        y_min, y_max: clipping bounds applied after scaling.

        Returns a DataFrame whose columns are the processed feature names.
        Feature names missing from ``df`` are reported on stdout and skipped.
        """
        L = pd.DataFrame()
        for col in self.feature_names:
            if col not in df.columns:
                print(col, "doest not exist in df!")
                continue

            feature_col = df[col]
            if logarithmic:
                feature_col = np.log(feature_col + 1)

            # Scale so the 0.98 quantile maps to 10, then clip to the bounds.
            # NOTE(review): the factor 10 is hard-coded and does not follow
            # y_max; confirm whether it should when y_max != 10.
            weak_label = np.clip(
                feature_col / feature_col.quantile(0.98) * 10, y_min, y_max
            )

            # Zero replacement runs after clipping, so values clipped down to
            # a y_min of 0 also receive default_val.
            weak_label.loc[weak_label == 0] = default_val

            L[col] = weak_label
        return L
|
||
|
||
class DiscreteLF(AbstractRegressionLF):
    """Regression labeling function based on per-category label means.

    For each configured column, every row receives the mean of
    ``label_feature`` over all rows sharing that row's value in the column.
    """

    def __init__(self, feature_names, label_feature):
        """Store the grouping column name(s) and the label column name.

        feature_names: a single column name or an iterable of column names.
        label_feature: column whose per-group mean becomes the weak label.
        """
        # isinstance (not an exact type comparison) so that str subclasses
        # are also wrapped, instead of being iterated character by character.
        if isinstance(feature_names, str):
            feature_names = [feature_names]
        self.feature_names = feature_names
        self.label_feature = label_feature

    def apply(self, df, debug=False):
        """Build a DataFrame of weak labels, one column per known feature.

        df: DataFrame holding the grouping columns and ``label_feature``.
        debug: when True, print the per-group means for each column.

        Returns a DataFrame whose columns are the processed feature names.
        Feature names missing from ``df`` are reported on stdout and skipped.
        """
        L = pd.DataFrame()
        for col in self.feature_names:
            if col not in df.columns:
                print(col, "doest not exist in df!")
                continue

            # Select the label column BEFORE aggregating. Averaging every
            # column first raises TypeError on pandas >= 2.0 as soon as df
            # holds any non-numeric column, and wastes work on unused columns.
            groupby_mean = df.groupby(col)[self.label_feature].mean()
            if debug:
                print(groupby_mean)

            # Left merge keeps the row order of df; the grouping key column
            # is dropped so only the per-row group mean remains.
            weak_label = df[[col]].merge(groupby_mean, how='left', on=col).drop(col, axis=1)
            L[col] = np.ndarray.flatten(weak_label.values)
        return L
Oops, something went wrong.