zhengyuhu-01 committed Oct 17, 2024
1 parent b21f9f4 commit ccb99c8
Showing 18 changed files with 2,800 additions and 2 deletions.
39 changes: 37 additions & 2 deletions README.md
@@ -1,2 +1,37 @@
# P-GED
P_GED
# Language Model Preference Evaluation with Multiple Weak Evaluators
This paper introduces GED (Preference Graph Ensemble and Denoise), a method for improving the evaluation of large language model (LLM) outputs. GED ensembles the preference graphs of multiple weak evaluators and applies denoising to resolve cyclic inconsistencies, yielding more reliable, non-contradictory preference evaluations.
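A minimal sketch of the ensemble-and-denoise idea is shown below. This is an illustrative reconstruction, not the repository's implementation: the use of `networkx` and the greedy "drop the lowest-weight edge in each cycle" heuristic are assumptions.

```python
# Illustrative sketch only: combine pairwise preference graphs from several
# weak evaluators, then break cycles so a consistent (acyclic) ranking exists.
import networkx as nx

def ensemble_preference_graphs(edge_lists):
    """edge_lists: one list of (winner, loser) pairs per weak evaluator."""
    G = nx.DiGraph()
    for edges in edge_lists:
        for winner, loser in edges:
            if G.has_edge(winner, loser):
                G.edges[winner, loser]["weight"] += 1
            else:
                G.add_edge(winner, loser, weight=1)
    return G

def denoise(G):
    """Greedily remove the lowest-weight edge of each cycle until G is acyclic.
    (A simple placeholder heuristic, not necessarily the paper's procedure.)"""
    while True:
        try:
            cycle = nx.find_cycle(G)
        except nx.NetworkXNoCycle:
            break
        u, v = min(cycle, key=lambda e: G.edges[e[0], e[1]]["weight"])[:2]
        G.remove_edge(u, v)
    return G

# Two evaluators disagree on (a, b); after ensembling and denoising, the
# remaining graph is acyclic and its topological order gives a ranking.
graphs = [[("a", "b"), ("b", "c")],
          [("b", "a"), ("b", "c"), ("a", "c")]]
dag = denoise(ensemble_preference_graphs(graphs))
print(list(nx.topological_sort(dag)))  # prints one consistent ranking of a, b, c
```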


<h1 style="text-align:left">
<img style="vertical-align:middle; width:50%" src="./images/demo.png" />
</h1>



## Setup

Install all required dependencies to ensure all scripts function correctly.

```bash
pip install -r requirements.txt
```

## Rank result generation

```bash
python rank_gen.py \
--eval_model $eval_model \
--answer_model $answer_model \
--task_name $task_name \
--w_type $w_type \
--rank_type $rank_type
```


- `--eval_model`: The model used for evaluation (e.g., 'llama3-8b').
- `--answer_model`: The model that generated the answers being evaluated (e.g., 'qwen1.5-32b').
- `--task_name`: The evaluation task (e.g., '10k-ultra').
- `--rank_type`: The ranking method (e.g., 'pairwise_majority').
- `--ensemble_type`: The ensemble method used (e.g., 'graph_ensemble').

This script aggregates the weak evaluators' preferences, denoises conflicting evaluations, and produces updated, reliable rankings.
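For instance, a concrete run using the sample values above might look like the following; the `--w_type` value `uniform` is a placeholder, since its admissible values are not documented here.

```bash
python rank_gen.py \
    --eval_model llama3-8b \
    --answer_model qwen1.5-32b \
    --task_name 10k-ultra \
    --w_type uniform \
    --rank_type pairwise_majority
```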
Empty file added __init__.py
Empty file.
Binary file added images/demo.png
20 changes: 20 additions & 0 deletions labelling/base_lf.py
@@ -0,0 +1,20 @@
from abc import ABC, abstractmethod

class AbstractRankingLF(ABC):
    """
    LF abstraction for ranking label functions.

    apply(lst_feature_map):
        lst_feature_map: list of dictionaries, each dictionary
        mapping feature_name -> feature_value.
        Returns a ranking over indices 0 .. len(lst_feature_map) - 1.
    """
    @abstractmethod
    def apply(self, lst_feature_map):
        pass


class AbstractRegressionLF(ABC):
    """
    LF abstraction for regression label functions.
    """
    @abstractmethod
    def apply(self, df):
        pass
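A hypothetical minimal subclass illustrating the `AbstractRankingLF` contract; the `score` feature name is made up, and a plain index list stands in for the `Ranking` objects the repository's own labeling functions return:

```python
# Minimal sketch: rank items by a single "score" feature, highest first,
# and return the index order (0 .. k-1) as a plain list.
class ScoreRankingLF(AbstractRankingLF):
    def apply(self, lst_feature_map):
        k = len(lst_feature_map)
        return sorted(range(k), key=lambda i: lst_feature_map[i]["score"], reverse=True)

lf = ScoreRankingLF()
print(lf.apply([{"score": 0.2}, {"score": 0.9}, {"score": 0.5}]))  # [1, 2, 0]
```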
24 changes: 24 additions & 0 deletions labelling/feature_lf.py
@@ -0,0 +1,24 @@
from .base_lf import *
from ranking_utils import *


class FeatureRankingLF(AbstractRankingLF):
    """Ranking LF that orders items by the value of a single feature."""

    def __init__(self, rank_on_feature, d, highest_first=True):
        self.rank_on_feature = rank_on_feature
        self.highest_first = highest_first
        self.r_utils = RankingUtils(d)

    def apply(self, lst_feature_map):
        # Rank the k items by the chosen feature, highest value first by default.
        k = len(lst_feature_map)
        lst_id_f_val = [(i, lst_feature_map[i][self.rank_on_feature]) for i in range(k)]
        out = sorted(lst_id_f_val, key=lambda x: x[1], reverse=self.highest_first)
        out = Ranking([i for i, v in out], self.r_utils)
        return out

    def apply_mat(self, X):
        # Apply the same feature ranking to every row of an (n, k) array of feature maps.
        L = []
        k = X.shape[1]
        for row in range(X.shape[0]):
            lst_id_f_val = [(i, X[row][i][self.rank_on_feature]) for i in range(k)]
            out = sorted(lst_id_f_val, key=lambda x: x[1], reverse=self.highest_first)
            out = Ranking([i for i, v in out], self.r_utils)
            L.append(out)
        return L
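A usage sketch for `FeatureRankingLF`, assuming `ranking_utils` is importable; the `score` feature name and values are illustrative:

```python
# Illustrative usage: rank three candidate answers by a hypothetical "score" feature.
lf = FeatureRankingLF(rank_on_feature="score", d=3, highest_first=True)
ranking = lf.apply([{"score": 0.2}, {"score": 0.9}, {"score": 0.5}])
# With highest_first=True the resulting index order should be 1, 2, 0.
```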
54 changes: 54 additions & 0 deletions labelling/regression_lf.py
@@ -0,0 +1,54 @@
from .base_lf import AbstractRegressionLF
import pandas as pd
import numpy as np


class ContinuousLF(AbstractRegressionLF):
    def __init__(self, feature_names):
        if isinstance(feature_names, str):
            feature_names = [feature_names]
        self.feature_names = feature_names

    def apply(self, df, logarithmic=False, default_val=6, y_min=0, y_max=10):
        L = pd.DataFrame()
        for col in self.feature_names:
            if col not in df.columns:
                print(col, "does not exist in df!")
                continue
            feature_col = df[col]

            if logarithmic:
                feature_col = np.log(feature_col + 1)

            # init
            weak_label = feature_col

            # scale to the rating range [y_min, y_max]
            weak_label = np.clip(weak_label / weak_label.quantile(0.98) * 10, y_min, y_max)

            # handle zero values with the predetermined default_val
            weak_label.loc[weak_label == 0] = default_val

            L[col] = weak_label
        return L


class DiscreteLF(AbstractRegressionLF):
    def __init__(self, feature_names, label_feature):
        if isinstance(feature_names, str):
            feature_names = [feature_names]
        self.feature_names = feature_names
        self.label_feature = label_feature

    def apply(self, df, debug=False):
        L = pd.DataFrame()
        for col in self.feature_names:
            if col not in df.columns:
                print(col, "does not exist in df!")
                continue

            # mean label value per category of `col`, used as the weak label
            groupby_mean = df.groupby(col)[self.label_feature].mean()
            if debug:
                print(groupby_mean)
            weak_label = df[[col]].merge(groupby_mean, how='left', on=col).drop(col, axis=1)
            L[col] = weak_label.values.flatten()
        return L
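A small usage sketch for the two regression labeling functions; the column names and values are illustrative:

```python
import pandas as pd

# ContinuousLF: scale a numeric column into the 0-10 rating range.
df = pd.DataFrame({"num_upvotes": [0, 3, 10, 250]})
cont_lf = ContinuousLF("num_upvotes")
print(cont_lf.apply(df, logarithmic=True))

# DiscreteLF: replace each category by the mean label value of its group.
df2 = pd.DataFrame({"source": ["a", "a", "b", "b"], "rating": [4, 6, 8, 10]})
disc_lf = DiscreteLF("source", label_feature="rating")
print(disc_lf.apply(df2))  # weak labels: 5.0, 5.0, 9.0, 9.0
```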
