Skip to content

Commit

Permalink
Merge pull request #100 from dod-advana/task/UOT-130520
Browse files Browse the repository at this point in the history
task/UOT-130520 - more fixes finetuning/training data
  • Loading branch information
rha930 committed Mar 7, 2022
2 parents 53ebf09 + 8fb4c21 commit 5a5b9ca
Show file tree
Hide file tree
Showing 10 changed files with 226 additions and 174 deletions.
6 changes: 6 additions & 0 deletions gamechangerml/api/fastapi/routers/controls.py
Original file line number Diff line number Diff line change
Expand Up @@ -669,11 +669,17 @@ def finetune_sentence(model_dict):
testing_only = model_dict["testing_only"]
except:
testing_only = False
try:
remake_train_data = model_dict["remake_train_data"]
except:
remake_train_data = False
args = {
"batch_size": 8,
"epochs": int(model_dict["epochs"]),
"warmup_steps": int(model_dict["warmup_steps"]),
"testing_only": bool(testing_only),
"remake_train_data": bool(remake_train_data),
"retriever": MODELS.sentence_searcher,
}
pipeline.run(
build_type="sent_finetune",
Expand Down
1 change: 1 addition & 0 deletions gamechangerml/data/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
corpus/*
187 changes: 116 additions & 71 deletions gamechangerml/scripts/make_training_data.py

Large diffs are not rendered by default.

22 changes: 18 additions & 4 deletions gamechangerml/scripts/update_eval_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,18 @@
make_timestamp_directory, check_directory, CustomJSONizer
)
from gamechangerml import DATA_PATH
from gamechangerml.api.utils.pathselect import get_model_paths
import logging
logger = logging.getLogger()

model_path_dict = get_model_paths()
SENT_INDEX = model_path_dict['sentence']


def make_tiered_eval_data(index_path):

if not index_path:
index_path = SENT_INDEX

if not os.path.exists(os.path.join(DATA_PATH, "validation", "domain", "sent_transformer")):
os.mkdir(os.path.join(DATA_PATH, "validation", "domain", "sent_transformer"))
Expand Down Expand Up @@ -83,7 +93,8 @@ def save_data(

with open(metafile, "w") as outfile:
json.dump(metadata, outfile)

logger.info(f"***Saved intelligent search validation data to: {intel_path}")

return metadata

all_data = save_data(
Expand All @@ -94,7 +105,7 @@ def save_data(

silver_data = save_data(
level='silver',
filter_queries = True,
filter_queries = False,
**ValidationConfig.TRAINING_ARGS
)

Expand All @@ -107,5 +118,8 @@ def save_data(
return all_data, silver_data, gold_data

if __name__ == '__main__':

make_tiered_eval_data()

try:
make_tiered_eval_data(index_path=None)
except Exception as e:
logger.warning(e, exc_info=True)
18 changes: 18 additions & 0 deletions gamechangerml/src/model_testing/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,24 @@ def reciprocal_rank(ranked_results: List[str], expected: List[str]) -> float:
else:
return 0

def reciprocal_rank_score(ranked_scores: List[str]) -> float:
'''
Calculates the reciprocal of the rank of the first correct score (returns single value from 0 to 1).
'''
first_relevant_rank = 0 # if no relevant results show up, the RR will be 0
count = 1
for i in ranked_scores: # list in order of rank
if i == 1:
first_relevant_rank = count
break
else:
count += 1

if first_relevant_rank > 0:
return np.round((1 / first_relevant_rank), 3)
else:
return 0

def get_MRR(reciprocal_ranks: List[float]) -> float:
'''Takes list of reciprocal rank scores for each search and averages them.'''
return np.round(np.mean(reciprocal_ranks), 3)
Expand Down
2 changes: 1 addition & 1 deletion gamechangerml/src/model_testing/validation_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -428,7 +428,7 @@ def clean_quot(string):
return string.replace(""", "'").replace("'", "'").lower()

def clean_doc(string):
return string.strip(".pdf")
return string.split(".pdf")[0]

def is_question(string):
"""If we find a good way to use search history for QA validation (not used currently)"""
Expand Down
108 changes: 23 additions & 85 deletions gamechangerml/src/search/sent_transformer/finetune.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
import torch.nn.functional as F
from torch import nn
torch.cuda.empty_cache()
from gamechangerml.src.model_testing.metrics import reciprocal_rank_score, get_MRR

S3_DATA_PATH = "bronze/gamechanger/ml-data"

Expand Down Expand Up @@ -47,19 +48,7 @@ def fix_model_config(model_load_path):
logger.info("Could not update model config file")


def get_cos_sim(model, pair):

emb1 = model.encode(pair[0], show_progress_bar=False)
emb2 = model.encode(pair[1], show_progress_bar=False)
try:
sim = float(util.cos_sim(emb1, emb2))
except:
sim = float(cos_sim(emb1, emb2))

return sim


def format_inputs(train, test):
def format_inputs(train, test, data_dir):
"""Create input data for dataloader and df for tracking cosine sim"""

train_samples = []
Expand All @@ -71,20 +60,21 @@ def format_inputs(train, test):
score = float(train[i]["label"])
inputex = InputExample(str(count), texts, score)
train_samples.append(inputex)
all_data.append([i, texts, score, "train"])
all_data.append([train[i]["query"], texts, score, "train"])
count += 1
#processmanager.update_status(processmanager.loading_data, count, total)

for x in test.keys():
texts = [test[x]["query"], test[x]["paragraph"]]
score = float(test[x]["label"])
all_data.append([x, texts, score, "test"])
all_data.append([test[x]["query"], texts, score, "test"])
count += 1
#processmanager.update_status(processmanager.loading_data, count, total)
processmanager.update_status(processmanager.loading_data, count, total)

df = pd.DataFrame(all_data, columns=["key", "pair", "score", "label"])
df.to_csv(os.path.join(data_dir, timestamp_filename("finetuning_data", ".csv")))

return train_samples, df
return train_samples


class STFinetuner():
Expand All @@ -98,10 +88,8 @@ def __init__(self, model_load_path, model_save_path, shuffle, batch_size, epochs
self.batch_size = batch_size
self.epochs = epochs
self.warmup_steps = warmup_steps
self.device = torch.device(
"cuda" if torch.cuda.is_available() else "cpu")
#self.pin_memory = True if torch.cuda.is_available() else False

self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def retrain(self, data_dir, testing_only, version):

try:
Expand All @@ -123,12 +111,7 @@ def retrain(self, data_dir, testing_only, version):
processmanager.update_status(processmanager.training, 0, 1)
sleep(0.1)
# make formatted training data
train_samples, df = format_inputs(train, test)

# get cosine sim before finetuning
# TODO: you should be able to encode this more efficiently
df["original_cos_sim"] = df["pair"].apply(
lambda x: get_cos_sim(self.model, x))
train_samples = format_inputs(train, test, data_dir)

# finetune on samples
logger.info("Starting dataloader...")
Expand All @@ -150,63 +133,7 @@ def retrain(self, data_dir, testing_only, version):

# when not testing only, save to S3
if not testing_only:
dst_path = self.model_save_path + ".tar.gz"
utils.create_tgz_from_dir(
src_dir=self.model_save_path, dst_archive=dst_path)
model_id = self.model_save_path.split('_')[1]
logger.info(f"*** Created tgz file and saved to {dst_path}")

S3_MODELS_PATH = "bronze/gamechanger/models"
s3_path = os.path.join(S3_MODELS_PATH, str(version))
utils.upload(s3_path, dst_path, "transformers", model_id)
logger.info(f"*** Saved model to S3: {s3_path}")

logger.info("*** Making finetuning results csv")
# get new cosine sim
df["new_cos_sim"] = df["pair"].apply(
lambda x: get_cos_sim(self.model, x))
df["change_cos_sim"] = df["new_cos_sim"] - df["original_cos_sim"]

# save all results to CSV
df.to_csv(os.path.join(data_dir, timestamp_filename(
"finetuning_results", ".csv")))

# create training metadata
positive_change_train = df[(df["score"] == 1.0) & (
df["label"] == "train")]["change_cos_sim"].median()
negative_change_train = df[(
df["score"] == -1.0) & (df["label"] == "train")]["change_cos_sim"].median()
neutral_change_train = df[(df["score"] == 0.0) & (
df["label"] == "train")]["change_cos_sim"].median()
positive_change_test = df[(df["score"] == 1.0) & (
df["label"] == "test")]["change_cos_sim"].median()
negative_change_test = df[(
df["score"] == -1.0) & (df["label"] == "test")]["change_cos_sim"].median()
neutral_change_test = df[(df["score"] == 0.0) & (
df["label"] == "test")]["change_cos_sim"].median()

ft_metadata = {
"date_finetuned": str(date.today()),
"data_dir": str(data_dir),
"positive_change_train": positive_change_train,
"negative_change_train": negative_change_train,
"neutral_change_train": neutral_change_train,
"positive_change_test": positive_change_test,
"negative_change_test": negative_change_test,
"neutral_change_test": neutral_change_test
}

# save metadata file
ft_metadata_path = os.path.join(
data_dir, timestamp_filename("finetuning_metadata", ".json"))
with open(ft_metadata_path, "w") as outfile:
json.dump(ft_metadata, outfile)

logger.info("Metadata saved to {}".format(ft_metadata_path))
logger.info(str(ft_metadata))

# when not testing only, save to S3
if not testing_only:
logger.info("Saving data to S3...")
s3_path = os.path.join(S3_DATA_PATH, f"{version}")
logger.info(f"**** Saving new data files to S3: {s3_path}")
dst_path = data_dir + ".tar.gz"
Expand All @@ -216,7 +143,18 @@ def retrain(self, data_dir, testing_only, version):
logger.info("*** Attempting to upload data to s3")
utils.upload(s3_path, dst_path, "data", model_name)

return ft_metadata
logger.info("Saving model to S3...")
dst_path = self.model_save_path + ".tar.gz"
utils.create_tgz_from_dir(src_dir=self.model_save_path, dst_archive=dst_path)
model_id = self.model_save_path.split('_')[1]
logger.info(f"*** Created tgz file and saved to {dst_path}")

S3_MODELS_PATH = "bronze/gamechanger/models"
s3_path = os.path.join(S3_MODELS_PATH, str(version))
utils.upload(s3_path, dst_path, "transformers", model_id)
logger.info(f"*** Saved model to S3: {s3_path}")

return {}

except Exception as e:
logger.warning("Could not complete finetuning")
Expand Down
8 changes: 6 additions & 2 deletions gamechangerml/src/search/sent_transformer/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
from gamechangerml.src.model_testing.validation_data import MSMarcoData



class SentenceEncoder(object):
"""
Handles text encoding and creating of ANNOY index
Expand Down Expand Up @@ -92,6 +93,7 @@ def _index(self, corpus, index_path, overwrite=False, save_embedding=False):
dataframe_path = os.path.join(index_path, "data.csv")
ids_path = os.path.join(index_path, "doc_ids.txt")

'''
# Load new data
if os.path.isfile(embedding_path) and (overwrite is False):
logger.info(f"Loading new data from {embedding_path}")
Expand All @@ -113,6 +115,7 @@ def _index(self, corpus, index_path, overwrite=False, save_embedding=False):
# Append new dataframe
old_df = pd.read_csv(dataframe_path)
df = pd.concat([old_df, df])
'''

# Store embeddings and document index
# for future reference
Expand Down Expand Up @@ -161,10 +164,11 @@ def index_documents(self, corpus_path, index_path):
verbose=self.verbose,
)
corpus = [(para_id, " ".join(tokens), None)
for tokens, para_id in corp]
for tokens, para_id in corp]
logger.info(
f"\nLength of batch (in par ids) for indexing : {str(len(corpus))}"
)

else:
logger.info(
"Did not include path to corpus, making test index with msmarco data"
Expand All @@ -178,7 +182,7 @@ def index_documents(self, corpus_path, index_path):

self._index(corpus, index_path)
processmanager.update_status(
processmanager.training, 1, 1, "finished building sent index"
processmanager.training, 1, 0, "finished building sent index"
)

self.embedder.save(index_path)
Expand Down
2 changes: 2 additions & 0 deletions gamechangerml/src/utilities/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -324,6 +324,7 @@ def concat_csvs(directory):
df = pd.DataFrame()
logger.info(str(directory))
csvs = [i for i in os.listdir(directory) if i.split('.')[-1]=='csv']
csvs = [i for i in csvs if i[:2] != '._']
logger.info(f"Combining csvs: {str(csvs)}")
for i in csvs:
try:
Expand All @@ -347,3 +348,4 @@ def get_most_recent_dir(parent_dir):
return max(subdirs, key=os.path.getctime)
else:
logger.error("There are no subdirectories to retrieve most recent data from")
return None
Loading

0 comments on commit 5a5b9ca

Please sign in to comment.