Merge pull request #100 from dod-advana/task/UOT-130520

task/UOT-130520 - more fixes finetuning/training data
dod-advana · Mar 7, 2022 · 5a5b9ca · 5a5b9ca
2 parents 53ebf09 + 8fb4c21
commit 5a5b9ca
Show file tree

Hide file tree

Showing 10 changed files with 226 additions and 174 deletions.
diff --git a/gamechangerml/api/fastapi/routers/controls.py b/gamechangerml/api/fastapi/routers/controls.py
@@ -669,11 +669,17 @@ def finetune_sentence(model_dict):
  testing_only = model_dict["testing_only"]
  except:
  testing_only = False
+ try:
+ remake_train_data = model_dict["remake_train_data"]
+ except:
+ remake_train_data = False
  args = {
  "batch_size": 8,
  "epochs": int(model_dict["epochs"]),
  "warmup_steps": int(model_dict["warmup_steps"]),
  "testing_only": bool(testing_only),
+ "remake_train_data": bool(remake_train_data),
+ "retriever": MODELS.sentence_searcher,
  }
  pipeline.run(
  build_type="sent_finetune",

diff --git a/gamechangerml/data/.gitignore b/gamechangerml/data/.gitignore
@@ -0,0 +1 @@
+corpus/*
diff --git a/gamechangerml/scripts/make_training_data.py b/gamechangerml/scripts/make_training_data.py
diff --git a/gamechangerml/scripts/update_eval_data.py b/gamechangerml/scripts/update_eval_data.py
@@ -8,8 +8,18 @@
  make_timestamp_directory, check_directory, CustomJSONizer
  )
 from gamechangerml import DATA_PATH
+from gamechangerml.api.utils.pathselect import get_model_paths
+import logging
+logger = logging.getLogger()
+
+model_path_dict = get_model_paths()
+SENT_INDEX = model_path_dict['sentence']
+
 
 def make_tiered_eval_data(index_path):
+
+ if not index_path:
+ index_path = SENT_INDEX
 
  if not os.path.exists(os.path.join(DATA_PATH, "validation", "domain", "sent_transformer")):
  os.mkdir(os.path.join(DATA_PATH, "validation", "domain", "sent_transformer"))
@@ -83,7 +93,8 @@ def save_data(
 
  with open(metafile, "w") as outfile:
  json.dump(metadata, outfile)
-
+ logger.info(f"***Saved intelligent search validation data to: {intel_path}") 
+
  return metadata
 
  all_data = save_data(
@@ -94,7 +105,7 @@ def save_data(
 
  silver_data = save_data(
  level='silver',
- filter_queries = True,
+ filter_queries = False,
  **ValidationConfig.TRAINING_ARGS
  )
 
@@ -107,5 +118,8 @@ def save_data(
  return all_data, silver_data, gold_data
 
 if __name__ == '__main__':
-
- make_tiered_eval_data()
+
+ try:
+ make_tiered_eval_data(index_path=None)
+ except Exception as e:
+ logger.warning(e, exc_info=True)
diff --git a/gamechangerml/src/model_testing/metrics.py b/gamechangerml/src/model_testing/metrics.py
@@ -72,6 +72,24 @@ def reciprocal_rank(ranked_results: List[str], expected: List[str]) -> float:
  else:
  return 0
 
+def reciprocal_rank_score(ranked_scores: List[str]) -> float:
+ '''
+ Calculates the reciprocal of the rank of the first correct score (returns single value from 0 to 1).
+ '''
+ first_relevant_rank = 0 # if no relevant results show up, the RR will be 0
+ count = 1
+ for i in ranked_scores: # list in order of rank
+ if i == 1:
+ first_relevant_rank = count
+ break
+ else:
+ count += 1
+
+ if first_relevant_rank > 0:
+ return np.round((1 / first_relevant_rank), 3)
+ else:
+ return 0
+
 def get_MRR(reciprocal_ranks: List[float]) -> float:
  '''Takes list of reciprocal rank scores for each search and averages them.'''
  return np.round(np.mean(reciprocal_ranks), 3)

diff --git a/gamechangerml/src/model_testing/validation_data.py b/gamechangerml/src/model_testing/validation_data.py
@@ -428,7 +428,7 @@ def clean_quot(string):
  return string.replace("&quot;", "'").replace("&#039;", "'").lower()
 
  def clean_doc(string):
- return string.strip(".pdf")
+ return string.split(".pdf")[0]
 
  def is_question(string):
  """If we find a good way to use search history for QA validation (not used currently)"""

diff --git a/gamechangerml/src/search/sent_transformer/finetune.py b/gamechangerml/src/search/sent_transformer/finetune.py
@@ -20,6 +20,7 @@
 import torch.nn.functional as F
 from torch import nn
 torch.cuda.empty_cache()
+from gamechangerml.src.model_testing.metrics import reciprocal_rank_score, get_MRR
 
 S3_DATA_PATH = "bronze/gamechanger/ml-data"
 
@@ -47,19 +48,7 @@ def fix_model_config(model_load_path):
  logger.info("Could not update model config file")
 
 
-def get_cos_sim(model, pair):
-
- emb1 = model.encode(pair[0], show_progress_bar=False)
- emb2 = model.encode(pair[1], show_progress_bar=False)
- try:
- sim = float(util.cos_sim(emb1, emb2))
- except:
- sim = float(cos_sim(emb1, emb2))
-
- return sim
-
-
-def format_inputs(train, test):
+def format_inputs(train, test, data_dir):
  """Create input data for dataloader and df for tracking cosine sim"""
 
  train_samples = []
@@ -71,20 +60,21 @@ def format_inputs(train, test):
  score = float(train[i]["label"])
  inputex = InputExample(str(count), texts, score)
  train_samples.append(inputex)
- all_data.append([i, texts, score, "train"])
+ all_data.append([train[i]["query"], texts, score, "train"])
  count += 1
  #processmanager.update_status(processmanager.loading_data, count, total)
 
  for x in test.keys():
  texts = [test[x]["query"], test[x]["paragraph"]]
  score = float(test[x]["label"])
- all_data.append([x, texts, score, "test"])
+ all_data.append([test[x]["query"], texts, score, "test"])
  count += 1
- #processmanager.update_status(processmanager.loading_data, count, total)
+ processmanager.update_status(processmanager.loading_data, count, total)
 
  df = pd.DataFrame(all_data, columns=["key", "pair", "score", "label"])
+ df.to_csv(os.path.join(data_dir, timestamp_filename("finetuning_data", ".csv")))
 
- return train_samples, df
+ return train_samples
 
 
 class STFinetuner():
@@ -98,10 +88,8 @@ def __init__(self, model_load_path, model_save_path, shuffle, batch_size, epochs
  self.batch_size = batch_size
  self.epochs = epochs
  self.warmup_steps = warmup_steps
- self.device = torch.device(
- "cuda" if torch.cuda.is_available() else "cpu")
- #self.pin_memory = True if torch.cuda.is_available() else False
-
+ self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
  def retrain(self, data_dir, testing_only, version):
 
  try:
@@ -123,12 +111,7 @@ def retrain(self, data_dir, testing_only, version):
  processmanager.update_status(processmanager.training, 0, 1)
  sleep(0.1)
  # make formatted training data
- train_samples, df = format_inputs(train, test)
-
- # get cosine sim before finetuning
- # TODO: you should be able to encode this more efficiently
- df["original_cos_sim"] = df["pair"].apply(
- lambda x: get_cos_sim(self.model, x))
+ train_samples = format_inputs(train, test, data_dir)
 
  # finetune on samples
  logger.info("Starting dataloader...")
@@ -150,63 +133,7 @@ def retrain(self, data_dir, testing_only, version):
 
  # when not testing only, save to S3
  if not testing_only:
- dst_path = self.model_save_path + ".tar.gz"
- utils.create_tgz_from_dir(
- src_dir=self.model_save_path, dst_archive=dst_path)
- model_id = self.model_save_path.split('_')[1]
- logger.info(f"*** Created tgz file and saved to {dst_path}")
-
- S3_MODELS_PATH = "bronze/gamechanger/models"
- s3_path = os.path.join(S3_MODELS_PATH, str(version))
- utils.upload(s3_path, dst_path, "transformers", model_id)
- logger.info(f"*** Saved model to S3: {s3_path}")
-
- logger.info("*** Making finetuning results csv")
- # get new cosine sim
- df["new_cos_sim"] = df["pair"].apply(
- lambda x: get_cos_sim(self.model, x))
- df["change_cos_sim"] = df["new_cos_sim"] - df["original_cos_sim"]
-
- # save all results to CSV
- df.to_csv(os.path.join(data_dir, timestamp_filename(
- "finetuning_results", ".csv")))
-
- # create training metadata
- positive_change_train = df[(df["score"] == 1.0) & (
- df["label"] == "train")]["change_cos_sim"].median()
- negative_change_train = df[(
- df["score"] == -1.0) & (df["label"] == "train")]["change_cos_sim"].median()
- neutral_change_train = df[(df["score"] == 0.0) & (
- df["label"] == "train")]["change_cos_sim"].median()
- positive_change_test = df[(df["score"] == 1.0) & (
- df["label"] == "test")]["change_cos_sim"].median()
- negative_change_test = df[(
- df["score"] == -1.0) & (df["label"] == "test")]["change_cos_sim"].median()
- neutral_change_test = df[(df["score"] == 0.0) & (
- df["label"] == "test")]["change_cos_sim"].median()
-
- ft_metadata = {
- "date_finetuned": str(date.today()),
- "data_dir": str(data_dir),
- "positive_change_train": positive_change_train,
- "negative_change_train": negative_change_train,
- "neutral_change_train": neutral_change_train,
- "positive_change_test": positive_change_test,
- "negative_change_test": negative_change_test,
- "neutral_change_test": neutral_change_test
- }
-
- # save metadata file
- ft_metadata_path = os.path.join(
- data_dir, timestamp_filename("finetuning_metadata", ".json"))
- with open(ft_metadata_path, "w") as outfile:
- json.dump(ft_metadata, outfile)
-
- logger.info("Metadata saved to {}".format(ft_metadata_path))
- logger.info(str(ft_metadata))
-
- # when not testing only, save to S3
- if not testing_only:
+ logger.info("Saving data to S3...")
  s3_path = os.path.join(S3_DATA_PATH, f"{version}")
  logger.info(f"**** Saving new data files to S3: {s3_path}")
  dst_path = data_dir + ".tar.gz"
@@ -216,7 +143,18 @@ def retrain(self, data_dir, testing_only, version):
  logger.info("*** Attempting to upload data to s3")
  utils.upload(s3_path, dst_path, "data", model_name)
 
- return ft_metadata
+ logger.info("Saving model to S3...")
+ dst_path = self.model_save_path + ".tar.gz"
+ utils.create_tgz_from_dir(src_dir=self.model_save_path, dst_archive=dst_path)
+ model_id = self.model_save_path.split('_')[1]
+ logger.info(f"*** Created tgz file and saved to {dst_path}")
+
+ S3_MODELS_PATH = "bronze/gamechanger/models"
+ s3_path = os.path.join(S3_MODELS_PATH, str(version))
+ utils.upload(s3_path, dst_path, "transformers", model_id)
+ logger.info(f"*** Saved model to S3: {s3_path}")
+
+ return {}
 
  except Exception as e:
  logger.warning("Could not complete finetuning")

diff --git a/gamechangerml/src/search/sent_transformer/model.py b/gamechangerml/src/search/sent_transformer/model.py
@@ -17,6 +17,7 @@
 from gamechangerml.src.model_testing.validation_data import MSMarcoData
 
 
+
 class SentenceEncoder(object):
  """
  Handles text encoding and creating of ANNOY index
@@ -92,6 +93,7 @@ def _index(self, corpus, index_path, overwrite=False, save_embedding=False):
  dataframe_path = os.path.join(index_path, "data.csv")
  ids_path = os.path.join(index_path, "doc_ids.txt")
 
+ '''
  # Load new data
  if os.path.isfile(embedding_path) and (overwrite is False):
  logger.info(f"Loading new data from {embedding_path}")
@@ -113,6 +115,7 @@ def _index(self, corpus, index_path, overwrite=False, save_embedding=False):
  # Append new dataframe
  old_df = pd.read_csv(dataframe_path)
  df = pd.concat([old_df, df])
+ '''
 
  # Store embeddings and document index
  # for future reference
@@ -161,10 +164,11 @@ def index_documents(self, corpus_path, index_path):
  verbose=self.verbose,
  )
  corpus = [(para_id, " ".join(tokens), None)
-  for tokens, para_id in corp]
+ for tokens, para_id in corp]
  logger.info(
  f"\nLength of batch (in par ids) for indexing : {str(len(corpus))}"
  )
+
  else:
  logger.info(
  "Did not include path to corpus, making test index with msmarco data"
@@ -178,7 +182,7 @@ def index_documents(self, corpus_path, index_path):
 
  self._index(corpus, index_path)
  processmanager.update_status(
- processmanager.training, 1, 1, "finished building sent index"
+ processmanager.training, 1, 0, "finished building sent index"
  )
 
  self.embedder.save(index_path)

diff --git a/gamechangerml/src/utilities/test_utils.py b/gamechangerml/src/utilities/test_utils.py
@@ -324,6 +324,7 @@ def concat_csvs(directory):
  df = pd.DataFrame()
  logger.info(str(directory))
  csvs = [i for i in os.listdir(directory) if i.split('.')[-1]=='csv']
+ csvs = [i for i in csvs if i[:2] != '._']
  logger.info(f"Combining csvs: {str(csvs)}")
  for i in csvs:
  try:
@@ -347,3 +348,4 @@ def get_most_recent_dir(parent_dir):
  return max(subdirs, key=os.path.getctime)
  else:
  logger.error("There are no subdirectories to retrieve most recent data from")
+ return None