From 4b24b998046a2b384e2b5b2aef391d6641765b62 Mon Sep 17 00:00:00 2001 From: 604840 Date: Thu, 17 Feb 2022 21:44:18 -0500 Subject: [PATCH 01/12] adding auto tokenizer based on encoder model --- .../src/search/sent_transformer/model.py | 1 + gamechangerml/src/text_handling/corpus.py | 20 +++++++++++++++---- gamechangerml/src/text_handling/process.py | 14 ++++++++++--- 3 files changed, 28 insertions(+), 7 deletions(-) diff --git a/gamechangerml/src/search/sent_transformer/model.py b/gamechangerml/src/search/sent_transformer/model.py index f384624e..ab43fbe3 100644 --- a/gamechangerml/src/search/sent_transformer/model.py +++ b/gamechangerml/src/search/sent_transformer/model.py @@ -159,6 +159,7 @@ def index_documents(self, corpus_path, index_path): return_id=self.return_id, min_token_len=self.min_token_len, verbose=self.verbose, + bert_based_tokenizer=self.encoder_model, ) corpus = [(para_id, " ".join(tokens), None) for tokens, para_id in corp] diff --git a/gamechangerml/src/text_handling/corpus.py b/gamechangerml/src/text_handling/corpus.py index c0721533..64c8ed58 100644 --- a/gamechangerml/src/text_handling/corpus.py +++ b/gamechangerml/src/text_handling/corpus.py @@ -3,14 +3,19 @@ # import pandas as pd from gensim.models.doc2vec import TaggedDocument -from gamechangerml.src.text_handling.process import preprocess +from gamechangerml.src.text_handling.process import preprocess, get_tokenizer from gamechangerml.api.utils import processmanager from tqdm import tqdm class LocalCorpus(object): def __init__( - self, directory, return_id=False, min_token_len=3, verbose=False + self, + directory, + return_id=False, + min_token_len=3, + verbose=False, + bert_based_tokenizer=None, ): self.directory = directory self.file_list = [ @@ -22,6 +27,9 @@ def __init__( self.return_id = return_id self.min_token_len = min_token_len self.verbose = verbose + self.bert_based_tokenizer = bert_based_tokenizer + if self.bert_based_tokenizer: + self.auto_token = get_tokenizer(self.bert_based_tokenizer) def __iter__(self): if self.verbose: @@ -39,7 +47,10 @@ def __iter__(self): paragraphs = [p["par_raw_text_t"] for p in doc["paragraphs"]] paragraph_ids = [p["id"] for p in doc["paragraphs"]] for para_text, para_id in zip(paragraphs, paragraph_ids): - tokens = preprocess(para_text, min_len=1) + if self.bert_based_tokens: + tokens = self.auto_token.tokenize(para_text) + else: + tokens = preprocess(para_text, min_len=1) if len(tokens) > self.min_token_len: if self.return_id: yield tokens, para_id @@ -47,7 +58,8 @@ def __iter__(self): yield tokens progress += 1 processmanager.update_status( - processmanager.loading_corpus, progress, total) + processmanager.loading_corpus, progress, total + ) except Exception as e: print(e) print(f"Error with {file_name} in creating local corpus") diff --git a/gamechangerml/src/text_handling/process.py b/gamechangerml/src/text_handling/process.py index 602a4a1c..a51f2c8f 100644 --- a/gamechangerml/src/text_handling/process.py +++ b/gamechangerml/src/text_handling/process.py @@ -1,7 +1,16 @@ from gensim.parsing.preprocessing import STOPWORDS from gensim.utils import simple_preprocess +from gamechangerml import MODEL_PATH +from transformers import BertTokenizer, AutoTokenizer +import os -from transformers import BertTokenizer + +def get_tokenizer( + model_name: str = os.path.join( + MODEL_PATH, "transformers/msmarco-distilbert-base-v2" + ) +): + return AutoTokenizer.from_pretrained(model_name) def preprocess( @@ -54,8 +63,7 @@ def __init__(self, vocab_file=None): if vocab_file is None: 
vocab_file = "./assets/bert_vocab.txt" self.tokenizer = BertTokenizer( - vocab_file=vocab_file, do_lower_case=True - ) + vocab_file=vocab_file, do_lower_case=True) def tokenize(self, text): tokens = self.tokenizer.tokenize(text) From 9567d5210503cfa4e5b0e08e1009647c511c4bbe Mon Sep 17 00:00:00 2001 From: 604840 Date: Thu, 17 Feb 2022 22:12:35 -0500 Subject: [PATCH 02/12] fixing spelling --- gamechangerml/src/text_handling/corpus.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gamechangerml/src/text_handling/corpus.py b/gamechangerml/src/text_handling/corpus.py index 64c8ed58..e90348b8 100644 --- a/gamechangerml/src/text_handling/corpus.py +++ b/gamechangerml/src/text_handling/corpus.py @@ -47,7 +47,7 @@ def __iter__(self): paragraphs = [p["par_raw_text_t"] for p in doc["paragraphs"]] paragraph_ids = [p["id"] for p in doc["paragraphs"]] for para_text, para_id in zip(paragraphs, paragraph_ids): - if self.bert_based_tokens: + if self.bert_based_tokenizer: tokens = self.auto_token.tokenize(para_text) else: tokens = preprocess(para_text, min_len=1) From 293c1bd3d987ed03b4fed51d10b72e3c2236a521 Mon Sep 17 00:00:00 2001 From: 604840 Date: Fri, 18 Feb 2022 10:38:23 -0500 Subject: [PATCH 03/12] adding corpus directory but ignore everything inside --- .gitignore | 2 +- gamechangerml/api/.dockerignore | 1 + gamechangerml/corpus/.gitignore | 4 ++++ 3 files changed, 6 insertions(+), 1 deletion(-) create mode 100644 gamechangerml/corpus/.gitignore diff --git a/.gitignore b/.gitignore index 7981c3b4..3a41713a 100644 --- a/.gitignore +++ b/.gitignore @@ -234,7 +234,7 @@ mlruns/ # Transformer models models/transformers # Corpus files -corpus +corpus/* # Don't ignore .gitkeep files !**/.gitkeep diff --git a/gamechangerml/api/.dockerignore b/gamechangerml/api/.dockerignore index ab3fb197..9a1f3303 100644 --- a/gamechangerml/api/.dockerignore +++ b/gamechangerml/api/.dockerignore @@ -6,6 +6,7 @@ ../models ../transformer_cache.zip ../src/DrQA/data/ +../corpus dataPipelines out common diff --git a/gamechangerml/corpus/.gitignore b/gamechangerml/corpus/.gitignore new file mode 100644 index 00000000..5e7d2734 --- /dev/null +++ b/gamechangerml/corpus/.gitignore @@ -0,0 +1,4 @@ +# Ignore everything in this directory +* +# Except this file +!.gitignore From eebd7eb56e011ec07e0efb6629b30dcbc71a801c Mon Sep 17 00:00:00 2001 From: 604840 Date: Fri, 18 Feb 2022 10:57:50 -0500 Subject: [PATCH 04/12] removing corpus git ignore --- gamechangerml/corpus/.gitignore | 4 ---- 1 file changed, 4 deletions(-) delete mode 100644 gamechangerml/corpus/.gitignore diff --git a/gamechangerml/corpus/.gitignore b/gamechangerml/corpus/.gitignore deleted file mode 100644 index 5e7d2734..00000000 --- a/gamechangerml/corpus/.gitignore +++ /dev/null @@ -1,4 +0,0 @@ -# Ignore everything in this directory -* -# Except this file -!.gitignore From 9a7fcfe5ce50f12005f9fc83c01b0b08c56f4fe8 Mon Sep 17 00:00:00 2001 From: 604840 Date: Fri, 18 Feb 2022 10:59:50 -0500 Subject: [PATCH 05/12] moving a global variable into def --- gamechangerml/scripts/make_training_data.py | 355 +++++++++++++------- 1 file changed, 227 insertions(+), 128 deletions(-) diff --git a/gamechangerml/scripts/make_training_data.py b/gamechangerml/scripts/make_training_data.py index d96d8635..1a4bbb0c 100644 --- a/gamechangerml/scripts/make_training_data.py +++ b/gamechangerml/scripts/make_training_data.py @@ -8,45 +8,52 @@ import spacy -from gamechangerml.configs.config import TrainingConfig, ValidationConfig, SimilarityConfig +from 
gamechangerml.configs.config import ( + TrainingConfig, + ValidationConfig, + SimilarityConfig, +) from gamechangerml.src.search.sent_transformer.model import SentenceSearcher from gamechangerml.src.utilities.text_utils import normalize_query from gamechangerml.src.utilities.test_utils import * from gamechangerml.api.utils.logger import logger from gamechangerml.api.utils.pathselect import get_model_paths from gamechangerml.scripts.update_eval_data import make_tiered_eval_data -from gamechangerml import DATA_PATH +from gamechangerml import DATA_PATH, CORPUS_PATH model_path_dict = get_model_paths() random.seed(42) LOCAL_TRANSFORMERS_DIR = model_path_dict["transformers"] SIM_MODEL = SimilarityConfig.BASE_MODEL -training_dir= os.path.join(DATA_PATH, "training", "sent_transformer") -tts_ratio=TrainingConfig.DATA_ARGS["train_test_split_ratio"] +training_dir = os.path.join(DATA_PATH, "training", "sent_transformer") +tts_ratio = TrainingConfig.DATA_ARGS["train_test_split_ratio"] gold_standard_path = os.path.join( - "gamechangerml/data/user_data", ValidationConfig.DATA_ARGS["retriever_gc"]["gold_standard"] - ) + "gamechangerml/data/user_data", + ValidationConfig.DATA_ARGS["retriever_gc"]["gold_standard"], +) -CORPUS_DIR = "gamechangerml/corpus" -corpus_docs = [i.split('.json')[0] for i in os.listdir(CORPUS_DIR) if os.path.isfile(os.path.join(CORPUS_DIR, i))] def get_sample_paragraphs(pars, par_limit=100, min_length=150): - '''Collect sample paragraphs longer than min_length (char), up to par_limit paragraphs''' - + """Collect sample paragraphs longer than min_length (char), up to par_limit paragraphs""" + count = 0 collected_pars = [] for i in pars: if count < par_limit: - if len(i['par_raw_text_t']) >= min_length: + if len(i["par_raw_text_t"]) >= min_length: count += 1 - collected_pars.append({"text": i['par_raw_text_t'], "id": i['id']}) + collected_pars.append( + {"text": i["par_raw_text_t"], "id": i["id"]}) else: break - + return collected_pars -def get_best_paragraphs(data: pd.DataFrame, query: str, doc_id: str, nlp, min_score: float=0.60) -> List[Dict[str,str]]: + +def get_best_paragraphs( + data: pd.DataFrame, query: str, doc_id: str, nlp, min_score: float = 0.60 +) -> List[Dict[str, str]]: """Retrieves the best paragraphs for expected doc using similarity model Args: data [pd.DataFrame]: data df with processed text at paragraph_id level for sent_index @@ -57,17 +64,24 @@ def get_best_paragraphs(data: pd.DataFrame, query: str, doc_id: str, nlp, min_sc Returns: [List[Dict[str,str]]]: List of dictionaries of paragraph matches """ - logger.info(f"Retrieving matches for query: {query}, expected doc: {doc_id}") + logger.info( + f"Retrieving matches for query: {query}, expected doc: {doc_id}") + + corpus_docs = [ + i.split(".json")[0] + for i in os.listdir(CORPUS_PATH) + if os.path.isfile(os.path.join(CORPUS_PATH, i)) + ] pars = [] doc1 = nlp(query) if doc_id not in corpus_docs: logger.warning(f"---Did not find {doc_id} in the corpus") - json = open_json(doc_id + '.json', CORPUS_DIR) - paragraphs = json['paragraphs'] - sents = get_sample_paragraphs(paragraphs)[:50] # get top 50 paragraphs + json = open_json(doc_id + ".json", CORPUS_PATH) + paragraphs = json["paragraphs"] + sents = get_sample_paragraphs(paragraphs)[:50] # get top 50 paragraphs for sent in sents: - short = ' '.join(sent['text'].split(' ')[:400]) # shorten paragraphs + short = " ".join(sent["text"].split(" ")[:400]) # shorten paragraphs pars.append(short) ranked = [] @@ -75,34 +89,45 @@ def get_best_paragraphs(data: pd.DataFrame, 
query: str, doc_id: str, nlp, min_sc if len(sents) == 0: logger.info("---No paragraphs retrieved for this expected doc") elif len(sents) == 1: - ranked = [{"score": 'na', "id": sents[0]['id'], "text": sents[0]['text']}] + ranked = [{"score": "na", "id": sents[0] + ["id"], "text": sents[0]["text"]}] else: comparisons = [] for sent in sents: - doc2 = nlp(sent['text']) + doc2 = nlp(sent["text"]) sim = doc1.similarity(doc2) if sim >= min_score: - record = {"score": sim, "id": sent['id'], "text": sent['text']} + record = {"score": sim, + "id": sent["id"], "text": sent["text"]} comparisons.append(record) else: pass - ranked = sorted(comparisons, key = lambda z: z['score'], reverse=True) - logger.info(f"*** Collected {str(len(ranked))} / {str(len(sents))} paragraphs (passing sim threshold) retrieved for {doc_id}") + ranked = sorted( + comparisons, key=lambda z: z["score"], reverse=True) + logger.info( + f"*** Collected {str(len(ranked))} / {str(len(sents))} paragraphs (passing sim threshold) retrieved for {doc_id}" + ) except Exception as e: logger.info(f"---Could not re-rank the paragraphs for {query}") - logger.warning(e) - + logger.warning(e) + return ranked + def check_no_match(expected_id: str, par_id: str) -> bool: """Checks if paragraph ID matches the expected doc ID""" - if par_id.split('.pdf')[0].upper().strip().lstrip() == expected_id.upper().strip().lstrip(): + if ( + par_id.split(".pdf")[0].upper().strip().lstrip() + == expected_id.upper().strip().lstrip() + ): return False else: return True + def get_negative_paragraphs( - data: pd.DataFrame, query: str, doc_id: str, retriever, n_returns: int) -> List[Dict[str,str]]: + data: pd.DataFrame, query: str, doc_id: str, retriever, n_returns: int +) -> List[Dict[str, str]]: """Looks up negative (not matching) paragraphs for each query Args: data [pd.DataFrame]: data df with processed text at paragraph_id level for sent_index @@ -118,19 +143,27 @@ def get_negative_paragraphs( checked_results = [] try: results = retriever.retrieve_topn(query, n_returns) - logger.info(f"Retrieved {str(len(results))} negative samples for query: {query} / doc: {doc_id}") + logger.info( + f"Retrieved {str(len(results))} negative samples for query: {query} / doc: {doc_id}" + ) for result in results: - par = data[data["paragraph_id"]==result['id']].iloc[0]["text"] - par = ' '.join(par.split(' ')[:400]) - if check_no_match(doc_id, result['id']): - checked_results.append({"query": query, "doc": result['id'], "paragraph": par, "label": 0}) + par = data[data["paragraph_id"] == result["id"]].iloc[0]["text"] + par = " ".join(par.split(" ")[:400]) + if check_no_match(doc_id, result["id"]): + checked_results.append( + {"query": query, + "doc": result["id"], "paragraph": par, "label": 0} + ) except Exception as e: logger.warning("Could not get negative paragraphs") logger.warning(e) - + return checked_results -def add_gold_standard(intel: Dict[str,str], gold_standard_path: Union[str, os.PathLike]) -> Dict[str,str]: + +def add_gold_standard( + intel: Dict[str, str], gold_standard_path: Union[str, os.PathLike] +) -> Dict[str, str]: """Adds original gold standard data to the intel training data. 
Args: intel [Dict[str,str]: intelligent search evaluation data @@ -138,61 +171,62 @@ def add_gold_standard(intel: Dict[str,str], gold_standard_path: Union[str, os.Pa Returns: intel [Dict[str,str]: intelligent search evaluation data with manual entries added """ - gold = pd.read_csv(gold_standard_path, names=['query', 'document']) - gold['query_clean'] = gold['query'].apply(lambda x: normalize_query(x)) - gold['docs_split'] = gold['document'].apply(lambda x: x.split(';')) - all_docs = list(set([a for b in gold['docs_split'].tolist() for a in b])) + gold = pd.read_csv(gold_standard_path, names=["query", "document"]) + gold["query_clean"] = gold["query"].apply(lambda x: normalize_query(x)) + gold["docs_split"] = gold["document"].apply(lambda x: x.split(";")) + all_docs = list(set([a for b in gold["docs_split"].tolist() for a in b])) - def add_key(mydict: Dict[str,str]) -> str: + def add_key(mydict: Dict[str, str]) -> str: """Adds new key to queries/collections dictionaries""" last_key = sorted([*mydict.keys()])[-1] key_len = len(last_key) - 1 last_prefix = last_key[0] last_num = int(last_key[1:]) new_num = str(last_num + 1) - - return last_prefix + str(str(0)*(key_len - len(new_num)) + new_num) + + return last_prefix + str(str(0) * (key_len - len(new_num)) + new_num) # check if queries already in dict, if not add - for i in gold['query_clean']: - if i in intel['queries'].values(): + for i in gold["query_clean"]: + if i in intel["queries"].values(): logger.info(f"'{i}' already in intel queries") continue else: logger.info(f"adding '{i}' to intel queries") - new_key = add_key(intel['queries']) - intel['queries'][new_key] = i - + new_key = add_key(intel["queries"]) + intel["queries"][new_key] = i + # check if docs already in dict, if not add for i in all_docs: - if i in intel['collection'].values(): + if i in intel["collection"].values(): logger.info(f"'{i}' already in intel collection") continue else: logger.info(f"adding '{i}' to intel collection") - new_key = add_key(intel['collection']) - intel['collection'][new_key] = i + new_key = add_key(intel["collection"]) + intel["collection"][new_key] = i # check if rels already in intel, if not add - reverse_q = {v:k for k,v in intel['queries'].items()} - reverse_d = {v:k for k,v in intel['collection'].items()} + reverse_q = {v: k for k, v in intel["queries"].items()} + reverse_d = {v: k for k, v in intel["collection"].items()} for i in gold.index: - q = gold.loc[i, 'query_clean'] - docs = gold.loc[i, 'docs_split'] + q = gold.loc[i, "query_clean"] + docs = gold.loc[i, "docs_split"] for j in docs: q_id = reverse_q[q] d_id = reverse_d[j] - if q_id in intel['correct']: # if query in rels, add new docs - if d_id in intel['correct'][q_id]: + if q_id in intel["correct"]: # if query in rels, add new docs + if d_id in intel["correct"][q_id]: continue else: - intel['correct'][q_id] += [d_id] + intel["correct"][q_id] += [d_id] else: - intel['correct'][q_id] = [d_id] - + intel["correct"][q_id] = [d_id] + return intel -def train_test_split(data: Dict[str,str], tts_ratio: float) -> Tuple[Dict[str, str]]: + +def train_test_split(data: Dict[str, str], tts_ratio: float) -> Tuple[Dict[str, str]]: """Splits a dictionary into train/test set based on split ratio""" train_size = round(len(data) * tts_ratio) @@ -204,14 +238,15 @@ def train_test_split(data: Dict[str,str], tts_ratio: float) -> Tuple[Dict[str, s return train, test + def collect_matches( - data: pd.DataFrame, + data: pd.DataFrame, nlp, relations: Dict[str, str], queries: Dict[str, str], collection: Dict[str, 
str], label: int, - ) -> Tuple[Dict[str, str]]: +) -> Tuple[Dict[str, str]]: """Gets matching paragraphs for each query/docid pair Args: data [pd.DataFrame]: data df with processed text at paragraph_id level for sent_index @@ -233,27 +268,36 @@ def collect_matches( query = queries[i] for k in relations[i]: doc = collection[k] - uid = str(i) + '_' + str(k) # backup UID, overwritten if there are results + # backup UID, overwritten if there are results + uid = str(i) + "_" + str(k) try: matching = get_best_paragraphs(data, query, doc, nlp) for match in matching: - uid = str(i) + '_' + str(match['id']) - text = ' '.join(match['text'].split(' ')[:400]) # truncate to 400 tokens - found[uid] = {"query": query, "doc": doc, "paragraph": text, "label": label} + uid = str(i) + "_" + str(match["id"]) + text = " ".join( + match["text"].split(" ")[:400] + ) # truncate to 400 tokens + found[uid] = { + "query": query, + "doc": doc, + "paragraph": text, + "label": label, + } except Exception as e: logger.warning("Could not get positive matches") logger.warning(e) not_found[uid] = {"query": query, "doc": doc, "label": label} return found, not_found + def collect_negative_samples( - data: pd.DataFrame, - retriever, + data: pd.DataFrame, + retriever, n_returns: int, relations: Dict[str, str], queries: Dict[str, str], collection: Dict[str, str], - ) -> Tuple[Dict[str, str]]: +) -> Tuple[Dict[str, str]]: """Gets negative samples each query/docid pair Args: data [pd.DataFrame]: data df with processed text at paragraph_id level for sent_index @@ -272,31 +316,48 @@ def collect_negative_samples( query = queries[i] for k in relations[i]: doc = collection[k] - uid = str(i) + '_' + str(k) + '_neg' # backup UID, overwritten if there are results + uid = ( + str(i) + "_" + str(k) + "_neg" + ) # backup UID, overwritten if there are results try: - not_matching = get_negative_paragraphs(data=data, query=query, doc_id=k, retriever=retriever, n_returns=n_returns) + not_matching = get_negative_paragraphs( + data=data, + query=query, + doc_id=k, + retriever=retriever, + n_returns=n_returns, + ) for match in not_matching: - uid = str(i) + '_' + str(match['doc']) - text = ' '.join(match['paragraph'].split(' ')[:400]) # truncate to 400 tokens - found[uid] = {"query": query, "doc": doc, "paragraph": text, "label": 0} + uid = str(i) + "_" + str(match["doc"]) + text = " ".join( + match["paragraph"].split(" ")[:400] + ) # truncate to 400 tokens + found[uid] = { + "query": query, + "doc": doc, + "paragraph": text, + "label": 0, + } except Exception as e: logger.warning(e) not_found[uid] = {"query": query, "doc": doc, "label": 0} - + return found, not_found + def make_training_data( index_path: Union[str, os.PathLike], n_returns: int, n_matching: int, - level: str, - update_eval_data: bool, + level: str, + update_eval_data: bool, retriever=None, - sim_model_name: str=SIM_MODEL, - transformers_dir: Union[str,os.PathLike]=LOCAL_TRANSFORMERS_DIR, - gold_standard_path: Union[str,os.PathLike]=gold_standard_path, - tts_ratio: float=tts_ratio, - training_dir: Union[str,os.PathLike]=training_dir) -> Tuple[Dict[str,str]]: + sim_model_name: str = SIM_MODEL, + transformers_dir: Union[str, os.PathLike] = LOCAL_TRANSFORMERS_DIR, + gold_standard_path: Union[str, os.PathLike] = gold_standard_path, + tts_ratio: float = tts_ratio, + training_dir: Union[str, os.PathLike] = training_dir, +) -> Tuple[Dict[str, str]]: """Makes training data based on new user search history data Args: index_path [str|os.PathLike]: path to the sent index for retrieving the 
training data (should be most recent index) @@ -311,26 +372,34 @@ def make_training_data( training_dir [Union[str,os.PathLike]]: directory for saving training data Returns: [Tuple[Dict[str,str]]]: training data and training metadata dictionaries - """ - ## open json files - if not os.path.exists(os.path.join(DATA_PATH, "validation", "domain", "sent_transformer")) or update_eval_data: + """ + # open json files + if ( + not os.path.exists( + os.path.join(DATA_PATH, "validation", "domain", "sent_transformer") + ) + or update_eval_data + ): logger.info("**** Updating the evaluation data") make_tiered_eval_data(index_path) - validation_dir = get_most_recent_dir(os.path.join(DATA_PATH, "validation", "domain", "sent_transformer")) + validation_dir = get_most_recent_dir( + os.path.join(DATA_PATH, "validation", "domain", "sent_transformer") + ) directory = os.path.join(validation_dir, level) - logger.info(f"**** Loading in intelligent search data from {str(directory)}") + logger.info( + f"**** Loading in intelligent search data from {str(directory)}") try: - f = open_json('intelligent_search_data.json', directory) + f = open_json("intelligent_search_data.json", directory) intel = json.loads(f) except Exception as e: logger.warning("Could not load intelligent search data") logger.warning(e) - ## add gold standard samples + # add gold standard samples logger.info("**** Adding gold standard examples") intel = add_gold_standard(intel, gold_standard_path) - ## set up save dir + # set up save dir save_dir = make_timestamp_directory(training_dir) try: @@ -341,76 +410,101 @@ def make_training_data( if not retriever: logger.info("Did not init SentenceSearcher, loading now") retriever = SentenceSearcher( - sim_model_name=sim_model_name, - index_path=index_path, - transformer_path=transformers_dir - ) - ## read in sent_index data + sim_model_name=sim_model_name, + index_path=index_path, + transformer_path=transformers_dir, + ) + # read in sent_index data logger.info("**** Loading in sent index data from retriever") try: data = retriever.data - data['doc_id'] = data['paragraph_id'].apply(lambda x: x.split('.pdf')[0]) + data["doc_id"] = data["paragraph_id"].apply( + lambda x: x.split(".pdf")[0]) except Exception as e: logger.info("Could not load in data from retriever") logger.warning(e) - ## get matching paragraphs + # get matching paragraphs try: correct_found, correct_notfound = collect_matches( - data=data, queries=intel['queries'], collection=intel['collection'], - relations=intel['correct'], label=1, nlp = nlp) - logger.info(f"---Number of correct query/result pairs that were not found: {str(len(correct_notfound))}") + data=data, + queries=intel["queries"], + collection=intel["collection"], + relations=intel["correct"], + label=1, + nlp=nlp, + ) + logger.info( + f"---Number of correct query/result pairs that were not found: {str(len(correct_notfound))}" + ) except Exception as e: logger.warning(e) logger.warning("\nCould not retrieve positive matches\n") try: incorrect_found, incorrect_notfound = collect_matches( - data=data, queries=intel['queries'], collection=intel['collection'], - relations=intel['incorrect'], label=-1, nlp = nlp) - logger.info(f"---Number of incorrect query/result pairs that were not found: {str(len(incorrect_notfound))}") + data=data, + queries=intel["queries"], + collection=intel["collection"], + relations=intel["incorrect"], + label=-1, + nlp=nlp, + ) + logger.info( + f"---Number of incorrect query/result pairs that were not found: {str(len(incorrect_notfound))}" + ) except 
Exception as e: logger.warning(e) logger.warning("\nCould not retrieve negative matches\n") - ## get negative samples + # get negative samples try: - all_relations = {**intel['correct'], **intel['incorrect']} + all_relations = {**intel["correct"], **intel["incorrect"]} neutral_found, neutral_notfound = collect_negative_samples( - data=data, retriever=retriever, n_returns=n_returns, queries=intel['queries'], collection=intel['collection'], - relations=all_relations) - logger.info(f"---Number of negative sample pairs that were not found: {str(len(neutral_notfound))}") + data=data, + retriever=retriever, + n_returns=n_returns, + queries=intel["queries"], + collection=intel["collection"], + relations=all_relations, + ) + logger.info( + f"---Number of negative sample pairs that were not found: {str(len(neutral_notfound))}" + ) except Exception as e: logger.warning(e) logger.warning("\nCould not retrieve negative samples\n") - ## save a json of the query-doc pairs that did not retrieve an ES paragraph for training data + # save a json of the query-doc pairs that did not retrieve an ES paragraph for training data notfound = {**correct_notfound, **incorrect_notfound, **neutral_notfound} - logger.info(f"---Number of total query/result pairs that were not found: {str(len(notfound))}") - notfound_path = os.path.join(save_dir, 'not_found_search_pairs.json') + logger.info( + f"---Number of total query/result pairs that were not found: {str(len(notfound))}" + ) + notfound_path = os.path.join(save_dir, "not_found_search_pairs.json") with open(notfound_path, "w") as outfile: json.dump(notfound, outfile) - ## train/test split (separate on correct/incorrect for balance) + # train/test split (separate on correct/incorrect for balance) correct_train, correct_test = train_test_split(correct_found, tts_ratio) - incorrect_train, incorrect_test = train_test_split(incorrect_found, tts_ratio) + incorrect_train, incorrect_test = train_test_split( + incorrect_found, tts_ratio) neutral_train, neutral_test = train_test_split(neutral_found, tts_ratio) train = {**correct_train, **incorrect_train, **neutral_train} test = {**correct_test, **incorrect_test, **neutral_test} - try:## check labels - pos = len([i for i in train if train[i]['label'] == 1]) + try: # check labels + pos = len([i for i in train if train[i]["label"] == 1]) logger.info(f"*** {str(pos)} positive samples in TRAIN") - neutral = len([i for i in train if train[i]['label'] == 0]) + neutral = len([i for i in train if train[i]["label"] == 0]) logger.info(f"*** {str(neutral)} neutral samples in TRAIN") - neg = len([i for i in train if train[i]['label'] == -1]) + neg = len([i for i in train if train[i]["label"] == -1]) logger.info(f"*** {str(neg)} negative samples in TRAIN") - ## check labels - pos_test = len([i for i in test if test[i]['label'] == 1]) + # check labels + pos_test = len([i for i in test if test[i]["label"] == 1]) logger.info(f"*** {str(pos_test)} positive samples in TEST") - neutral_test = len([i for i in test if test[i]['label'] == 0]) + neutral_test = len([i for i in test if test[i]["label"] == 0]) logger.info(f"*** {str(neutral_test)} neutral samples in TEST") - neg_test = len([i for i in test if test[i]['label'] == -1]) + neg_test = len([i for i in test if test[i]["label"] == -1]) logger.info(f"*** {str(neg_test)} negative samples in TEST") except Exception as e: logger.warning("Could not check stats for train/test") @@ -424,14 +518,14 @@ def make_training_data( "n_negative_samples": f"{str(neg)} train / {str(neg_test)} test", "train_size": 
len(train), "test_size": len(test), - "split_ratio": tts_ratio + "split_ratio": tts_ratio, } logger.info(f"**** Generated training data: \n {metadata}") - ## save data and metadata files - data_path = os.path.join(save_dir, 'training_data.json') - metadata_path = os.path.join(save_dir, 'training_metadata.json') + # save data and metadata files + data_path = os.path.join(save_dir, "training_data.json") + metadata_path = os.path.join(save_dir, "training_metadata.json") with open(data_path, "w") as outfile: json.dump(data, outfile) @@ -439,8 +533,13 @@ def make_training_data( with open(metadata_path, "w") as outfile: json.dump(metadata, outfile) -if __name__ == '__main__': + +if __name__ == "__main__": make_training_data( - index_path="gamechangerml/models/sent_index_20210715", n_returns=50, n_matching=3, level="silver", - update_eval_data=False) \ No newline at end of file + index_path="gamechangerml/models/sent_index_20210715", + n_returns=50, + n_matching=3, + level="silver", + update_eval_data=False, + ) From d4d9d74c9d6af7bf0d49709bc94e6db359dccbb9 Mon Sep 17 00:00:00 2001 From: 604840 Date: Wed, 2 Mar 2022 13:31:57 -0500 Subject: [PATCH 06/12] updating min length to be higher and updating tokens --- gamechangerml/configs/config.py | 4 ++-- gamechangerml/src/text_handling/corpus.py | 20 +++++++++++++++----- 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/gamechangerml/configs/config.py b/gamechangerml/configs/config.py index 001ec98e..e36aea23 100644 --- a/gamechangerml/configs/config.py +++ b/gamechangerml/configs/config.py @@ -74,7 +74,7 @@ class QAConfig: class EmbedderConfig: BASE_MODEL = "msmarco-distilbert-base-v2" MODEL_ARGS = { - "min_token_len": 10, + "min_token_len": 25, "verbose": True, # for creating LocalCorpus "return_id": True, # for creating LocalCorpus } @@ -104,7 +104,7 @@ class QexpConfig: "num_keywords": 2, "ngram": (1, 3), "abbrv_file": None, - "merge_word_sim": True + "merge_word_sim": True, }, } diff --git a/gamechangerml/src/text_handling/corpus.py b/gamechangerml/src/text_handling/corpus.py index e90348b8..1c2b1b7f 100644 --- a/gamechangerml/src/text_handling/corpus.py +++ b/gamechangerml/src/text_handling/corpus.py @@ -49,13 +49,23 @@ def __iter__(self): for para_text, para_id in zip(paragraphs, paragraph_ids): if self.bert_based_tokenizer: tokens = self.auto_token.tokenize(para_text) + print(tokens) + process_tokens = preprocess(para_text, min_len=1) + # half of the tokens are actual words + if tokens: + if (len(process_tokens) / len(tokens)) > 0.5: + if len(tokens) > self.min_token_len: + if self.return_id: + yield tokens, para_id + else: + yield tokens else: tokens = preprocess(para_text, min_len=1) - if len(tokens) > self.min_token_len: - if self.return_id: - yield tokens, para_id - else: - yield tokens + if len(tokens) > self.min_token_len: + if self.return_id: + yield tokens, para_id + else: + yield tokens progress += 1 processmanager.update_status( processmanager.loading_corpus, progress, total From 047858f6064407ca2b03cf6befe20ed20251f1b9 Mon Sep 17 00:00:00 2001 From: 604840 Date: Wed, 2 Mar 2022 13:37:15 -0500 Subject: [PATCH 07/12] updating to not use bert tokenizer --- gamechangerml/src/search/sent_transformer/model.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/gamechangerml/src/search/sent_transformer/model.py b/gamechangerml/src/search/sent_transformer/model.py index ab43fbe3..ebbcdd80 100644 --- a/gamechangerml/src/search/sent_transformer/model.py +++ b/gamechangerml/src/search/sent_transformer/model.py 
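For reference, a minimal sketch (not part of the patch itself) of how the LocalCorpus tokenization path behaves once the bert_tokenize flag below is wired through: with bert_based_tokenizer left as None the corpus falls back to the gensim preprocess() path, while passing an encoder model path switches it to the Hugging Face AutoTokenizer. The corpus directory shown is only an illustrative assumption; the keyword arguments mirror EmbedderConfig.MODEL_ARGS and the corpus.py signature from earlier in this series.

from gamechangerml.src.text_handling.corpus import LocalCorpus

# Sketch only: directory is an assumed local corpus path.
corp = LocalCorpus(
    directory="gamechangerml/corpus",
    return_id=True,
    min_token_len=25,           # EmbedderConfig value after this series
    verbose=True,
    bert_based_tokenizer=None,  # None -> gensim preprocess(); model path -> AutoTokenizer
)
# Same (id, text, None) tuples that index_documents() builds before embedding.
corpus = [(para_id, " ".join(tokens), None) for tokens, para_id in corp]
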
@@ -37,6 +37,7 @@ def __init__( transformer_path, model=None, use_gpu=False, + bert_tokenize=False, ): if model: @@ -44,6 +45,9 @@ def __init__( else: self.encoder_model = os.path.join( transformer_path, encoder_model_name) + self.bert_tokenizer = None + if bert_tokenize: + self.bert_tokenizer = self.encoder_model self.min_token_len = min_token_len self.return_id = return_id self.verbose = verbose @@ -159,7 +163,7 @@ def index_documents(self, corpus_path, index_path): return_id=self.return_id, min_token_len=self.min_token_len, verbose=self.verbose, - bert_based_tokenizer=self.encoder_model, + bert_based_tokenizer=self.bert_tokenizer, ) corpus = [(para_id, " ".join(tokens), None) for tokens, para_id in corp] From dbdb8055f07357e52d3d0655b1d9ea3cea3a699e Mon Sep 17 00:00:00 2001 From: 604840 Date: Thu, 3 Mar 2022 13:56:42 -0500 Subject: [PATCH 08/12] removing text length score --- gamechangerml/src/search/sent_transformer/model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gamechangerml/src/search/sent_transformer/model.py b/gamechangerml/src/search/sent_transformer/model.py index ebbcdd80..efdfa13d 100644 --- a/gamechangerml/src/search/sent_transformer/model.py +++ b/gamechangerml/src/search/sent_transformer/model.py @@ -276,7 +276,7 @@ def search(self, query, num_results=10, externalSim=True): ) for idx, doc in enumerate(top_results): doc["text_length"] = length_scores[idx] - doc["score"] = doc["score"] + length_scores[idx] + doc["score"] = doc["score"] finalResults.append(doc) finalResults = sorted( finalResults, key=lambda i: i["score"], reverse=True) From 2b79d8302abefa85d838d23875009cf15bd780c9 Mon Sep 17 00:00:00 2001 From: 604840 Date: Fri, 4 Mar 2022 13:52:16 -0500 Subject: [PATCH 09/12] adding search generator and adding preprocessing to sentence --- gamechangerml/src/model_testing/queries.csv | 30 ++++++++++++++++ gamechangerml/src/model_testing/search_gen.py | 36 +++++++++++++++++++ .../src/search/sent_transformer/model.py | 4 ++- gamechangerml/src/text_handling/corpus.py | 1 - 4 files changed, 69 insertions(+), 2 deletions(-) create mode 100644 gamechangerml/src/model_testing/queries.csv create mode 100644 gamechangerml/src/model_testing/search_gen.py diff --git a/gamechangerml/src/model_testing/queries.csv b/gamechangerml/src/model_testing/queries.csv new file mode 100644 index 00000000..fbb465a4 --- /dev/null +++ b/gamechangerml/src/model_testing/queries.csv @@ -0,0 +1,30 @@ +queries,score,text +electronic warfare,, +covid-19,, +coronavirus,, +navy,, +cloud computing,, +cyber warfare,, +naval logistics,, +commerce and trade,, +artificial intelligence,, +intelligence,, +environment policy,, +arbitraton,, +information technology,, +network systems,, +government employee leave,, +common operating,, +budget guidance,, +telework,, +public affairs,, +DHA,, +DTIC,, +command and control,, +autonomous systems,, +climate change,, +1322.29,, +af910,, +jad2c,, +DoD 5240,, +keeler supreme court,, \ No newline at end of file diff --git a/gamechangerml/src/model_testing/search_gen.py b/gamechangerml/src/model_testing/search_gen.py new file mode 100644 index 00000000..736fdfc5 --- /dev/null +++ b/gamechangerml/src/model_testing/search_gen.py @@ -0,0 +1,36 @@ +import requests +import json +import pandas as pd +from tqdm import tqdm +import datetime + +url = "http://10.194.9.119:5000/getLoadedModels" + +headers = {} + +response = requests.request("GET", url, headers=headers) +sent_index = json.loads(response.content)["sentence_index"] +df = 
pd.read_csv("gamechangerml/src/model_testing/queries.csv") + +url = "http://10.194.9.119:5000/transSentenceSearch" + +headers = {"Content-Type": "application/json"} +queries = df.queries +model_resp = [] +for i in tqdm(queries): + payload = json.dumps({"text": i}) + response = requests.request("POST", url, headers=headers, data=payload) + resp = {} + cont = json.loads(response.content)[0] + resp["text"] = cont["text"][:300] + resp["score"] = cont["score"] + model_resp.append(resp) + +new_df = df.copy() +new_df["score"] = [x["score"] for x in model_resp] + +new_df["text"] = [x["text"] for x in model_resp] +new_df["model"] = sent_index +time = datetime.datetime.today().strftime("%Y%M%d%M") + +new_df.to_csv(f"query_results_{time}.csv") diff --git a/gamechangerml/src/search/sent_transformer/model.py b/gamechangerml/src/search/sent_transformer/model.py index efdfa13d..e0b73e61 100644 --- a/gamechangerml/src/search/sent_transformer/model.py +++ b/gamechangerml/src/search/sent_transformer/model.py @@ -13,6 +13,7 @@ from gamechangerml.api.utils import processmanager from gamechangerml.api.utils.logger import logger from gamechangerml.src.utilities.test_utils import * +from gamechangerml.src.text_handling.process import preprocess from gamechangerml.api.utils.pathselect import get_model_paths from gamechangerml.src.model_testing.validation_data import MSMarcoData @@ -263,7 +264,8 @@ def search(self, query, num_results=10, externalSim=True): rerank (list): List of tuples following a (score, paragraph_id, paragraph_text) format ranked based on similarity with query """ - top_results = self.retrieve_topn(query, num_results) + processed_q = " ".join(preprocess(query)) + top_results = self.retrieve_topn(processed_q, num_results) # choose to use an external similarity transformer if externalSim: return self.similarity.re_rank(query, top_results) diff --git a/gamechangerml/src/text_handling/corpus.py b/gamechangerml/src/text_handling/corpus.py index 1c2b1b7f..a08d7ed5 100644 --- a/gamechangerml/src/text_handling/corpus.py +++ b/gamechangerml/src/text_handling/corpus.py @@ -49,7 +49,6 @@ def __iter__(self): for para_text, para_id in zip(paragraphs, paragraph_ids): if self.bert_based_tokenizer: tokens = self.auto_token.tokenize(para_text) - print(tokens) process_tokens = preprocess(para_text, min_len=1) # half of the tokens are actual words if tokens: From 546cbc656181db057ad444e4350af4a15c4e82ef Mon Sep 17 00:00:00 2001 From: 604840 Date: Fri, 4 Mar 2022 13:53:47 -0500 Subject: [PATCH 10/12] adding flag --- gamechangerml/src/search/sent_transformer/model.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/gamechangerml/src/search/sent_transformer/model.py b/gamechangerml/src/search/sent_transformer/model.py index e0b73e61..d1da4d05 100644 --- a/gamechangerml/src/search/sent_transformer/model.py +++ b/gamechangerml/src/search/sent_transformer/model.py @@ -254,7 +254,7 @@ def retrieve_topn(self, query, num_results=10): results.append(doc) return results - def search(self, query, num_results=10, externalSim=True): + def search(self, query, num_results=10, process=False, externalSim=True): """ Search the index and perform a similarity scoring reranker at the topn returned documents @@ -264,8 +264,9 @@ def search(self, query, num_results=10, externalSim=True): rerank (list): List of tuples following a (score, paragraph_id, paragraph_text) format ranked based on similarity with query """ - processed_q = " ".join(preprocess(query)) - top_results = self.retrieve_topn(processed_q, 
num_results) + if process: + query = " ".join(preprocess(query)) + top_results = self.retrieve_topn(query, num_results) # choose to use an external similarity transformer if externalSim: return self.similarity.re_rank(query, top_results) From cd595cae90a390746feca6bdaf254a370dd0b8bc Mon Sep 17 00:00:00 2001 From: 604840 Date: Mon, 7 Mar 2022 09:49:16 -0500 Subject: [PATCH 11/12] adding a process flag --- gamechangerml/api/fastapi/routers/search.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/gamechangerml/api/fastapi/routers/search.py b/gamechangerml/api/fastapi/routers/search.py index 5dee7c2f..80ac857d 100644 --- a/gamechangerml/api/fastapi/routers/search.py +++ b/gamechangerml/api/fastapi/routers/search.py @@ -84,7 +84,11 @@ async def textExtract_infer(body: dict, extractType: str, response: Response) -> @router.post("/transSentenceSearch", status_code=200) async def trans_sentence_infer( - body: dict, response: Response, num_results: int = 10, externalSim: bool = False + body: dict, + response: Response, + num_results: int = 10, + process: bool = True, + externalSim: bool = False, ) -> dict: """trans_sentence_infer - endpoint for sentence transformer inference Args: @@ -99,7 +103,7 @@ async def trans_sentence_infer( try: query_text = body["text"] results = MODELS.sentence_searcher.search( - query_text, num_results, externalSim=False + query_text, num_results, process=process, externalSim=False ) logger.info(results) except Exception: @@ -160,7 +164,8 @@ async def post_expand_query_terms(body: dict, response: Response) -> dict: logger.info(f"Expanding: {body}") query_expander = ( MODELS.query_expander - if body.get("qe_model", "gc_core") != "jbook" or MODELS.query_expander_jbook==None + if body.get("qe_model", "gc_core") != "jbook" + or MODELS.query_expander_jbook == None else MODELS.query_expander_jbook ) try: @@ -172,13 +177,14 @@ async def post_expand_query_terms(body: dict, response: Response) -> dict: # Removes original word from the return terms unless it is combined with another word logger.info(f"original expanded terms: {expansion_list}") finalTerms = remove_original_kw(expansion_list, terms_string) - expansion_dict[terms_string] = ['"{}"'.format(exp) for exp in finalTerms] + expansion_dict[terms_string] = [ + '"{}"'.format(exp) for exp in finalTerms] logger.info(f"-- Expanded {terms_string} to \n {finalTerms}") # Perform word similarity logger.info(f"Finding similiar words for: {terms_string}") sim_words_dict = MODELS.word_sim.most_similiar_tokens(terms_string) logger.info(f"-- Expanded {terms_string} to \n {sim_words_dict}") - ## Construct return payload + # Construct return payload expanded_words = {} expanded_words["qexp"] = expansion_dict expanded_words["wordsim"] = sim_words_dict @@ -222,7 +228,7 @@ async def post_recommender(body: dict, response: Response) -> dict: logger.info(f"Recommending similar documents to {filenames}") results = MODELS.recommender.get_recs( filenames=filenames, sample=sample) - if results['results'] != []: + if results["results"] != []: logger.info(f"Found similar docs: \n {str(results)}") else: logger.info("Did not find any similar docs") From e2f4175d456493f41d42e2488187f1730d8e78d3 Mon Sep 17 00:00:00 2001 From: 604840 Date: Mon, 7 Mar 2022 14:33:00 -0500 Subject: [PATCH 12/12] updating to use corpus path --- gamechangerml/scripts/make_training_data.py | 414 +++++++++++++------- 1 file changed, 272 insertions(+), 142 deletions(-) diff --git a/gamechangerml/scripts/make_training_data.py 
b/gamechangerml/scripts/make_training_data.py index 71ea05ae..c1a9b5ee 100644 --- a/gamechangerml/scripts/make_training_data.py +++ b/gamechangerml/scripts/make_training_data.py @@ -7,7 +7,11 @@ import spacy -from gamechangerml.configs.config import TrainingConfig, ValidationConfig, SimilarityConfig +from gamechangerml.configs.config import ( + TrainingConfig, + ValidationConfig, + SimilarityConfig, +) from gamechangerml.src.search.sent_transformer.model import SentenceSearcher from gamechangerml.src.utilities.text_utils import normalize_query from gamechangerml.src.utilities.test_utils import * @@ -16,38 +20,48 @@ from gamechangerml.scripts.update_eval_data import make_tiered_eval_data from gamechangerml.src.text_handling.corpus import LocalCorpus from gensim.utils import simple_preprocess -from gamechangerml import DATA_PATH +from gamechangerml import DATA_PATH, CORPUS_PATH model_path_dict = get_model_paths() random.seed(42) LOCAL_TRANSFORMERS_DIR = model_path_dict["transformers"] SIM_MODEL = SimilarityConfig.BASE_MODEL -training_dir= os.path.join(DATA_PATH, "training", "sent_transformer") -tts_ratio=TrainingConfig.DATA_ARGS["train_test_split_ratio"] +training_dir = os.path.join(DATA_PATH, "training", "sent_transformer") +tts_ratio = TrainingConfig.DATA_ARGS["train_test_split_ratio"] gold_standard_path = os.path.join( - "gamechangerml/data/user_data", ValidationConfig.DATA_ARGS["retriever_gc"]["gold_standard"] - ) + "gamechangerml/data/user_data", + ValidationConfig.DATA_ARGS["retriever_gc"]["gold_standard"], +) + +CORPUS_DIR = CORPUS_PATH +corpus_docs = [ + i.split(".json")[0] + for i in os.listdir(CORPUS_DIR) + if os.path.isfile(os.path.join(CORPUS_DIR, i)) +] -CORPUS_DIR = "gamechangerml/corpus" -corpus_docs = [i.split('.json')[0] for i in os.listdir(CORPUS_DIR) if os.path.isfile(os.path.join(CORPUS_DIR, i))] def get_sample_paragraphs(pars, par_limit=100, min_length=150): - '''Collect sample paragraphs longer than min_length (char), up to par_limit paragraphs''' - + """Collect sample paragraphs longer than min_length (char), up to par_limit paragraphs""" + count = 0 collected_pars = [] for i in pars: if count < par_limit: - if len(i['par_raw_text_t']) >= min_length: + if len(i["par_raw_text_t"]) >= min_length: count += 1 - collected_pars.append({"text": i['par_raw_text_t'], "id": i['id']}) + collected_pars.append( + {"text": i["par_raw_text_t"], "id": i["id"]}) else: break - + return collected_pars -def get_best_paragraphs(query: str, doc_id: str, nlp, n_returns, min_score: float=0.60) -> List[Dict[str,str]]: + +def get_best_paragraphs( + query: str, doc_id: str, nlp, n_returns, min_score: float = 0.60 +) -> List[Dict[str, str]]: """Retrieves the best paragraphs for expected doc using similarity model Args: data [pd.DataFrame]: data df with processed text at paragraph_id level for sent_index @@ -57,17 +71,19 @@ def get_best_paragraphs(query: str, doc_id: str, nlp, n_returns, min_score: floa Returns: [List[Dict[str,str]]]: List of dictionaries of paragraph matches """ - logger.info(f"Retrieving matches for query: {query}, expected doc: {doc_id}") + logger.info( + f"Retrieving matches for query: {query}, expected doc: {doc_id}") pars = [] doc1 = nlp(query) if doc_id not in corpus_docs: logger.warning(f"---Did not find {doc_id} in the corpus") - json = open_json(doc_id + '.json', CORPUS_DIR) - paragraphs = json['paragraphs'] - sents = get_sample_paragraphs(paragraphs)[:n_returns] # get top n_returns + json = open_json(doc_id + ".json", CORPUS_DIR) + paragraphs = json["paragraphs"] + 
sents = get_sample_paragraphs(paragraphs)[:n_returns] # get top n_returns for sent in sents: - processed = ' '.join(simple_preprocess(sent['text'], min_len=2, max_len=100)) + processed = " ".join(simple_preprocess( + sent["text"], min_len=2, max_len=100)) pars.append({"id": sent["id"], "text": processed}) ranked = [] @@ -75,39 +91,57 @@ def get_best_paragraphs(query: str, doc_id: str, nlp, n_returns, min_score: floa if len(pars) == 0: logger.info("---No paragraphs retrieved for this expected doc") elif len(pars) == 1: - ranked = [{"score": 'na', "id": pars[0]['id'], "text": pars[0]['text']}] + ranked = [{"score": "na", "id": pars[0] + ["id"], "text": pars[0]["text"]}] else: comparisons = [] for par in pars: - doc2 = nlp(par['text']) + doc2 = nlp(par["text"]) sim = doc1.similarity(doc2) if sim >= min_score: - record = {"score": sim, "id": par['id'], "text": par['text']} + record = {"score": sim, + "id": par["id"], "text": par["text"]} comparisons.append(record) else: pass - ranked = sorted(comparisons, key = lambda z: z['score'], reverse=True) - logger.info(f"*** Collected {str(len(ranked))} / {str(len(pars))} paragraphs (passing sim threshold) retrieved for {doc_id}") + ranked = sorted( + comparisons, key=lambda z: z["score"], reverse=True) + logger.info( + f"*** Collected {str(len(ranked))} / {str(len(pars))} paragraphs (passing sim threshold) retrieved for {doc_id}" + ) except Exception as e: logger.info(f"---Could not re-rank the paragraphs for {query}") - logger.warning(e) + logger.warning(e) # if no paragraphs are returned, get the title - if len(ranked)==0: - clean_title = ' '.join(simple_preprocess(json['title'], min_len=2, max_len=100)) - ranked.append({"score": 1, "id": doc_id + ".pdf_0", "text": clean_title}) - + if len(ranked) == 0: + clean_title = " ".join(simple_preprocess( + json["title"], min_len=2, max_len=100)) + ranked.append({"score": 1, "id": doc_id + + ".pdf_0", "text": clean_title}) + return ranked + def check_no_match(id_1: str, id_2: str) -> bool: """Checks if paragraph ID matches the expected doc ID""" - if id_1.split('.pdf')[0].upper().strip().lstrip() == id_2.split('.pdf')[0].upper().strip().lstrip(): + if ( + id_1.split(".pdf")[0].upper().strip().lstrip() + == id_2.split(".pdf")[0].upper().strip().lstrip() + ): return False else: return True + def get_negative_paragraphs( - data: pd.DataFrame, query: str, doc_id: str, retriever, n_returns: int, any_matches: Dict[str,str]) -> List[Dict[str,str]]: + data: pd.DataFrame, + query: str, + doc_id: str, + retriever, + n_returns: int, + any_matches: Dict[str, str], +) -> List[Dict[str, str]]: """Looks up negative (not matching) paragraphs for each query Args: data [pd.DataFrame]: data df with processed text at paragraph_id level for sent_index @@ -122,28 +156,49 @@ def get_negative_paragraphs( checked_results = [] try: - single_matching_docs = [i for i in any_matches[query] if check_no_match(i, doc_id)] + single_matching_docs = [ + i for i in any_matches[query] if check_no_match(i, doc_id) + ] except: single_matching_docs = [] try: results = retriever.retrieve_topn(query, n_returns) - logger.info(f"Retrieved {str(len(results))} negative samples for query: {query} / doc: {doc_id}") + logger.info( + f"Retrieved {str(len(results))} negative samples for query: {query} / doc: {doc_id}" + ) for result in results: - par = data[data["paragraph_id"]==result['id']].iloc[0]["text"] - par = ' '.join(par.split(' ')[:400]) - if check_no_match(doc_id, result['id']): + par = data[data["paragraph_id"] == result["id"]].iloc[0]["text"] + 
par = " ".join(par.split(" ")[:400]) + if check_no_match(doc_id, result["id"]): for s in single_matching_docs: - if s and check_no_match(s, result['id']): - checked_results.append({"query": query, "doc": result['id'], "paragraph": par, "label": 0}) + if s and check_no_match(s, result["id"]): + checked_results.append( + { + "query": query, + "doc": result["id"], + "paragraph": par, + "label": 0, + } + ) else: - checked_results.append({"query": query, "doc": result['id'], "paragraph": par, "label": 0.5}) + checked_results.append( + { + "query": query, + "doc": result["id"], + "paragraph": par, + "label": 0.5, + } + ) except Exception as e: logger.warning("Could not get negative paragraphs") logger.warning(e, exc_info=True) - + return checked_results -def add_gold_standard(intel: Dict[str,str], gold_standard_path: Union[str, os.PathLike]) -> Dict[str,str]: + +def add_gold_standard( + intel: Dict[str, str], gold_standard_path: Union[str, os.PathLike] +) -> Dict[str, str]: """Adds original gold standard data to the intel training data. Args: intel [Dict[str,str]: intelligent search evaluation data @@ -151,83 +206,88 @@ def add_gold_standard(intel: Dict[str,str], gold_standard_path: Union[str, os.Pa Returns: intel [Dict[str,str]: intelligent search evaluation data with manual entries added """ - gold = pd.read_csv(gold_standard_path, names=['query', 'document']) - gold['query_clean'] = gold['query'].apply(lambda x: normalize_query(x)) - gold['docs_split'] = gold['document'].apply(lambda x: x.split(';')) - all_docs = list(set([a for b in gold['docs_split'].tolist() for a in b])) + gold = pd.read_csv(gold_standard_path, names=["query", "document"]) + gold["query_clean"] = gold["query"].apply(lambda x: normalize_query(x)) + gold["docs_split"] = gold["document"].apply(lambda x: x.split(";")) + all_docs = list(set([a for b in gold["docs_split"].tolist() for a in b])) - def add_key(mydict: Dict[str,str]) -> str: + def add_key(mydict: Dict[str, str]) -> str: """Adds new key to queries/collections dictionaries""" last_key = sorted([*mydict.keys()])[-1] key_len = len(last_key) - 1 last_prefix = last_key[0] last_num = int(last_key[1:]) new_num = str(last_num + 1) - - return last_prefix + str(str(0)*(key_len - len(new_num)) + new_num) + + return last_prefix + str(str(0) * (key_len - len(new_num)) + new_num) # check if queries already in dict, if not add - for i in gold['query_clean']: - if i in intel['queries'].values(): + for i in gold["query_clean"]: + if i in intel["queries"].values(): logger.info(f"'{i}' already in intel queries") continue else: logger.info(f"adding '{i}' to intel queries") - new_key = add_key(intel['queries']) - intel['queries'][new_key] = i - + new_key = add_key(intel["queries"]) + intel["queries"][new_key] = i + # check if docs already in dict, if not add for i in all_docs: - if i in intel['collection'].values(): + if i in intel["collection"].values(): logger.info(f"'{i}' already in intel collection") continue else: logger.info(f"adding '{i}' to intel collection") - new_key = add_key(intel['collection']) - intel['collection'][new_key] = i + new_key = add_key(intel["collection"]) + intel["collection"][new_key] = i # check if rels already in intel, if not add - reverse_q = {v:k for k,v in intel['queries'].items()} - reverse_d = {v:k for k,v in intel['collection'].items()} + reverse_q = {v: k for k, v in intel["queries"].items()} + reverse_d = {v: k for k, v in intel["collection"].items()} for i in gold.index: - q = gold.loc[i, 'query_clean'] - docs = gold.loc[i, 'docs_split'] + q = 
gold.loc[i, "query_clean"] + docs = gold.loc[i, "docs_split"] for j in docs: q_id = reverse_q[q] d_id = reverse_d[j] - if q_id in intel['correct']: # if query in rels, add new docs - if d_id in intel['correct'][q_id]: + if q_id in intel["correct"]: # if query in rels, add new docs + if d_id in intel["correct"][q_id]: continue else: - intel['correct'][q_id] += [d_id] + intel["correct"][q_id] += [d_id] else: - intel['correct'][q_id] = [d_id] - + intel["correct"][q_id] = [d_id] + return intel -def train_test_split(data: Dict[str,str], tts_ratio: float) -> Tuple[Dict[str, str]]: + +def train_test_split(data: Dict[str, str], tts_ratio: float) -> Tuple[Dict[str, str]]: """Splits a dictionary into train/test set based on split ratio""" - queries = list(set([data[i]['query'] for i in data])) + queries = list(set([data[i]["query"] for i in data])) # split the data into positive and negative examples grouped by query neg_passing = {} pos_passing = {} for q in queries: - subset = {i:data[i] for i in data.keys() if data[i]['query']==q} - pos_sample = [i for i in subset.keys() if subset[i]['label']==1] - neg_sample = [i for i in subset.keys() if subset[i]['label']==-1] - if len(neg_sample)>0: #since we have so few negative samples, add to neg list if it has a negative ex + subset = {i: data[i] for i in data.keys() if data[i]["query"] == q} + pos_sample = [i for i in subset.keys() if subset[i]["label"] == 1] + neg_sample = [i for i in subset.keys() if subset[i]["label"] == -1] + if ( + len(neg_sample) > 0 + ): # since we have so few negative samples, add to neg list if it has a negative ex neg_passing[q] = subset - elif len(pos_sample)>0: # only add the other samples if they have a positive matching sample + elif ( + len(pos_sample) > 0 + ): # only add the other samples if they have a positive matching sample pos_passing[q] = subset pos_train_size = round(len(pos_passing.keys()) * tts_ratio) neg_train_size = round(len(neg_passing.keys()) * tts_ratio) pos_train_keys = random.sample(pos_passing.keys(), pos_train_size) - neg_train_keys = random.sample(neg_passing.keys(), neg_train_size) - + neg_train_keys = random.sample(neg_passing.keys(), neg_train_size) + pos_test_keys = [i for i in pos_passing.keys() if i not in pos_train_keys] neg_test_keys = [i for i in neg_passing.keys() if i not in neg_train_keys] @@ -241,9 +301,9 @@ def train_test_split(data: Dict[str,str], tts_ratio: float) -> Tuple[Dict[str, s train_keys.extend(neg_passing[x]) for x in neg_test_keys: test_keys.extend(neg_passing[x]) - - train = {i:data[i] for i in train_keys} - test = {i:data[i] for i in test_keys} + + train = {i: data[i] for i in train_keys} + test = {i: data[i] for i in test_keys} metadata = { "date_created": str(date.today()), @@ -253,20 +313,21 @@ def train_test_split(data: Dict[str,str], tts_ratio: float) -> Tuple[Dict[str, s "total_test_samples_size": len(test), "train_queries": pos_train_keys + neg_train_keys, "test_queries": pos_test_keys + neg_test_keys, - "split_ratio": tts_ratio + "split_ratio": tts_ratio, } return train, test, metadata + def collect_matches( - data: pd.DataFrame, + data: pd.DataFrame, nlp, n_returns, relations: Dict[str, str], queries: Dict[str, str], collection: Dict[str, str], label: int, - ) -> Tuple[Dict[str, str]]: +) -> Tuple[Dict[str, str]]: """Gets matching paragraphs for each query/docid pair Args: data [pd.DataFrame]: data df with processed text at paragraph_id level for sent_index @@ -287,28 +348,37 @@ def collect_matches( query = queries[i] for k in relations[i]: doc = collection[k] - uid 
= str(i) + '_' + str(k) # backup UID, overwritten if there are results + # backup UID, overwritten if there are results + uid = str(i) + "_" + str(k) try: matching = get_best_paragraphs(query, doc, nlp, n_returns) for match in matching: - uid = str(i) + '_' + str(match['id']) - text = ' '.join(match['text'].split(' ')[:400]) # truncate to 400 tokens - found[uid] = {"query": query, "doc": doc, "paragraph": text, "label": label} + uid = str(i) + "_" + str(match["id"]) + text = " ".join( + match["text"].split(" ")[:400] + ) # truncate to 400 tokens + found[uid] = { + "query": query, + "doc": doc, + "paragraph": text, + "label": label, + } except Exception as e: logger.warning("Could not get positive matches") logger.warning(e) not_found[uid] = {"query": query, "doc": doc, "label": label} return found, not_found + def collect_negative_samples( - data: pd.DataFrame, - retriever, + data: pd.DataFrame, + retriever, n_returns: int, relations: Dict[str, str], queries: Dict[str, str], collection: Dict[str, str], any_matches: Dict[str, str], - ) -> Tuple[Dict[str, str]]: +) -> Tuple[Dict[str, str]]: """Gets negative samples each query/docid pair Args: data [pd.DataFrame]: data df with processed text at paragraph_id level for sent_index @@ -327,30 +397,49 @@ def collect_negative_samples( query = queries[i] for k in relations[i]: doc = collection[k] - uid = str(i) + '_' + str(k) + '_neg' # backup UID, overwritten if there are results + uid = ( + str(i) + "_" + str(k) + "_neg" + ) # backup UID, overwritten if there are results try: - not_matching = get_negative_paragraphs(data=data, query=query, doc_id=k, retriever=retriever, n_returns=n_returns, any_matches=any_matches) + not_matching = get_negative_paragraphs( + data=data, + query=query, + doc_id=k, + retriever=retriever, + n_returns=n_returns, + any_matches=any_matches, + ) for match in not_matching: - uid = str(i) + '_' + str(match['doc']) - text = ' '.join(match['paragraph'].split(' ')[:400]) # truncate to 400 tokens - found[uid] = {"query": query, "doc": doc, "paragraph": text, "label": 0} + uid = str(i) + "_" + str(match["doc"]) + text = " ".join( + match["paragraph"].split(" ")[:400] + ) # truncate to 400 tokens + found[uid] = { + "query": query, + "doc": doc, + "paragraph": text, + "label": 0, + } except Exception as e: logger.warning(e) not_found[uid] = {"query": query, "doc": doc, "label": 0} - + return found, not_found + def get_all_single_matches(): - validation_dir = get_most_recent_dir(os.path.join(DATA_PATH, "validation", "domain", "sent_transformer")) + validation_dir = get_most_recent_dir( + os.path.join(DATA_PATH, "validation", "domain", "sent_transformer") + ) directory = os.path.join(validation_dir, "any") any_matches = {} try: - f = open_json('intelligent_search_data.json', directory) + f = open_json("intelligent_search_data.json", directory) intel = json.loads(f) - for x in intel['correct'].keys(): - query = intel['queries'][x] - doc_keys = intel['correct'][x] - docs = [intel['collection'][k] for k in doc_keys] + for x in intel["correct"].keys(): + query = intel["queries"][x] + doc_keys = intel["correct"][x] + docs = [intel["collection"][k] for k in doc_keys] any_matches[query] = docs except Exception as e: logger.warning("Could not load all validation data") @@ -358,17 +447,19 @@ def get_all_single_matches(): return any_matches + def make_training_data( index_path: Union[str, os.PathLike], n_returns: int, - level: str, - update_eval_data: bool, + level: str, + update_eval_data: bool, retriever=None, - sim_model_name: 
str=SIM_MODEL, - transformers_dir: Union[str,os.PathLike]=LOCAL_TRANSFORMERS_DIR, - gold_standard_path: Union[str,os.PathLike]=gold_standard_path, - tts_ratio: float=tts_ratio, - training_dir: Union[str,os.PathLike]=training_dir) -> Tuple[Dict[str,str]]: + sim_model_name: str = SIM_MODEL, + transformers_dir: Union[str, os.PathLike] = LOCAL_TRANSFORMERS_DIR, + gold_standard_path: Union[str, os.PathLike] = gold_standard_path, + tts_ratio: float = tts_ratio, + training_dir: Union[str, os.PathLike] = training_dir, +) -> Tuple[Dict[str, str]]: """Makes training data based on new user search history data Args: index_path [str|os.PathLike]: path to the sent index for retrieving the training data (should be most recent index) @@ -382,28 +473,36 @@ def make_training_data( training_dir [Union[str,os.PathLike]]: directory for saving training data Returns: [Tuple[Dict[str,str]]]: training data and training metadata dictionaries - """ - ## open json files - if not os.path.exists(os.path.join(DATA_PATH, "validation", "domain", "sent_transformer")) or update_eval_data: + """ + # open json files + if ( + not os.path.exists( + os.path.join(DATA_PATH, "validation", "domain", "sent_transformer") + ) + or update_eval_data + ): logger.info("**** Updating the evaluation data") make_tiered_eval_data(index_path) - validation_dir = get_most_recent_dir(os.path.join(DATA_PATH, "validation", "domain", "sent_transformer")) + validation_dir = get_most_recent_dir( + os.path.join(DATA_PATH, "validation", "domain", "sent_transformer") + ) directory = os.path.join(validation_dir, level) - logger.info(f"**** Loading in intelligent search data from {str(directory)}") + logger.info( + f"**** Loading in intelligent search data from {str(directory)}") try: - f = open_json('intelligent_search_data.json', directory) + f = open_json("intelligent_search_data.json", directory) intel = json.loads(f) except Exception as e: logger.warning("Could not load intelligent search data") logger.warning(e) intel = {} - ## add gold standard samples + # add gold standard samples logger.info("**** Adding gold standard examples") intel = add_gold_standard(intel, gold_standard_path) - ## set up save dir + # set up save dir save_dir = make_timestamp_directory(training_dir) try: @@ -414,69 +513,96 @@ def make_training_data( if not retriever: logger.info("Did not init SentenceSearcher, loading now") retriever = SentenceSearcher( - sim_model_name=sim_model_name, - index_path=index_path, - transformer_path=transformers_dir - ) - ## read in sent_index data + sim_model_name=sim_model_name, + index_path=index_path, + transformer_path=transformers_dir, + ) + # read in sent_index data logger.info("**** Loading in sent index data from retriever") try: data = retriever.data - data['doc_id'] = data['paragraph_id'].apply(lambda x: x.split('.pdf')[0]) + data["doc_id"] = data["paragraph_id"].apply( + lambda x: x.split(".pdf")[0]) except Exception as e: logger.info("Could not load in data from retriever") logger.warning(e) any_matches = get_all_single_matches() - ## get matching paragraphs + # get matching paragraphs try: correct_found, correct_notfound = collect_matches( - data=data, queries=intel['queries'], collection=intel['collection'], - relations=intel['correct'], label=1, nlp = nlp, n_returns=n_returns) - logger.info(f"---Number of correct query/result pairs that were not found: {str(len(correct_notfound))}") + data=data, + queries=intel["queries"], + collection=intel["collection"], + relations=intel["correct"], + label=1, + nlp=nlp, + n_returns=n_returns, + 
) + logger.info( + f"---Number of correct query/result pairs that were not found: {str(len(correct_notfound))}" + ) except Exception as e: logger.warning(e) logger.warning("\nCould not retrieve positive matches\n") try: incorrect_found, incorrect_notfound = collect_matches( - data=data, queries=intel['queries'], collection=intel['collection'], - relations=intel['incorrect'], label=-1, nlp = nlp, n_returns=n_returns) - logger.info(f"---Number of incorrect query/result pairs that were not found: {str(len(incorrect_notfound))}") + data=data, + queries=intel["queries"], + collection=intel["collection"], + relations=intel["incorrect"], + label=-1, + nlp=nlp, + n_returns=n_returns, + ) + logger.info( + f"---Number of incorrect query/result pairs that were not found: {str(len(incorrect_notfound))}" + ) except Exception as e: logger.warning(e) logger.warning("\nCould not retrieve negative matches\n") - ## get negative samples + # get negative samples try: - all_relations = {**intel['correct'], **intel['incorrect']} + all_relations = {**intel["correct"], **intel["incorrect"]} neutral_found, neutral_notfound = collect_negative_samples( - data=data, retriever=retriever, n_returns=n_returns, queries=intel['queries'], collection=intel['collection'], - relations=all_relations, any_matches=any_matches) - logger.info(f"---Number of negative sample pairs that were not found: {str(len(neutral_notfound))}") + data=data, + retriever=retriever, + n_returns=n_returns, + queries=intel["queries"], + collection=intel["collection"], + relations=all_relations, + any_matches=any_matches, + ) + logger.info( + f"---Number of negative sample pairs that were not found: {str(len(neutral_notfound))}" + ) except Exception as e: logger.warning(e) logger.warning("\nCould not retrieve negative samples\n") - ## save a json of the query-doc pairs that did not retrieve an ES paragraph for training data + # save a json of the query-doc pairs that did not retrieve an ES paragraph for training data notfound = {**correct_notfound, **incorrect_notfound, **neutral_notfound} - logger.info(f"---Number of total query/result pairs that were not found: {str(len(notfound))}") - notfound_path = os.path.join(save_dir, 'not_found_search_pairs.json') + logger.info( + f"---Number of total query/result pairs that were not found: {str(len(notfound))}" + ) + notfound_path = os.path.join(save_dir, "not_found_search_pairs.json") with open(notfound_path, "w") as outfile: json.dump(notfound, outfile) all_examples = {**neutral_found, **incorrect_found, **correct_found} logger.info(f"Total size of query-doc pairs: {str(len(all_examples))}") - ## train/test split + # train/test split train, test, metadata = train_test_split(all_examples, tts_ratio) data = {"train": train, "test": test} logger.info(f"**** Generated training data: \n {metadata}") - ## save data and metadata files - data_path = os.path.join(save_dir, 'training_data.json') - metadata_path = os.path.join(save_dir, 'training_metadata.json') + # save data and metadata files + data_path = os.path.join(save_dir, "training_data.json") + metadata_path = os.path.join(save_dir, "training_metadata.json") with open(data_path, "w") as outfile: json.dump(data, outfile) @@ -484,8 +610,12 @@ def make_training_data( with open(metadata_path, "w") as outfile: json.dump(metadata, outfile) -if __name__ == '__main__': + +if __name__ == "__main__": make_training_data( - index_path="gamechangerml/models/sent_index_20220103", n_returns=50, level="silver", - update_eval_data=True) \ No newline at end of file + 
index_path="gamechangerml/models/sent_index_20220103", + n_returns=50, + level="silver", + update_eval_data=True, + )
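
A minimal usage sketch of the reformatted entry point above, assuming the module-level defaults visible in its signature (SIM_MODEL, LOCAL_TRANSFORMERS_DIR, gold_standard_path, tts_ratio, training_dir); the index path is simply the example value from the __main__ block, and the retriever argument is shown only to note that a pre-loaded SentenceSearcher can be supplied to skip re-initialization.

from gamechangerml.scripts.make_training_data import make_training_data

# Mines positive (label 1), incorrect (label -1), and neutral (label 0) query/paragraph
# pairs from the tiered validation data plus the gold standard, splits them by tts_ratio,
# and writes training_data.json / training_metadata.json into a timestamped directory
# under the default training_dir.
make_training_data(
    index_path="gamechangerml/models/sent_index_20220103",  # most recent sent index (example path)
    n_returns=50,           # paragraphs retrieved per query when collecting matches and negatives
    level="silver",         # validation tier whose intelligent_search_data.json is loaded
    update_eval_data=True,  # rebuild the tiered eval data first via make_tiered_eval_data
    retriever=None,         # or pass an existing SentenceSearcher to avoid loading it again
)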