From 5f9964dde01613732b4799b0d2e13a0f1bf027d3 Mon Sep 17 00:00:00 2001
From: Kate Dowdy <dowdy_katherine@bah.com>
Date: Fri, 15 Apr 2022 23:50:19 -0600
Subject: [PATCH 1/5] Add paragraph quality checks to text_utils; use them in corpus.py

---
 gamechangerml/src/text_handling/corpus.py |  4 +-
 gamechangerml/src/utilities/text_utils.py | 55 +++++++++++++++++++++++
 2 files changed, 58 insertions(+), 1 deletion(-)

diff --git a/gamechangerml/src/text_handling/corpus.py b/gamechangerml/src/text_handling/corpus.py
index 37745358..3ceee522 100644
--- a/gamechangerml/src/text_handling/corpus.py
+++ b/gamechangerml/src/text_handling/corpus.py
@@ -5,6 +5,7 @@
 # import pandas as pd
 from gensim.models.doc2vec import TaggedDocument
 from gamechangerml.src.text_handling.process import preprocess, get_tokenizer
+from gamechangerml.src.utilities.text_utils import check_quality_paragraph
 from gamechangerml.api.utils import processmanager
 from tqdm import tqdm
 
@@ -65,7 +66,8 @@ def __iter__(self):
                         process_tokens = preprocess(para_text, min_len=1)
                         # half of the tokens are actual words
                         if tokens:
-                            if (len(process_tokens) / len(tokens)) > 0.5:
+                            if check_quality_paragraph(process_tokens, tokens, para_text):
+                            #if (len(process_tokens) / len(tokens)) > 0.5:
                                 if len(tokens) > self.min_token_len:
                                     if self.return_id:
                                         yield tokens, para_id
diff --git a/gamechangerml/src/utilities/text_utils.py b/gamechangerml/src/utilities/text_utils.py
index 260014d5..ccbe61bc 100644
--- a/gamechangerml/src/utilities/text_utils.py
+++ b/gamechangerml/src/utilities/text_utils.py
@@ -208,6 +208,61 @@ def get_tokens(s: str) -> List[str]:
     if not s: return []
     return s.split()
 
+def has_many_short_tokens(processed_tokens, threshold = 4.0):
+    '''Checks if the median length of tokens is less than the expected threshold'''
+    median_len = np.median([len(i) for i in processed_tokens])
+    if median_len <= threshold:
+        return True
+    else:
+        return False
+
+def has_many_repeating(text, processed_tokens, threshold = 0.6):
+    '''Checks if the ratio of unique tokens is less than an expected threshold'''
+    ratio_unique = len(set(processed_tokens)) / len(text.split(' '))
+    if ratio_unique < threshold:
+        return True
+    else:
+        return False
+
+def has_extralong_tokens(text, threshold = 25):
+    '''Checks if the paragraph has a token that exceeds the threshold for normal token length'''
+    longest_token = np.max([len(i) for i in text.split(' ')])
+    if longest_token > threshold:
+        return True
+    else:
+        return False
+
+def is_a_toc(text):
+    '''Checks if a paragraph appears to be a table of contents'''
+    toc_separation = re.findall(r'(\.{2,})', text)
+    if len(toc_separation) > 0:
+        return True
+    else:
+        return False
+
+def majority_tokens_filtered(processed_tokens, tokens):
+    '''Checks if most of the tokens were filtered out'''
+    if (len(processed_tokens) / len(tokens)) <= 0.5:
+        return True
+    else:
+        return False
+
+def check_quality_paragraph(processed_tokens, tokens, text):
+    '''Runs functions to check that a paragraph isn't a junk paragraph'''
+
+    if majority_tokens_filtered(processed_tokens, tokens):
+        return False
+    if has_many_short_tokens(processed_tokens, threshold = 4.0):
+        return False
+    elif has_many_repeating(text, processed_tokens, threshold = 0.6):
+        return False
+    elif has_extralong_tokens(text, threshold = 25):
+        return False
+    elif is_a_toc(text):
+        return False
+    else:
+        return True
+
 # Adapted from https://www.datacamp.com/community/tutorials/fuzzy-string-python
 def levenshtein_ratio_and_distance(s: str, t: str, ratio_calc: bool=False) -> Tuple[int,float]:
     """ levenshtein_ratio_and_distance:

From 83c4ede1e56be98d693572348972509e87949f20 Mon Sep 17 00:00:00 2001
From: Kate Dowdy <dowdy_katherine@bah.com>
Date: Mon, 18 Apr 2022 11:06:38 -0600
Subject: [PATCH 2/5] Add paragraph quality tests; simplify quality check in corpus.py

---
 gamechangerml/api/tests/api_tests.py      | 54 +++++++++++++++++++++++
 gamechangerml/api/tests/test_examples.py  | 15 +++++++
 gamechangerml/src/text_handling/corpus.py | 24 ++++------
 gamechangerml/src/utilities/text_utils.py | 18 ++++----
 4 files changed, 87 insertions(+), 24 deletions(-)

diff --git a/gamechangerml/api/tests/api_tests.py b/gamechangerml/api/tests/api_tests.py
index 49222b5f..60293bf6 100644
--- a/gamechangerml/api/tests/api_tests.py
+++ b/gamechangerml/api/tests/api_tests.py
@@ -11,6 +11,10 @@
 from http.client import HTTPConnection  # py3
 
 from gamechangerml.src.search.query_expansion.utils import remove_original_kw
+from gamechangerml.src.text_handling.process import preprocess
+from gamechamgerml.src.utilities.text_utils import (
+    has_many_short_tokens, has_many_repeating, has_extralong_tokens, is_a_toc, majority_tokens_filtered, check_quality_paragraph
+)
 #from gamechangerml import DATA_PATH
 
 from .test_examples import TestSet
@@ -99,6 +103,56 @@ def getCurrentTrans():
     resp = http.get(API_URL + "/getCurrentTransformer")
     return resp.json()
 
+## Sent Index Processing Tests
+
+def test_has_many_short_tokens():
+    test_pars = TestSet.sent_index_processing_pars
+    results = []
+    for x in test_pars.keys():
+        text = test_pars[x]
+        tokens = preprocess(text)
+        check = has_many_short_tokens(tokens, threshold = 4.0)
+        results.append(check)
+    assert results == TestSet.sent_index_processing_results['has_many_short_tokens']
+ 
+def test_has_many_repeating():
+    test_pars = TestSet.sent_index_processing_pars
+    results = []
+    for x in test_pars.keys():
+        text = test_pars[x]
+        tokens = preprocess(text)
+        check = has_many_repeating(text, tokens, threshold = 0.6)
+        results.append(check)
+    assert results == TestSet.sent_index_processing_results['has_many_repeating']
+    
+def test_has_extralong_tokens():
+    test_pars = TestSet.sent_index_processing_pars
+    results = []
+    for x in test_pars.keys():
+        text = test_pars[x]
+        check = has_extralong_tokens(text, threshold = 25)
+        results.append(check)
+    assert results == TestSet.sent_index_processing_results['has_extralong_tokens']
+
+def test_is_a_toc():
+    test_pars = TestSet.sent_index_processing_pars
+    results = []
+    for x in test_pars.keys():
+        text = test_pars[x]
+        check = is_a_toc(text)
+        results.append(check)
+    assert results == TestSet.sent_index_processing_results['is_a_toc']
+
+def test_check_quality_paragraph():
+    test_pars = TestSet.sent_index_processing_pars
+    results = []
+    for x in test_pars.keys():
+        text = test_pars[x]
+        tokens = preprocess(text)
+        check = check_quality_paragraph(tokens, text)
+        results.append(check)
+    assert results == TestSet.sent_index_processing_results['check_quality']
+
 
 # def test_changeModels():
 
diff --git a/gamechangerml/api/tests/test_examples.py b/gamechangerml/api/tests/test_examples.py
index 48ec58c8..14dd9687 100644
--- a/gamechangerml/api/tests/test_examples.py
+++ b/gamechangerml/api/tests/test_examples.py
@@ -160,3 +160,18 @@ class TestSet:
         # 'msmarco-distilbert-base-v2_2021-10-17',
         # 'msmarco-distilbert-base-v2_20211210',
     }
+
+    sent_index_processing_pars = {
+        "good": "6. U.S. Army Corps of Engineers (USACE). USACE is involved with waterways dredging, flood prevention, permitting obstructions within U.S. waters, and the construction, maintenance, and operation of waterway projects, such as locks, dams, and reservoirs, etc. USACE also enforces the Refuse Act (33 U.S.C. 407). The Coast Guard has been designated to assist in the enforcement of certain specific provisions of law and regulations administered by USACE. Comman...",
+        "bad_acronyms": "EPA/625/11-91/002, 1992 (ar) 40 CFR 268 (as) 40 CFR 240 (at) 42 U.S.C. 7401 (au) 40 CFR 61 (av) 40 CFR 230 (aw) 33 CFR 320 (ax) 33 CFR 321 (ay) 33 CFR 322 (az) 33 CFR 323 (ba) 33 CFR 325 (bb) 33 CFR 330 (bc) 40 CFR 233 (bd) 16 U.S.C. §§1451-1464 (be) 42 U.S.C. 4321 (bf) 40 CFR 220 (bg) 40 CFR 221 (bh) 40 CFR 222 (bi) 40 CFR 227 (bj) 40 CFR 224 (bk) 40 CFR 228 (bl) 40 CFR 223 (bm) 40 CFR 225 (bn) 40 CFR 226 (bo) 40 CFR 229 (bp) 33 U.S.C. 1401 (bq) 40 CFR 255 (br) 33 CFR 324 (bs) 15 CFR 930",
+        "bad_pages": "Page D4-1 – D4-6 Page D4-1 – D4-6 Page D4-9 – D4-10 Page D4-9 – D4-10 Page D4-13 – D4-16 Page D4-13 – D4-16 Page D4-19 – D4-20 Page D4-18a – D4-20 Page E3-1 – E3-25 Page E3-1 – E3-36",
+        "bad_long_tokens": "OPLANOPORDPAPACEPCCPCIPDSPHAPMCSPMIPOIPSGPZRTDSBSPOSQDLDRSSASTANAGTACEVACTACSOPTAPTASKORGTB MEDTCTCCCTEWLSTFCTLTLPTMTOETTPUAoperation planoperation orderphysician assistantprimary, alternate, contingency, and emergencypre-combat checkpre-combat inspectionpatient decontamination siteperiodic health assessmentpreventive maintainence checks and servicepatient movement itempoint of injuryplatoon sergeantpickup zonereturn to dutysupply bulletinsupport ope",
+        "bad_toc": "DoDM 4140.68 March 5 2020 TABLE OF CONTENTS 2 TABLE OF CONTENTS SECTION 1 GENERAL ISSUANCE INFORMATION .............................................................................. 4 1.1. Applicability. .................................................................................................................... 4 SECTION 2 RESPONSIBILITIES ......................................................................................................... 5 2.1. Assistant Secretary of Defense for Sustainment. .............................................................. 5 2.2. DLA. ................................................................................................................................. 5 2.3. DoD Component Heads. ................................................................................................... 5 2.4. Secretaries of the Military Departments. .......................................................................... 6 2.5. Commander United States Special Operations Command USSOCOM. ...................... 7 2.6. Administrators of Participating U.S. Government Civil Agencies. .................................. 7 SECTION 3 GENERAL PROCEDURES ................................................................................................ 8 3.1. NIMSCs. ........................................................................................................................... 8 3.2. PICA. .............................................................................................................................. 17 3.3. SICA. .............................................................................................................................. 18 3.4. Exceptions for SOP Items.............................................................................................. 19 3.5. NIMSC Designation........................................................................................................ 
20 SECTION 4 SUPPLY AND DEPOT MAINTENANCE OPERATIONS PROCEDURES ................................ 24 4.1. Procedures for NIMSC 1 2 3 4 5 6 7 8 or 0 items. ................................................. 24 4.2. Provisioning. ................................................................................................................... 24 4.3. PICA Assignment. .......................................................................................................... 25 4.4. IMC Changes and PICA or SICA Reassignment Requests. ........................................... 26 4.5. Item Adoption. ................................................................................................................ 28 4.6. Procurement. ................................................................................................................... 28 4.7. Cataloging. ...................................................................................................................... 29 4.8. Depot Maintenance. ........................................................................................................ 31 4.9. Disposition. ..................................................................................................................... 31 4.10. Inactive Items. ............................................................................................................... 32 4.11. Standardization. ............................................................................................................ 33 SECTION 5 ITEM REVIEW PROCEDURES FOR MIGRATION TO NIMSC 5 OR NIMSC 6 .................. 34 5.1. Review Items for Migration to NIMSC 5 or NIMSC 6. ................................................. 34 5.2. Single Submitter of Procurement Specifications and Depotlevel Repair Specifications. ................................................................................................................... 
35 SECTION 6 NIMSC MIGRATION PROCEDURES ............................................................................. 37 6.1. Migration to NIMSC 5 or NIMSC 6. .............................................................................. 37 6.2. NIMSC Migration or PICA Reassignment. .................................................................... 37 6.3. PreETD TimePeriod. .................................................................................................... 40 6.4. ETD TimePeriod............................................................................................................ 42 6.5. PostETD Timeperiod.................................................................................................... 42 SECTION 7 SUPPLY OPERATIONS PROCEDURES FOR NIMSC 5 AND NIMSC 6 ITEMS ................. 44 7.1. Item Stockage.................................................................................................................. 44 7.2. Requirements Computation and Methodology. .............................................................. 44 7.3. Item Distribution. ............................................................................................................ 44 a. Item Transfer Actions. ................................................................................................. 44 b. Requisition Processing. ................................................................................................ 45 "
+        }
+    sent_index_processing_results = {
+        "has_many_short_tokens": [False, True, True, False, False],
+        "has_many_repeating": [False, True, True, False, False],
+        "has_extralong_tokens": [False, False, False, True, False],
+        "is_a_toc": [False, False, False, False, False],
+        "check_quality": [True, False, False, False, False]
+    }
\ No newline at end of file
diff --git a/gamechangerml/src/text_handling/corpus.py b/gamechangerml/src/text_handling/corpus.py
index 3ceee522..5cd6077d 100644
--- a/gamechangerml/src/text_handling/corpus.py
+++ b/gamechangerml/src/text_handling/corpus.py
@@ -63,23 +63,16 @@ def __iter__(self):
                 for para_text, para_id in zip(paragraphs, paragraph_ids):
                     if self.bert_based_tokenizer:
                         tokens = self.auto_token.tokenize(para_text)
-                        process_tokens = preprocess(para_text, min_len=1)
-                        # half of the tokens are actual words
-                        if tokens:
-                            if check_quality_paragraph(process_tokens, tokens, para_text):
-                            #if (len(process_tokens) / len(tokens)) > 0.5:
-                                if len(tokens) > self.min_token_len:
-                                    if self.return_id:
-                                        yield tokens, para_id
-                                    else:
-                                        yield tokens
                     else:
                         tokens = preprocess(para_text, min_len=1)
-                        if len(tokens) > self.min_token_len:
-                            if self.return_id:
-                                yield tokens, para_id
-                            else:
-                                yield tokens
+                    if tokens:
+                        if check_quality_paragraph(tokens, para_text):
+                            if len(tokens) > self.min_token_len:
+                                if self.return_id:
+                                    yield tokens, para_id
+                                else:
+                                    yield tokens
+                    
                 progress += 1
                 processmanager.update_status(
                     processmanager.loading_corpus,
@@ -90,6 +83,7 @@ def __iter__(self):
             except Exception as e:
                 print(e)
                 print(f"Error with {file_name} in creating local corpus")
+        print
 
     def _get_doc(self, file_name):
         with open(file_name, "r") as f:
diff --git a/gamechangerml/src/utilities/text_utils.py b/gamechangerml/src/utilities/text_utils.py
index ccbe61bc..9528dc6a 100644
--- a/gamechangerml/src/utilities/text_utils.py
+++ b/gamechangerml/src/utilities/text_utils.py
@@ -216,9 +216,9 @@ def has_many_short_tokens(processed_tokens, threshold = 4.0):
     else:
         return False
 
-def has_many_repeating(text, processed_tokens, threshold = 0.6):
+def has_many_repeating(text, tokens, threshold = 0.6):
     '''Checks if the ratio of unique tokens is less than an expected threshold'''
-    ratio_unique = len(set(processed_tokens)) / len(text.split(' '))
+    ratio_unique = len(set(tokens)) / len(text.split(' '))
     if ratio_unique < threshold:
         return True
     else:
@@ -234,27 +234,27 @@ def has_extralong_tokens(text, threshold = 25):
 
 def is_a_toc(text):
     '''Checks if a paragraph appears to be a table of contents'''
-    toc_separation = re.findall(r'(\.{2,})', text)
+    toc_separation = re.findall(r'(\.{3,})', text)
     if len(toc_separation) > 0:
         return True
     else:
         return False
 
-def majority_tokens_filtered(processed_tokens, tokens):
+def majority_tokens_filtered(tokens, text):
     '''Checks if most of the tokens were filtered out'''
-    if (len(processed_tokens) / len(tokens)) <= 0.5:
+    if (len(tokens) / len(text.split(' '))) <= 0.5:
         return True
     else:
         return False
 
-def check_quality_paragraph(processed_tokens, tokens, text):
+def check_quality_paragraph(tokens, text):
     '''Runs functions to check that a paragraph isn't a junk paragraph'''
 
-    if majority_tokens_filtered(processed_tokens, tokens):
+    if majority_tokens_filtered(tokens, text):
         return False
-    if has_many_short_tokens(processed_tokens, threshold = 4.0):
+    if has_many_short_tokens(tokens, threshold = 4.0):
         return False
-    elif has_many_repeating(text, processed_tokens, threshold = 0.6):
+    elif has_many_repeating(text, tokens, threshold = 0.6):
         return False
     elif has_extralong_tokens(text, threshold = 25):
         return False

From 001f49967da0b5f4b5048a1f34a9186799401470 Mon Sep 17 00:00:00 2001
From: Kate Dowdy <dowdy_katherine@bah.com>
Date: Mon, 18 Apr 2022 11:16:12 -0600
Subject: [PATCH 3/5] Apply code formatting to tests, corpus.py, and text_utils.py

---
 gamechangerml/api/tests/api_tests.py      | 218 ++++++++++++----------
 gamechangerml/api/tests/test_examples.py  |  91 +++++----
 gamechangerml/src/text_handling/corpus.py |  20 +-
 gamechangerml/src/utilities/text_utils.py | 173 ++++++++++-------
 4 files changed, 292 insertions(+), 210 deletions(-)

diff --git a/gamechangerml/api/tests/api_tests.py b/gamechangerml/api/tests/api_tests.py
index 60293bf6..8c48f74e 100644
--- a/gamechangerml/api/tests/api_tests.py
+++ b/gamechangerml/api/tests/api_tests.py
@@ -13,9 +13,15 @@
 from gamechangerml.src.search.query_expansion.utils import remove_original_kw
 from gamechangerml.src.text_handling.process import preprocess
 from gamechamgerml.src.utilities.text_utils import (
-    has_many_short_tokens, has_many_repeating, has_extralong_tokens, is_a_toc, majority_tokens_filtered, check_quality_paragraph
+    has_many_short_tokens,
+    has_many_repeating,
+    has_extralong_tokens,
+    is_a_toc,
+    majority_tokens_filtered,
+    check_quality_paragraph,
 )
-#from gamechangerml import DATA_PATH
+
+# from gamechangerml import DATA_PATH
 
 from .test_examples import TestSet
 
@@ -41,18 +47,11 @@ def test_expandTerms():
     verified = {
         "qexp": {
             "artificial intelligence": [
-                "\"employ artificial intelligence\"",
-                "\"developing artificial intelligence\""
+                '"employ artificial intelligence"',
+                '"developing artificial intelligence"',
             ]
         },
-        "wordsim": {
-            "artificial": [
-                "artifical"
-            ],
-            "intelligence": [
-                "intellegence"
-            ]
-        }
+        "wordsim": {"artificial": ["artifical"], "intelligence": ["intellegence"]},
     }
     assert resp.json() == verified
 
@@ -103,36 +102,41 @@ def getCurrentTrans():
     resp = http.get(API_URL + "/getCurrentTransformer")
     return resp.json()
 
+
 ## Sent Index Processing Tests
 
+
 def test_has_many_short_tokens():
     test_pars = TestSet.sent_index_processing_pars
     results = []
     for x in test_pars.keys():
         text = test_pars[x]
         tokens = preprocess(text)
-        check = has_many_short_tokens(tokens, threshold = 4.0)
+        check = has_many_short_tokens(tokens, threshold=4.0)
         results.append(check)
-    assert results == TestSet.sent_index_processing_results['has_many_short_tokens']
- 
+    assert results == TestSet.sent_index_processing_results["has_many_short_tokens"]
+
+
 def test_has_many_repeating():
     test_pars = TestSet.sent_index_processing_pars
     results = []
     for x in test_pars.keys():
         text = test_pars[x]
         tokens = preprocess(text)
-        check = has_many_repeating(text, tokens, threshold = 0.6)
+        check = has_many_repeating(text, tokens, threshold=0.6)
         results.append(check)
-    assert results == TestSet.sent_index_processing_results['has_many_repeating']
-    
+    assert results == TestSet.sent_index_processing_results["has_many_repeating"]
+
+
 def test_has_extralong_tokens():
     test_pars = TestSet.sent_index_processing_pars
     results = []
     for x in test_pars.keys():
         text = test_pars[x]
-        check = has_extralong_tokens(text, threshold = 25)
+        check = has_extralong_tokens(text, threshold=25)
         results.append(check)
-    assert results == TestSet.sent_index_processing_results['has_extralong_tokens']
+    assert results == TestSet.sent_index_processing_results["has_extralong_tokens"]
+
 
 def test_is_a_toc():
     test_pars = TestSet.sent_index_processing_pars
@@ -141,7 +145,8 @@ def test_is_a_toc():
         text = test_pars[x]
         check = is_a_toc(text)
         results.append(check)
-    assert results == TestSet.sent_index_processing_results['is_a_toc']
+    assert results == TestSet.sent_index_processing_results["is_a_toc"]
+
 
 def test_check_quality_paragraph():
     test_pars = TestSet.sent_index_processing_pars
@@ -151,7 +156,7 @@ def test_check_quality_paragraph():
         tokens = preprocess(text)
         check = check_quality_paragraph(tokens, text)
         results.append(check)
-    assert results == TestSet.sent_index_processing_results['check_quality']
+    assert results == TestSet.sent_index_processing_results["check_quality"]
 
 
 # def test_changeModels():
@@ -165,6 +170,7 @@ def test_check_quality_paragraph():
 
 # Search Tests
 
+
 def test_postSentSearch():
     test_data = TestSet.sentence_test_data
     verified = TestSet.sentence_search_expect
@@ -184,8 +190,9 @@ def test_recommender():
     resp = http.post(API_URL + "/recommender", json=test_data)
     data = resp.json()
     print(data)
-    assert len(data['results']) == 5
-    assert len(set(expected['results']).intersection(data['results'])) > 0
+    assert len(data["results"]) == 5
+    assert len(set(expected["results"]).intersection(data["results"])) > 0
+
 
 # QA Tests
 
@@ -193,14 +200,10 @@ def test_recommender():
 def send_qa(query, context):
 
     start = time.perf_counter()
-    post = {
-        "query": query,
-        "search_context": context
-    }
+    post = {"query": query, "search_context": context}
     data = json.dumps(post).encode("utf-8")
-    headers = {'Content-Type': 'application/json'}
-    response = http.post(API_URL + "/questionAnswer",
-                         data=data, headers=headers)
+    headers = {"Content-Type": "application/json"}
+    response = http.post(API_URL + "/questionAnswer", data=data, headers=headers)
 
     end = time.perf_counter()
     took = float(f"{end-start:0.4f}")
@@ -209,137 +212,156 @@ def send_qa(query, context):
 
 
 qa_test_context_1 = [
-    "Virginia'\''s Democratic-controlled Legislature passed a bill legalizing the possession of small amounts of marijuana on Wednesday, making it the 16th state to take the step. Under Virginia'\''s new law, adults ages 21 and over can possess an ounce or less of marijuana beginning on July 1, rather than Jan. 1, 2024. Gov. Ralph Northam, a Democrat, proposed moving up the date, arguing it would be a mistake to continue to penalize people for possessing a drug that would soon be legal. Lt. Gov. Justin Fairfax, also a Democrat, broke a 20-20 vote tie in Virginia'\''s Senate to pass the bill. No Republicans supported the measure. Democratic House of Delegates Speaker Eileen Filler-Corn hailed the plan. Today, with the Governor'\''s amendments, we will have made tremendous progress in ending the targeting of Black and brown Virginians through selective enforcement of marijuana prohibition by this summer she said in a statement. Republicans voiced a number of objections to what they characterized as an unwieldy, nearly 300-page bill. Several criticized measures that would grant licensing preferences to people and groups who'\''ve been affected by the war on drugs and make it easier for workers in the industry to unionize. Senate Minority Leader Tommy Norment also questioned Northam'\''s motives.",
-    "We have a governor who wants to contribute to the resurrection of his legacy, Norment said, referring to the 2019 discovery of a racist photo in Northam'\''s 1984 medical school yearbook. The accelerated timeline sets Virginia cannabis consumers in an unusual predicament. While it will be legal to grow up to four marijuana plants beginning July 1, it could be several years before the state begins licensing recreational marijuana retailers. And unlike other states, the law won'\''t allow the commonwealth'\''s existing medical dispensaries to begin selling to all adults immediately. Jenn Michelle Pedini, executive director of Virginia NORML, called legalization an incredible victory but said the group would continue to push to allow retail sales to begin sooner.",
-    "In the interest of public and consumer safety, Virginians 21 and older should be able to purchase retail cannabis products at the already operational dispensaries in 2021, not in 2024, Pedini said in a statement. Such a delay will only exacerbate the divide for equity applicants and embolden illicit activity. Northam and other Democrats pitched marijuana legalization as a way to address the historic harms of the war on drugs. One state study found Black Virginians were 3.5 times more likely to be arrested on marijuana charges compared with white people. Those trends persisted even after Virginia reduced penalties for possession to a $25 civil fine. New York and New Jersey also focused on addressing those patterns when governors in those states signed laws to legalize recreational marijuana this year. Northam'\''s proposal sets aside 30% of funds to go to communities affected by the war on drugs, compared with 70% in New Jersey. Another 40% of Virginia'\''s revenue will go toward early childhood education, with the remainder funding public health programs and substance abuse treatment.",
-    "Those plans, and much of the bill'\''s regulatory framework, are still tentative; Virginia lawmakers will have to approve them again during their general session next year. Some criminal justice advocates say lawmakers should also revisit language that creates a penalty for driving with an open container of marijuana. In the absence of retail sales, some members of law enforcement said it'\''s not clear what a container of marijuana will be. The bill specifies a category of social equity applicants, such as people who'\''ve been charged with marijuana-related offenses or who graduated from historically Black colleges and universities. Those entrepreneurs will be given preference when the state grants licensing. Mike Thomas, a Black hemp cultivator based in Richmond who served jail time for marijuana possession, said those entrepreneurs deserved special attention. Thomas said he looked forward to offering his own line of organic, craft cannabis. Being that the arrest rate wasn'\''t the same for everyone, I don'\''t think the business opportunities should be the same for everyone"
+    "Virginia'''s Democratic-controlled Legislature passed a bill legalizing the possession of small amounts of marijuana on Wednesday, making it the 16th state to take the step. Under Virginia'''s new law, adults ages 21 and over can possess an ounce or less of marijuana beginning on July 1, rather than Jan. 1, 2024. Gov. Ralph Northam, a Democrat, proposed moving up the date, arguing it would be a mistake to continue to penalize people for possessing a drug that would soon be legal. Lt. Gov. Justin Fairfax, also a Democrat, broke a 20-20 vote tie in Virginia'''s Senate to pass the bill. No Republicans supported the measure. Democratic House of Delegates Speaker Eileen Filler-Corn hailed the plan. Today, with the Governor'''s amendments, we will have made tremendous progress in ending the targeting of Black and brown Virginians through selective enforcement of marijuana prohibition by this summer she said in a statement. Republicans voiced a number of objections to what they characterized as an unwieldy, nearly 300-page bill. Several criticized measures that would grant licensing preferences to people and groups who'''ve been affected by the war on drugs and make it easier for workers in the industry to unionize. Senate Minority Leader Tommy Norment also questioned Northam'''s motives.",
+    "We have a governor who wants to contribute to the resurrection of his legacy, Norment said, referring to the 2019 discovery of a racist photo in Northam'''s 1984 medical school yearbook. The accelerated timeline sets Virginia cannabis consumers in an unusual predicament. While it will be legal to grow up to four marijuana plants beginning July 1, it could be several years before the state begins licensing recreational marijuana retailers. And unlike other states, the law won'''t allow the commonwealth'''s existing medical dispensaries to begin selling to all adults immediately. Jenn Michelle Pedini, executive director of Virginia NORML, called legalization an incredible victory but said the group would continue to push to allow retail sales to begin sooner.",
+    "In the interest of public and consumer safety, Virginians 21 and older should be able to purchase retail cannabis products at the already operational dispensaries in 2021, not in 2024, Pedini said in a statement. Such a delay will only exacerbate the divide for equity applicants and embolden illicit activity. Northam and other Democrats pitched marijuana legalization as a way to address the historic harms of the war on drugs. One state study found Black Virginians were 3.5 times more likely to be arrested on marijuana charges compared with white people. Those trends persisted even after Virginia reduced penalties for possession to a $25 civil fine. New York and New Jersey also focused on addressing those patterns when governors in those states signed laws to legalize recreational marijuana this year. Northam'''s proposal sets aside 30% of funds to go to communities affected by the war on drugs, compared with 70% in New Jersey. Another 40% of Virginia'''s revenue will go toward early childhood education, with the remainder funding public health programs and substance abuse treatment.",
+    "Those plans, and much of the bill'''s regulatory framework, are still tentative; Virginia lawmakers will have to approve them again during their general session next year. Some criminal justice advocates say lawmakers should also revisit language that creates a penalty for driving with an open container of marijuana. In the absence of retail sales, some members of law enforcement said it'''s not clear what a container of marijuana will be. The bill specifies a category of social equity applicants, such as people who'''ve been charged with marijuana-related offenses or who graduated from historically Black colleges and universities. Those entrepreneurs will be given preference when the state grants licensing. Mike Thomas, a Black hemp cultivator based in Richmond who served jail time for marijuana possession, said those entrepreneurs deserved special attention. Thomas said he looked forward to offering his own line of organic, craft cannabis. Being that the arrest rate wasn'''t the same for everyone, I don'''t think the business opportunities should be the same for everyone",
 ]
 
 
 def test_qa_regular():
     query = "when is marijuana legalized"
-    expected = 'it will be legal to grow up to four marijuana plants beginning July 1'
+    expected = "it will be legal to grow up to four marijuana plants beginning July 1"
     resp, took = send_qa(query, qa_test_context_1)
-    top_answer = resp['answers'][0]['text']
-    scores = [i['null_score_diff'] for i in resp['answers']]
-    print("\nQUESTION: ", query, "\nANSWER: ",
-          top_answer, f"\n (took {took} seconds)\n")
+    top_answer = resp["answers"][0]["text"]
+    scores = [i["null_score_diff"] for i in resp["answers"]]
+    print(
+        "\nQUESTION: ", query, "\nANSWER: ", top_answer, f"\n (took {took} seconds)\n"
+    )
     assert top_answer == expected  # assert response is right
     # assert took < QA_TIMEOUT # assert time
-    assert resp['answers'][0]['null_score_diff'] == min(
-        scores)  # assert is best scoring answer
+    assert resp["answers"][0]["null_score_diff"] == min(
+        scores
+    )  # assert is best scoring answer
 
 
 def test_qa_one_question():
     query = "when is marijuana legalized?"
-    expected = 'it will be legal to grow up to four marijuana plants beginning July 1'
+    expected = "it will be legal to grow up to four marijuana plants beginning July 1"
     resp, took = send_qa(query, qa_test_context_1)
-    top_answer = resp['answers'][0]['text']
-    scores = [i['null_score_diff'] for i in resp['answers']]
-    print("\nQUESTION: ", query, "\nANSWER: ",
-          top_answer, f"\n (took {took} seconds)\n")
+    top_answer = resp["answers"][0]["text"]
+    scores = [i["null_score_diff"] for i in resp["answers"]]
+    print(
+        "\nQUESTION: ", query, "\nANSWER: ", top_answer, f"\n (took {took} seconds)\n"
+    )
     assert top_answer == expected  # assert response is right
     # assert took < QA_TIMEOUT # assert time
-    assert resp['answers'][0]['null_score_diff'] == min(
-        scores)  # assert is best scoring answer
+    assert resp["answers"][0]["null_score_diff"] == min(
+        scores
+    )  # assert is best scoring answer
 
 
 def test_qa_multiple_question():
     query = "when is marijuana legalized???"
-    expected = 'it will be legal to grow up to four marijuana plants beginning July 1'
+    expected = "it will be legal to grow up to four marijuana plants beginning July 1"
     resp, took = send_qa(query, qa_test_context_1)
-    top_answer = resp['answers'][0]['text']
-    scores = [i['null_score_diff'] for i in resp['answers']]
-    print("\nQUESTION: ", query, "\nANSWER: ",
-          top_answer, f"\n (took {took} seconds)\n")
+    top_answer = resp["answers"][0]["text"]
+    scores = [i["null_score_diff"] for i in resp["answers"]]
+    print(
+        "\nQUESTION: ", query, "\nANSWER: ", top_answer, f"\n (took {took} seconds)\n"
+    )
     assert top_answer == expected  # assert response is right
     # assert took < QA_TIMEOUT # assert time
-    assert resp['answers'][0]['null_score_diff'] == min(
-        scores)  # assert is best scoring answer
+    assert resp["answers"][0]["null_score_diff"] == min(
+        scores
+    )  # assert is best scoring answer
 
 
 def test_qa_allcaps():
     query = "WHEN IS MARIJUANA LEGALIZED"
-    expected = 'it will be legal to grow up to four marijuana plants beginning July 1'
+    expected = "it will be legal to grow up to four marijuana plants beginning July 1"
     resp, took = send_qa(query, qa_test_context_1)
-    top_answer = resp['answers'][0]['text']
-    scores = [i['null_score_diff'] for i in resp['answers']]
-    print("\nQUESTION: ", query, "\nANSWER: ",
-          top_answer, f"\n (took {took} seconds)\n")
+    top_answer = resp["answers"][0]["text"]
+    scores = [i["null_score_diff"] for i in resp["answers"]]
+    print(
+        "\nQUESTION: ", query, "\nANSWER: ", top_answer, f"\n (took {took} seconds)\n"
+    )
     assert top_answer == expected  # assert response is right
     # assert took < QA_TIMEOUT # assert time
-    assert resp['answers'][0]['null_score_diff'] == min(
-        scores)  # assert is best scoring answer
+    assert resp["answers"][0]["null_score_diff"] == min(
+        scores
+    )  # assert is best scoring answer
 
 
 def test_qa_apostrophe():
     query = "when's marijuana legalized"
-    expected = 'it will be legal to grow up to four marijuana plants beginning July 1'
+    expected = "it will be legal to grow up to four marijuana plants beginning July 1"
     resp, took = send_qa(query, qa_test_context_1)
-    top_answer = resp['answers'][0]['text']
-    scores = [i['null_score_diff'] for i in resp['answers']]
-    print("\nQUESTION: ", query, "\nANSWER: ",
-          top_answer, f"\n (took {took} seconds)\n")
+    top_answer = resp["answers"][0]["text"]
+    scores = [i["null_score_diff"] for i in resp["answers"]]
+    print(
+        "\nQUESTION: ", query, "\nANSWER: ", top_answer, f"\n (took {took} seconds)\n"
+    )
     assert top_answer == expected  # assert response is right
     # assert took < QA_TIMEOUT # assert time
-    assert resp['answers'][0]['null_score_diff'] == min(
-        scores)  # assert is best scoring answer
+    assert resp["answers"][0]["null_score_diff"] == min(
+        scores
+    )  # assert is best scoring answer
 
 
 def test_qa_past_tense():
     query = "when was marijuana legalized?"
-    expected = 'Wednesday'
+    expected = "Wednesday"
     resp, took = send_qa(query, qa_test_context_1)
-    top_answer = resp['answers'][0]['text']
-    scores = [i['null_score_diff'] for i in resp['answers']]
-    print("\nQUESTION: ", query, "\nANSWER: ",
-          top_answer, f"\n (took {took} seconds)\n")
+    top_answer = resp["answers"][0]["text"]
+    scores = [i["null_score_diff"] for i in resp["answers"]]
+    print(
+        "\nQUESTION: ", query, "\nANSWER: ", top_answer, f"\n (took {took} seconds)\n"
+    )
     assert top_answer == expected  # assert response is right
     # assert took < QA_TIMEOUT # assert time
-    assert resp['answers'][0]['null_score_diff'] == min(
-        scores)  # assert is best scoring answer
+    assert resp["answers"][0]["null_score_diff"] == min(
+        scores
+    )  # assert is best scoring answer
 
 
 def test_qa_future_tense():
     query = "when will marijuana be legal?"
-    expected = 'it will be legal to grow up to four marijuana plants beginning July 1'
+    expected = "it will be legal to grow up to four marijuana plants beginning July 1"
     resp, took = send_qa(query, qa_test_context_1)
-    top_answer = resp['answers'][0]['text']
-    scores = [i['null_score_diff'] for i in resp['answers']]
-    print("\nQUESTION: ", query, "\nANSWER: ",
-          top_answer, f"\n (took {took} seconds)\n")
+    top_answer = resp["answers"][0]["text"]
+    scores = [i["null_score_diff"] for i in resp["answers"]]
+    print(
+        "\nQUESTION: ", query, "\nANSWER: ", top_answer, f"\n (took {took} seconds)\n"
+    )
     assert top_answer == expected  # assert response is right
     # assert took < QA_TIMEOUT # assert time
-    assert resp['answers'][0]['null_score_diff'] == min(
-        scores)  # assert is best scoring answer
+    assert resp["answers"][0]["null_score_diff"] == min(
+        scores
+    )  # assert is best scoring answer
 
 
 def test_qa_specific():
     query = "when will marijuana be legal in Virginia?"
-    expected = 'it will be legal to grow up to four marijuana plants beginning July 1'
+    expected = "it will be legal to grow up to four marijuana plants beginning July 1"
     resp, took = send_qa(query, qa_test_context_1)
-    top_answer = resp['answers'][0]['text']
-    scores = [i['null_score_diff'] for i in resp['answers']]
-    print("\nQUESTION: ", query, "\nANSWER: ",
-          top_answer, f"\n (took {took} seconds)\n")
+    top_answer = resp["answers"][0]["text"]
+    scores = [i["null_score_diff"] for i in resp["answers"]]
+    print(
+        "\nQUESTION: ", query, "\nANSWER: ", top_answer, f"\n (took {took} seconds)\n"
+    )
     assert top_answer == expected  # assert response is right
     # assert took < QA_TIMEOUT # assert time
-    assert resp['answers'][0]['null_score_diff'] == min(
-        scores)  # assert is best scoring answer
+    assert resp["answers"][0]["null_score_diff"] == min(
+        scores
+    )  # assert is best scoring answer
 
 
 def test_qa_outside_scope():
     query = "what is the capital of Assyria?"
-    expected = ''
+    expected = ""
     resp, took = send_qa(query, qa_test_context_1)
-    top_answer = resp['answers'][0]['text']
-    scores = [i['null_score_diff'] for i in resp['answers']]
-    print("\nQUESTION: ", query, "\nANSWER: ",
-          top_answer, f"\n (took {took} seconds)\n")
+    top_answer = resp["answers"][0]["text"]
+    scores = [i["null_score_diff"] for i in resp["answers"]]
+    print(
+        "\nQUESTION: ", query, "\nANSWER: ", top_answer, f"\n (took {took} seconds)\n"
+    )
     assert top_answer == expected  # assert response is right
     # assert took < QA_TIMEOUT # assert time
-    assert resp['answers'][0]['null_score_diff'] == min(
-        scores)  # assert is best scoring answer
+    assert resp["answers"][0]["null_score_diff"] == min(
+        scores
+    )  # assert is best scoring answer
+
 
 # Train Model tests
 
diff --git a/gamechangerml/api/tests/test_examples.py b/gamechangerml/api/tests/test_examples.py
index 14dd9687..83a9c5f1 100644
--- a/gamechangerml/api/tests/test_examples.py
+++ b/gamechangerml/api/tests/test_examples.py
@@ -1,91 +1,108 @@
 class TestSet:
     qa_test_data = {"text": "How manysides does a pentagon have?"}
-    qa_expect = {"answers": ["five"],
-                 "question": "How many sides does a pentagon have?"}
+    qa_expect = {
+        "answers": ["five"],
+        "question": "How many sides does a pentagon have?",
+    }
     text_extract_test_data = {
         "text": "In a major policy revision intended to encourage more schools to welcome children back to in-person instruction, federal health officials on Friday relaxed the six-foot distancing rule for elementary school students, saying they need only remain three feet apart in classrooms as long as everyone is wearing a mask. The three-foot rule also now applies to students in middle schools and high schools, as long as community transmission is not high, officials said. When transmission is high, however, these students must be at least six feet apart, unless they are taught in cohorts, or small groups that are kept separate from others. The six-foot rule still applies in the community at large, officials emphasized, and for teachers and other adults who work in schools, who must maintain that distance from other adults and from students. Most schools are already operating at least partially in person, and evidence suggests they are doing so relatively safely. Research shows in-school spread can be mitigated with simple safety measures such as masking, distancing, hand-washing and open windows. EDUCATION BRIEFING: The pandemic is upending education. Get the latest news and tips."
     }
-    summary_expect = {"extractType": "summary", "extracted": "In a major policy revision intended to encourage more schools to welcome children back to in-person instruction, federal health officials on Friday relaxed the six-foot distancing rule for elementary school students, saying they need only remain three feet apart in classrooms as long as everyone is wearing a mask."}
-    topics_expect = {"extractType": "topics", "extracted": [[0.44866187988155737, "distancing"], [0.30738175379466876, "schools"], [
-        0.3028274099264987, "upending"], [0.26273395468924415, "students"], [0.23815691706519543, "adults"]]}
-    keywords_expect = {"extractType": "keywords",
-                       "extracted": ["six-foot rule", "three-foot rule"]}
+    summary_expect = {
+        "extractType": "summary",
+        "extracted": "In a major policy revision intended to encourage more schools to welcome children back to in-person instruction, federal health officials on Friday relaxed the six-foot distancing rule for elementary school students, saying they need only remain three feet apart in classrooms as long as everyone is wearing a mask.",
+    }
+    topics_expect = {
+        "extractType": "topics",
+        "extracted": [
+            [0.44866187988155737, "distancing"],
+            [0.30738175379466876, "schools"],
+            [0.3028274099264987, "upending"],
+            [0.26273395468924415, "students"],
+            [0.23815691706519543, "adults"],
+        ],
+    }
+    keywords_expect = {
+        "extractType": "keywords",
+        "extracted": ["six-foot rule", "three-foot rule"],
+    }
     sentence_test_data = {"text": "naval command"}
     sentence_search_expect = [
         {
             "id": "OPNAVNOTE 5430.1032.pdf_36",
             "text": "naval forces central command comusnavcent commander u s naval forces southern command comnavso and commander u s naval forces europe commander u s naval forces africa comusnaveur comusnavaf",
             "text_length": 0.2,
-            "score": 0.9124890685081481
+            "score": 0.9124890685081481,
         },
         {
             "id": "OPNAVINST 3440.18.pdf_124",
             "text": "c commander u s naval forces europe africa for ports in the u s european command and the u s africa command area of responsibility",
             "text_length": 0.11060606060606061,
-            "score": 0.7812968355236631
+            "score": 0.7812968355236631,
         },
         {
             "id": "OPNAVINST 3006.1 w CH-2.pdf_178",
             "text": "enclosure naval forces africa commander u s naval forces central command commander u s naval forces southern command shall",
             "text_length": 0.09848484848484848,
-            "score": 0.775530730233048
+            "score": 0.775530730233048,
         },
         {
             "id": "MILPERSMAN 1001-021.pdf_10",
             "text": "major shore commands e g office of the chief of naval operations navy personnel command commander navy reserve forces command etc",
             "text_length": 0.10909090909090909,
-            "score": 0.7683667984875766
+            "score": 0.7683667984875766,
         },
         {
             "id": "OPNAVINST 3440.18.pdf_125",
             "text": "d commander u s naval forces central command for ports in the u s central command area of responsibility and",
             "text_length": 0.07727272727272727,
-            "score": 0.7664882681586526
+            "score": 0.7664882681586526,
         },
         {
             "id": "OPNAVINST 8120.1A.pdf_64",
             "text": "j commander naval sea systems command comnavseasyscom comnavseasyscom is the echelon supporting flag officer to",
             "text_length": 0.08181818181818182,
-            "score": 0.764475125616247
+            "score": 0.764475125616247,
         },
         {
             "id": "DoDD 4500.56 CH 5.pdf_157",
             "text": "m commander u s naval forces europe and commander u s naval forces africa",
             "text_length": 0.024242424242424242,
-            "score": 0.7282583944725268
+            "score": 0.7282583944725268,
         },
         {
             "id": "OPNAVINST 3111.17B.pdf_224",
             "text": "commander u s naval forces europe u s naval forces africa",
             "text_length": 0.0,
-            "score": 0.716657280921936
+            "score": 0.716657280921936,
         },
         {
             "id": "MARINE CORPS MANUAL CH 1-3.pdf_690",
             "text": "navy personnel under the military command of the commandant of the marine corps",
             "text_length": 0.03333333333333333,
-            "score": 0.6932793577512105
+            "score": 0.6932793577512105,
         },
         {
             "id": "SECNAVINST 4200.36B.pdf_28",
             "text": "naval regional commanders and the commandant of the marine corps shall",
             "text_length": 0.019696969696969695,
-            "score": 0.6766319462747284
-        }
+            "score": 0.6766319462747284,
+        },
     ]
 
     word_sim_data = {"text": "naval command"}
-    word_sim_except = {
-        "naval": [
-            "navy",
-            "maritime"
-        ],
-        'command': []
-    }
+    word_sim_except = {"naval": ["navy", "maritime"], "command": []}
 
     recommender_data = {"filenames": ["Title 10"]}
-    recommender_results = {'filenames': ['Title 10'], 'results':  [
-        'Title 50', 'AACP 02.1', 'DoDD 5143.01 CH 2', 'DoDD S-5230.28', 'DoDI 5000.89']}
+    recommender_results = {
+        "filenames": ["Title 10"],
+        "results": [
+            "Title 50",
+            "AACP 02.1",
+            "DoDD 5143.01 CH 2",
+            "DoDD S-5230.28",
+            "DoDI 5000.89",
+        ],
+    }
 
     # extraction_data = {"text": "Carbon emissions trading is poised to go global, and billions of dollars — maybe even trillions — could be at stake. That's thanks to last month's U.N. climate summit in Glasgow Scotland, which approved a new international trading system where companies pay for cuts in greenhouse gas emissions somewhere else, rather than doing it themselves."}
     # extraction_keywords_expect = {
@@ -151,12 +168,12 @@ class TestSet:
         ],
     }
     transformer_list_expect = {
-        'bert-base-cased-squad2',
-        'distilbart-mnli-12-3',
-        'distilbert-base-uncased-distilled-squad',
-        'distilroberta-base',
-        'msmarco-distilbert-base-v2',
-        'msmarco-distilbert-base-v2_20220105'
+        "bert-base-cased-squad2",
+        "distilbart-mnli-12-3",
+        "distilbert-base-uncased-distilled-squad",
+        "distilroberta-base",
+        "msmarco-distilbert-base-v2",
+        "msmarco-distilbert-base-v2_20220105",  # keep the comma: entries below may be re-enabled
         # 'msmarco-distilbert-base-v2_2021-10-17',
         # 'msmarco-distilbert-base-v2_20211210',
     }
@@ -166,12 +183,12 @@ class TestSet:
         "bad_acronyms": "EPA/625/11-91/002, 1992 (ar) 40 CFR 268 (as) 40 CFR 240 (at) 42 U.S.C. 7401 (au) 40 CFR 61 (av) 40 CFR 230 (aw) 33 CFR 320 (ax) 33 CFR 321 (ay) 33 CFR 322 (az) 33 CFR 323 (ba) 33 CFR 325 (bb) 33 CFR 330 (bc) 40 CFR 233 (bd) 16 U.S.C. §§1451-1464 (be) 42 U.S.C. 4321 (bf) 40 CFR 220 (bg) 40 CFR 221 (bh) 40 CFR 222 (bi) 40 CFR 227 (bj) 40 CFR 224 (bk) 40 CFR 228 (bl) 40 CFR 223 (bm) 40 CFR 225 (bn) 40 CFR 226 (bo) 40 CFR 229 (bp) 33 U.S.C. 1401 (bq) 40 CFR 255 (br) 33 CFR 324 (bs) 15 CFR 930",
         "bad_pages": "Page D4-1 – D4-6 Page D4-1 – D4-6 Page D4-9 – D4-10 Page D4-9 – D4-10 Page D4-13 – D4-16 Page D4-13 – D4-16 Page D4-19 – D4-20 Page D4-18a – D4-20 Page E3-1 – E3-25 Page E3-1 – E3-36",
         "bad_long_tokens": "OPLANOPORDPAPACEPCCPCIPDSPHAPMCSPMIPOIPSGPZRTDSBSPOSQDLDRSSASTANAGTACEVACTACSOPTAPTASKORGTB MEDTCTCCCTEWLSTFCTLTLPTMTOETTPUAoperation planoperation orderphysician assistantprimary, alternate, contingency, and emergencypre-combat checkpre-combat inspectionpatient decontamination siteperiodic health assessmentpreventive maintainence checks and servicepatient movement itempoint of injuryplatoon sergeantpickup zonereturn to dutysupply bulletinsupport ope",
-        "bad_toc": "DoDM 4140.68 March 5 2020 TABLE OF CONTENTS 2 TABLE OF CONTENTS SECTION 1 GENERAL ISSUANCE INFORMATION .............................................................................. 4 1.1. Applicability. .................................................................................................................... 4 SECTION 2 RESPONSIBILITIES ......................................................................................................... 5 2.1. Assistant Secretary of Defense for Sustainment. .............................................................. 5 2.2. DLA. ................................................................................................................................. 5 2.3. DoD Component Heads. ................................................................................................... 5 2.4. Secretaries of the Military Departments. .......................................................................... 6 2.5. Commander United States Special Operations Command USSOCOM. ...................... 7 2.6. Administrators of Participating U.S. Government Civil Agencies. .................................. 7 SECTION 3 GENERAL PROCEDURES ................................................................................................ 8 3.1. NIMSCs. ........................................................................................................................... 8 3.2. PICA. .............................................................................................................................. 17 3.3. SICA. .............................................................................................................................. 18 3.4. Exceptions for SOP Items.............................................................................................. 19 3.5. NIMSC Designation........................................................................................................ 
20 SECTION 4 SUPPLY AND DEPOT MAINTENANCE OPERATIONS PROCEDURES ................................ 24 4.1. Procedures for NIMSC 1 2 3 4 5 6 7 8 or 0 items. ................................................. 24 4.2. Provisioning. ................................................................................................................... 24 4.3. PICA Assignment. .......................................................................................................... 25 4.4. IMC Changes and PICA or SICA Reassignment Requests. ........................................... 26 4.5. Item Adoption. ................................................................................................................ 28 4.6. Procurement. ................................................................................................................... 28 4.7. Cataloging. ...................................................................................................................... 29 4.8. Depot Maintenance. ........................................................................................................ 31 4.9. Disposition. ..................................................................................................................... 31 4.10. Inactive Items. ............................................................................................................... 32 4.11. Standardization. ............................................................................................................ 33 SECTION 5 ITEM REVIEW PROCEDURES FOR MIGRATION TO NIMSC 5 OR NIMSC 6 .................. 34 5.1. Review Items for Migration to NIMSC 5 or NIMSC 6. ................................................. 34 5.2. Single Submitter of Procurement Specifications and Depotlevel Repair Specifications. ................................................................................................................... 
35 SECTION 6 NIMSC MIGRATION PROCEDURES ............................................................................. 37 6.1. Migration to NIMSC 5 or NIMSC 6. .............................................................................. 37 6.2. NIMSC Migration or PICA Reassignment. .................................................................... 37 6.3. PreETD TimePeriod. .................................................................................................... 40 6.4. ETD TimePeriod............................................................................................................ 42 6.5. PostETD Timeperiod.................................................................................................... 42 SECTION 7 SUPPLY OPERATIONS PROCEDURES FOR NIMSC 5 AND NIMSC 6 ITEMS ................. 44 7.1. Item Stockage.................................................................................................................. 44 7.2. Requirements Computation and Methodology. .............................................................. 44 7.3. Item Distribution. ............................................................................................................ 44 a. Item Transfer Actions. ................................................................................................. 44 b. Requisition Processing. ................................................................................................ 45 "
-        }
+        "bad_toc": "DoDM 4140.68 March 5 2020 TABLE OF CONTENTS 2 TABLE OF CONTENTS SECTION 1 GENERAL ISSUANCE INFORMATION .............................................................................. 4 1.1. Applicability. .................................................................................................................... 4 SECTION 2 RESPONSIBILITIES ......................................................................................................... 5 2.1. Assistant Secretary of Defense for Sustainment. .............................................................. 5 2.2. DLA. ................................................................................................................................. 5 2.3. DoD Component Heads. ................................................................................................... 5 2.4. Secretaries of the Military Departments. .......................................................................... 6 2.5. Commander United States Special Operations Command USSOCOM. ...................... 7 2.6. Administrators of Participating U.S. Government Civil Agencies. .................................. 7 SECTION 3 GENERAL PROCEDURES ................................................................................................ 8 3.1. NIMSCs. ........................................................................................................................... 8 3.2. PICA. .............................................................................................................................. 17 3.3. SICA. .............................................................................................................................. 18 3.4. Exceptions for SOP Items.............................................................................................. 19 3.5. NIMSC Designation........................................................................................................ 
20 SECTION 4 SUPPLY AND DEPOT MAINTENANCE OPERATIONS PROCEDURES ................................ 24 4.1. Procedures for NIMSC 1 2 3 4 5 6 7 8 or 0 items. ................................................. 24 4.2. Provisioning. ................................................................................................................... 24 4.3. PICA Assignment. .......................................................................................................... 25 4.4. IMC Changes and PICA or SICA Reassignment Requests. ........................................... 26 4.5. Item Adoption. ................................................................................................................ 28 4.6. Procurement. ................................................................................................................... 28 4.7. Cataloging. ...................................................................................................................... 29 4.8. Depot Maintenance. ........................................................................................................ 31 4.9. Disposition. ..................................................................................................................... 31 4.10. Inactive Items. ............................................................................................................... 32 4.11. Standardization. ............................................................................................................ 33 SECTION 5 ITEM REVIEW PROCEDURES FOR MIGRATION TO NIMSC 5 OR NIMSC 6 .................. 34 5.1. Review Items for Migration to NIMSC 5 or NIMSC 6. ................................................. 34 5.2. Single Submitter of Procurement Specifications and Depotlevel Repair Specifications. ................................................................................................................... 
35 SECTION 6 NIMSC MIGRATION PROCEDURES ............................................................................. 37 6.1. Migration to NIMSC 5 or NIMSC 6. .............................................................................. 37 6.2. NIMSC Migration or PICA Reassignment. .................................................................... 37 6.3. PreETD TimePeriod. .................................................................................................... 40 6.4. ETD TimePeriod............................................................................................................ 42 6.5. PostETD Timeperiod.................................................................................................... 42 SECTION 7 SUPPLY OPERATIONS PROCEDURES FOR NIMSC 5 AND NIMSC 6 ITEMS ................. 44 7.1. Item Stockage.................................................................................................................. 44 7.2. Requirements Computation and Methodology. .............................................................. 44 7.3. Item Distribution. ............................................................................................................ 44 a. Item Transfer Actions. ................................................................................................. 44 b. Requisition Processing. ................................................................................................ 45 ",
+    }
     sent_index_processing_results = {
         "has_many_short_tokens": [False, True, True, False, False],
         "has_many_repeating": [False, True, True, False, False],
         "has_extralong_tokens": [False, False, False, True, False],
         "is_a_toc": [False, False, False, False, False],
-        "check_quality": [True, False, False, False, False]
-    }
\ No newline at end of file
+        "check_quality": [True, False, False, False, False],
+    }
diff --git a/gamechangerml/src/text_handling/corpus.py b/gamechangerml/src/text_handling/corpus.py
index 5cd6077d..cba0220c 100644
--- a/gamechangerml/src/text_handling/corpus.py
+++ b/gamechangerml/src/text_handling/corpus.py
@@ -18,15 +18,19 @@ def __init__(
         min_token_len=3,
         verbose=False,
         bert_based_tokenizer=None,
-        files_to_use=None
+        files_to_use=None,
     ):
         self.directory = directory
-        if files_to_use: ## if we only want to do this on a subset
-            self.file_list = list(set([os.path.join(directory, i) for i in files_to_use]).intersection([
-                os.path.join(directory, file)
-                for file in os.listdir(directory)
-                if file[-5:] == ".json"
-            ]))
+        if files_to_use:  ## if we only want to do this on a subset
+            self.file_list = list(
+                set([os.path.join(directory, i) for i in files_to_use]).intersection(
+                    [
+                        os.path.join(directory, file)
+                        for file in os.listdir(directory)
+                        if file[-5:] == ".json"
+                    ]
+                )
+            )
         else:
             self.file_list = [
                 os.path.join(directory, file)
@@ -72,7 +76,7 @@ def __iter__(self):
                                     yield tokens, para_id
                                 else:
                                     yield tokens
-                    
+
                 progress += 1
                 processmanager.update_status(
                     processmanager.loading_corpus,
diff --git a/gamechangerml/src/utilities/text_utils.py b/gamechangerml/src/utilities/text_utils.py
index 9528dc6a..5dfff235 100644
--- a/gamechangerml/src/utilities/text_utils.py
+++ b/gamechangerml/src/utilities/text_utils.py
@@ -161,134 +161,157 @@ def clean_text(doc_text: str) -> str:
 
     return text
 
+
 # Source: https://rajpurkar.github.io/SQuAD-explorer/
 def normalize_answer(s: str) -> str:
     """
     Normalize answers for QA evaluation.
     Lower text and remove punctuation, articles and extra whitespace.
     """
+
     def remove_articles(text):
-        regex = re.compile(r'\b(a|an|the)\b', re.UNICODE)
-        return re.sub(regex, ' ', text)
+        regex = re.compile(r"\b(a|an|the)\b", re.UNICODE)
+        return re.sub(regex, " ", text)
+
     def white_space_fix(text):
-        return ' '.join(text.split())
+        return " ".join(text.split())
+
     def remove_punc(text):
         exclude = set(punctuation)
-        return ''.join(ch for ch in text if ch not in exclude)
+        return "".join(ch for ch in text if ch not in exclude)
+
     def lower(text):
         return text.lower()
+
     return white_space_fix(remove_articles(remove_punc(lower(s))))
 
+
 def normalize_query(s: str) -> str:
     """
     Normalize queries.
     Lower text and remove extra whitespace.
     """
+
     def white_space_fix(text):
-        return ' '.join(text.strip().lstrip().split())
+        return " ".join(text.strip().lstrip().split())
+
     def lower(text):
         return text.lower()
+
     def remove_quotes(text):
         exclude = ["'", '"']
-        return ''.join(ch for ch in text if ch not in exclude)
+        return "".join(ch for ch in text if ch not in exclude)
+
     return white_space_fix(remove_quotes(lower(s)))
 
+
 def clean_query(query: str) -> str:
-    '''Removes all non alphanumeric characters and 'and' / 'or' from query string'''
+    """Removes all non alphanumeric characters and 'and' / 'or' from query string"""
 
-    stop = ['and', 'or']
+    stop = ["and", "or"]
     query = [i for i in query.lower().split() if i not in stop]
-    query = re.sub(r'[^ a-zA-Z]', '', ' '.join(query))
-    query = ' '.join(query.strip().lstrip().split())
-    
-    return query 
+    query = re.sub(r"[^ a-zA-Z]", "", " ".join(query))
+    query = " ".join(query.strip().lstrip().split())
+
+    return query
+
 
 def get_tokens(s: str) -> List[str]:
-    '''Get tokens from normalized answer.'''
-    if not s: return []
+    """Get tokens from normalized answer."""
+    if not s:
+        return []
     return s.split()
 
-def has_many_short_tokens(processed_tokens, threshold = 4.0):
-    '''Checks if the median length of tokens is less than the expected threshold'''
+
+def has_many_short_tokens(processed_tokens, threshold=4.0):
+    """Checks if the median length of tokens is less than the expected threshold"""
     median_len = np.median([len(i) for i in processed_tokens])
     if median_len <= threshold:
         return True
     else:
         return False
 
-def has_many_repeating(text, tokens, threshold = 0.6):
-    '''Checks if the ratio of unique tokens is less than an expected threshold'''
-    ratio_unique = len(set(tokens)) / len(text.split(' '))
+
+def has_many_repeating(text, tokens, threshold=0.6):
+    """Checks if the ratio of unique tokens is less than an expected threshold"""
+    ratio_unique = len(set(tokens)) / len(text.split(" "))
     if ratio_unique < threshold:
         return True
     else:
         return False
 
-def has_extralong_tokens(text, threshold = 25):
-    '''Checks if the paragraph has a token that exceeds the threshold for normal token length'''
-    longest_token = np.max([len(i) for i in text.split(' ')])
+
+def has_extralong_tokens(text, threshold=25):
+    """Checks if the paragraph has a token that exceeds the threshold for normal token length"""
+    longest_token = np.max([len(i) for i in text.split(" ")])
     if longest_token > threshold:
         return True
     else:
         return False
 
+
 def is_a_toc(text):
-    '''Checks if a paragraph appears to be a table of contents'''
-    toc_separation = re.findall(r'(\.{3,})', text)
+    """Checks if a paragraph appears to be a table of contents"""
+    toc_separation = re.findall(r"(\.{3,})", text)
     if len(toc_separation) > 0:
         return True
     else:
         return False
 
+
 def majority_tokens_filtered(tokens, text):
-    '''Checks if most of the tokens were filtered out'''
-    if (len(tokens) / len(text.split(' '))) <= 0.5:
+    """Checks if most of the tokens were filtered out after processing"""
+    if (len(tokens) / len(text.split(" "))) <= 0.5:
         return True
     else:
         return False
 
+
 def check_quality_paragraph(tokens, text):
-    '''Runs functions to check that a paragraph isn't a junk paragraph'''
+    """Runs filter functions to check that a paragraph isn't a junk paragraph"""
 
     if majority_tokens_filtered(tokens, text):
         return False
-    if has_many_short_tokens(tokens, threshold = 4.0):
+    if has_many_short_tokens(tokens, threshold=4.0):
         return False
-    elif has_many_repeating(text, tokens, threshold = 0.6):
+    elif has_many_repeating(text, tokens, threshold=0.6):
         return False
-    elif has_extralong_tokens(text, threshold = 25):
+    elif has_extralong_tokens(text, threshold=25):
         return False
     elif is_a_toc(text):
         return False
     else:
         return True
 
+
 # Adapted from https://www.datacamp.com/community/tutorials/fuzzy-string-python
-def levenshtein_ratio_and_distance(s: str, t: str, ratio_calc: bool=False) -> Tuple[int,float]:
-    """ levenshtein_ratio_and_distance:
-        Calculates levenshtein distance between two strings.
-        If ratio_calc = True, the function computes the
-        levenshtein distance ratio of similarity between two strings
-        For all i and j, distance[i,j] will contain the Levenshtein
-        distance between the first i characters of s and the
-        first j characters of t
+def levenshtein_ratio_and_distance(
+    s: str, t: str, ratio_calc: bool = False
+) -> Tuple[int, float]:
+    """levenshtein_ratio_and_distance:
+    Calculates levenshtein distance between two strings.
+    If ratio_calc = True, the function computes the
+    levenshtein distance ratio of similarity between two strings
+    For all i and j, distance[i,j] will contain the Levenshtein
+    distance between the first i characters of s and the
+    first j characters of t
     """
     # Initialize matrix of zeros
-    rows = len(s)+1
-    cols = len(t)+1
-    distance = np.zeros((rows,cols),dtype = int)
+    rows = len(s) + 1
+    cols = len(t) + 1
+    distance = np.zeros((rows, cols), dtype=int)
 
     # Populate matrix of zeros with the indeces of each character of both strings
     for i in range(1, rows):
-        for k in range(1,cols):
+        for k in range(1, cols):
             distance[i][0] = i
             distance[0][k] = k
 
-    # Iterate over the matrix to compute the cost of deletions,insertions and/or substitutions    
+    # Iterate over the matrix to compute the cost of deletions,insertions and/or substitutions
     for col in range(1, cols):
         for row in range(1, rows):
-            if s[row-1] == t[col-1]:
-                cost = 0 # If the characters are the same in the two strings in a given position [i,j] then the cost is 0
+            if s[row - 1] == t[col - 1]:
+                cost = 0  # If the characters are the same in the two strings in a given position [i,j] then the cost is 0
             else:
                 # In order to align the results with those of the Python Levenshtein package, if we choose to calculate the ratio
                 # the cost of a substitution is 2. If we calculate just distance, then the cost of a substitution is 1.
@@ -296,15 +319,18 @@ def levenshtein_ratio_and_distance(s: str, t: str, ratio_calc: bool=False) -> Tu
                     cost = 2
                 else:
                     cost = 1
-            distance[row][col] = min(distance[row-1][col] + 1,      # Cost of deletions
-                                 distance[row][col-1] + 1,          # Cost of insertions
-                                 distance[row-1][col-1] + cost)     # Cost of substitutions
-    Ratio = ((len(s)+len(t)) - distance[row][col]) / (len(s)+len(t))
-    
+            distance[row][col] = min(
+                distance[row - 1][col] + 1,  # Cost of deletions
+                distance[row][col - 1] + 1,  # Cost of insertions
+                distance[row - 1][col - 1] + cost,  # Cost of substitutions
+            )
+    Ratio = ((len(s) + len(t)) - distance[row][col]) / (len(s) + len(t))
+
     return distance[row][col], Ratio
 
+
 def string_contains(str1: str, str2: str) -> bool:
-    '''Checks if a str2 contains str1'''
+    """Checks if str2 contains str1"""
     set1 = str1.lower().split()
     set2 = str2.lower().split()
     if len(set(set1).intersection(set2)) == len(set1):
@@ -312,22 +338,24 @@ def string_contains(str1: str, str2: str) -> bool:
     else:
         return False
 
-def check_majority_numbers(query: str, ratio: float=0.6) -> bool:
-    '''Checks ratio of numerical characters in a string, True if ratio is less than ratio threshold'''
-    
-    if len(re.sub(r'[0-9]', '', query))/len(query) <= ratio:
+
+def check_majority_numbers(query: str, ratio: float = 0.6) -> bool:
+    """Checks the share of non-numeric characters in a string; True if it is at or below the ratio threshold"""
+
+    if len(re.sub(r"[0-9]", "", query)) / len(query) <= ratio:
         return True
     else:
         return False
 
-def sort_first(samples: List[str]) -> Dict[str,List[str]]:
-    '''Makes a dictionary of first letter: string for faster lookup of strings'''
+
+def sort_first(samples: List[str]) -> Dict[str, List[str]]:
+    """Makes a dictionary of first letter: string for faster lookup of strings"""
 
     doc_dict = {}
     docs = []
     first_letters = []
     for i in list(set(samples)):
-        if type(i)==str:
+        if type(i) == str:
             first_letters.append(str(i)[0].lower())
             docs.append(i)
     zipped = dict(zip(docs, first_letters))
@@ -336,18 +364,21 @@ def sort_first(samples: List[str]) -> Dict[str,List[str]]:
 
     return doc_dict
 
+
 def filter_title_queries(queries: List[str], doc_ids: List[str]) -> List[str]:
-    '''Collects list of queries that appear in a list of doc_ids/appear to look like doc_ids'''
-    
+    """Collects list of queries that appear in a list of doc_ids/appear to look like doc_ids"""
+
     remove = []
     logger.info("Making dictionary for doc titles")
     doc_dict = sort_first(doc_ids)
     logger.info("*** Comparing queries to doc titles\n")
     for i in queries:
-        if not re.search('[a-zA-Z]', i):  ## if the query has no letters, remove
+        if not re.search("[a-zA-Z]", i):  ## if the query has no letters, remove
             logger.info(f"*** Removing query: {i} // (contains no characters)")
             remove.append(i)
-        elif re.search('[0-9]', i):       ## if there are numbers in the query, compare to titles
+        elif re.search(
+            "[0-9]", i
+        ):  ## if there are numbers in the query, compare to titles
             if i.lower() in list(set([q.lower() for q in doc_ids])):
                 logger.info(f"*** Removing query: {i} // (in doc ids)")
                 remove.append(i)
@@ -357,21 +388,29 @@ def filter_title_queries(queries: List[str], doc_ids: List[str]) -> List[str]:
             else:
                 try:
                     cleaned = i.upper().replace("'", "")
-                    start = cleaned[0].lower() # starting letter
+                    start = cleaned[0].lower()  # starting letter
                     sub = doc_dict[start]
                     for x in sub:
                         if string_contains(cleaned, x):
-                            logger.info(f"*** Removing query: {i} // (string inside string)")
+                            logger.info(
+                                f"*** Removing query: {i} // (string inside string)"
+                            )
                             remove.append(i)
                             break
                         else:
-                            dist, ratio = levenshtein_ratio_and_distance(cleaned.lower(),x.lower())
+                            dist, ratio = levenshtein_ratio_and_distance(
+                                cleaned.lower(), x.lower()
+                            )
                             if len(i) > 12 and ratio >= 0.75:
-                                logger.info(f"*** Removing query: {i} // ({dist} char, {ratio} ratio diff from doc title)")
+                                logger.info(
+                                    f"*** Removing query: {i} // ({dist} char, {ratio} ratio diff from doc title)"
+                                )
                                 remove.append(i)
                                 break
                             elif len(i) < 12 and dist <= 2:
-                                logger.info(f"*** Removing query: {i} // ({dist} char, {ratio} ratio diff from doc title)")
+                                logger.info(
+                                    f"*** Removing query: {i} // ({dist} char, {ratio} ratio diff from doc title)"
+                                )
                                 remove.append(i)
                                 break
                 except Exception as e:

From a84386f78cd8dc307e5de67fbf06b31b382c6353 Mon Sep 17 00:00:00 2001
From: Kate Dowdy <dowdy_katherine@bah.com>
Date: Mon, 18 Apr 2022 12:55:27 -0600
Subject: [PATCH 4/5] fixing tests/typo

---
 gamechangerml/api/tests/api_tests.py      | 3 +--
 gamechangerml/api/tests/test_examples.py  | 6 +++---
 gamechangerml/src/utilities/text_utils.py | 2 +-
 3 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/gamechangerml/api/tests/api_tests.py b/gamechangerml/api/tests/api_tests.py
index 8c48f74e..090c33e4 100644
--- a/gamechangerml/api/tests/api_tests.py
+++ b/gamechangerml/api/tests/api_tests.py
@@ -12,12 +12,11 @@
 
 from gamechangerml.src.search.query_expansion.utils import remove_original_kw
 from gamechangerml.src.text_handling.process import preprocess
-from gamechamgerml.src.utilities.text_utils import (
+from gamechangerml.src.utilities.text_utils import (
     has_many_short_tokens,
     has_many_repeating,
     has_extralong_tokens,
     is_a_toc,
-    majority_tokens_filtered,
     check_quality_paragraph,
 )
 
diff --git a/gamechangerml/api/tests/test_examples.py b/gamechangerml/api/tests/test_examples.py
index 83a9c5f1..6350f451 100644
--- a/gamechangerml/api/tests/test_examples.py
+++ b/gamechangerml/api/tests/test_examples.py
@@ -187,8 +187,8 @@ class TestSet:
     }
     sent_index_processing_results = {
         "has_many_short_tokens": [False, True, True, False, False],
-        "has_many_repeating": [False, True, True, False, False],
-        "has_extralong_tokens": [False, False, False, True, False],
-        "is_a_toc": [False, False, False, False, False],
+        "has_many_repeating": [False, True, True, False, True],
+        "has_extralong_tokens": [False, False, False, True, True],
+        "is_a_toc": [False, False, False, False, True],
         "check_quality": [True, False, False, False, False],
     }
diff --git a/gamechangerml/src/utilities/text_utils.py b/gamechangerml/src/utilities/text_utils.py
index 5dfff235..e3c6ce6f 100644
--- a/gamechangerml/src/utilities/text_utils.py
+++ b/gamechangerml/src/utilities/text_utils.py
@@ -252,7 +252,7 @@ def has_extralong_tokens(text, threshold=25):
 
 def is_a_toc(text):
     """Checks if a paragraph appears to be a table of contents"""
-    toc_separation = re.findall(r"(\.{3,})", text)
+    toc_separation = re.findall(r"(\.{6,})", text)
     if len(toc_separation) > 0:
         return True
     else:

From 8a029c60e579730bed80cf87de09e9d93aa0bbcb Mon Sep 17 00:00:00 2001
From: Kate Dowdy <dowdy_katherine@bah.com>
Date: Tue, 19 Apr 2022 23:29:10 -0600
Subject: [PATCH 5/5] making less strict thresholds

---
 gamechangerml/src/utilities/text_utils.py | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/gamechangerml/src/utilities/text_utils.py b/gamechangerml/src/utilities/text_utils.py
index e3c6ce6f..a34d8f3a 100644
--- a/gamechangerml/src/utilities/text_utils.py
+++ b/gamechangerml/src/utilities/text_utils.py
@@ -223,7 +223,7 @@ def get_tokens(s: str) -> List[str]:
     return s.split()
 
 
-def has_many_short_tokens(processed_tokens, threshold=4.0):
+def has_many_short_tokens(processed_tokens, threshold):
     """Checks if the median length of tokens is less than the expected threshold"""
     median_len = np.median([len(i) for i in processed_tokens])
     if median_len <= threshold:
@@ -232,7 +232,7 @@ def has_many_short_tokens(processed_tokens, threshold=4.0):
         return False
 
 
-def has_many_repeating(text, tokens, threshold=0.6):
+def has_many_repeating(text, tokens, threshold):
     """Checks if the ratio of unique tokens is less than an expected threshold"""
     ratio_unique = len(set(tokens)) / len(text.split(" "))
     if ratio_unique < threshold:
@@ -241,10 +241,12 @@ def has_many_repeating(text, tokens, threshold=0.6):
         return False
 
 
-def has_extralong_tokens(text, threshold=25):
-    """Checks if the paragraph has a token that exceeds the threshold for normal token length"""
-    longest_token = np.max([len(i) for i in text.split(" ")])
-    if longest_token > threshold:
+def has_extralong_tokens(text, threshold):
+    """Checks if more than 5% of the paragraph's (non-website) tokens exceed the threshold for normal token length"""
+    websites = ['http', 'www.']
+    tokens = [i for i in text.split(" ") if i[:4] not in websites]
+    long_tokens = [i for i in tokens if len(i) > threshold]
+    if len(long_tokens) / len(tokens) > 0.05:
         return True
     else:
         return False
@@ -272,9 +274,9 @@ def check_quality_paragraph(tokens, text):
 
     if majority_tokens_filtered(tokens, text):
         return False
-    if has_many_short_tokens(tokens, threshold=4.0):
+    if has_many_short_tokens(tokens, threshold=2.5):
         return False
-    elif has_many_repeating(text, tokens, threshold=0.6):
+    elif has_many_repeating(text, tokens, threshold=0.2):
         return False
     elif has_extralong_tokens(text, threshold=25):
         return False