diff --git a/gamechangerml/api/fastapi/routers/controls.py b/gamechangerml/api/fastapi/routers/controls.py index 9d0486d0..c51c45d7 100644 --- a/gamechangerml/api/fastapi/routers/controls.py +++ b/gamechangerml/api/fastapi/routers/controls.py @@ -664,18 +664,32 @@ def update_metadata(model_dict): try: index_path = model_dict["index_path"] except: - index_path = os.path.join(MODEL_PATH, "sent_index_20210715") + index_path = os.path.join( + Config.LOCAL_PACKAGED_MODELS_DIR, model_dict["sentence"] + ) try: update_eval_data = model_dict['update_eval_data'] except: update_eval_data = False + try: + testing_only = model_dict["testing_only"] + except: + testing_only = False + try: + upload = model_dict["upload"] + except: + upload = True + + logger.info(f"Testing only is set to: {testing_only}") args = { "meta_steps": meta_steps, "corpus_dir": corpus_dir, "retriever": retriever, "index_path": index_path, - "update_eval_data": update_eval_data + "update_eval_data": update_eval_data, + "testing_only": testing_only, + "upload": upload } pipeline.run( @@ -695,6 +709,10 @@ def finetune_sentence(model_dict): remake_train_data = model_dict["remake_train_data"] except: remake_train_data = False + try: + model = model_dict["model"] + except: + model = None args = { "batch_size": 8, "epochs": int(model_dict["epochs"]), @@ -702,6 +720,7 @@ def finetune_sentence(model_dict): "testing_only": bool(testing_only), "remake_train_data": bool(remake_train_data), "retriever": MODELS.sentence_searcher, + "model": model } pipeline.run( build_type="sent_finetune", @@ -749,10 +768,14 @@ def train_qexp(model_dict): def run_evals(model_dict): logger.info("Attempting to run evaluation") + try: + sample_limit = int(model_dict["sample_limit"]) + except: + sample_limit = 15000 args = { "model_name": model_dict["model_name"], "eval_type": model_dict["eval_type"], - "sample_limit": int(model_dict["sample_limit"]), + "sample_limit": sample_limit, "validation_data": model_dict["validation_data"], } pipeline.run( diff --git a/gamechangerml/api/tests/api_tests.py b/gamechangerml/api/tests/api_tests.py index cbfb1e49..49222b5f 100644 --- a/gamechangerml/api/tests/api_tests.py +++ b/gamechangerml/api/tests/api_tests.py @@ -11,7 +11,7 @@ from http.client import HTTPConnection # py3 from gamechangerml.src.search.query_expansion.utils import remove_original_kw -# from gamechangerml import DATA_PATH +#from gamechangerml import DATA_PATH from .test_examples import TestSet @@ -301,17 +301,6 @@ def test_qa_outside_scope(): # resp = http.post(API_URL + "/trainModel", json=model_dict) # assert resp.ok == True -# def test_trainModel_sent_finetune(): -# model_dict = { -# "build_type": "sent_finetune", -# "batch_size": 32, -# "epochs": 1, -# "warmup_steps": 100, -# "testing_only": True -# } -# resp = http.post(API_URL + "/trainModel", json=model_dict) -# assert resp.ok == True - # def test_trainModel_eval_squad(): # model_dict = { # "build_type": "eval", diff --git a/gamechangerml/configs/config.py b/gamechangerml/configs/config.py index e36aea23..a02b5b99 100644 --- a/gamechangerml/configs/config.py +++ b/gamechangerml/configs/config.py @@ -115,7 +115,6 @@ class ValidationConfig: "validation_dir": os.path.join(DATA_PATH, "validation"), "evaluation_dir": os.path.join(DATA_PATH, "evaluation"), "user_dir": os.path.join(DATA_PATH, "user_data"), - # location with smaller set of corpus JSONs "test_corpus_dir": "gamechangerml/test_corpus", "squad": { "dev": "original/squad2.0/dev-v2.0.json", diff --git a/gamechangerml/data/test_data/MatamoFeedback_TEST.csv b/gamechangerml/data/test_data/MatamoFeedback_TEST.csv new file mode 100644 index 00000000..48e42938 --- /dev/null +++ b/gamechangerml/data/test_data/MatamoFeedback_TEST.csv @@ -0,0 +1,31 @@ +,event_name,createdAt,user_id,value_1,value_2,value_3,value_4,value_7,value_5 +39,intelligent_search_thumbs_up,2021-08-31T17:14:00.347Z,12345,search_text: dcma,title_returned: Memo Joint Officer Handbook - Staffing and Action Guide (2011),,,, +2,intelligent_search_thumbs_up,2021-04-05T18:19:15.673Z,12345,title_returned: DoDD 5110.04 Washington Headquarters Services (WHS),search_text: plain language policy,,,, +16,qa_thumbs_down,2021-05-25T15:30:03.401Z,12345,question: who is the president?,QA answer: Russell T. Voughr,,,, +38,intelligent_search_thumbs_up,2021-08-27T16:39:26.347Z,12345,search_text: senior accountable official,title_returned: DoDI 5010.40 Managers' Internal Control Program Procedures,,,, +8,intelligent_search_thumbs_up,2021-04-22T12:30:23.340Z,12345,title_returned: AFI 11-235 SPECIALIZED REFUELING OPERATIONS,search_text: wet wing,,,, +34,intelligent_search_thumbs_up,2021-08-13T00:56:55.410Z,12345,search_text: security cooperation,title_returned: Memo 2015 - CNAS - Security Cooperation and Assistance,,,, +41,intelligent_search_thumbs_up,2021-09-23T14:41:50.557Z,12345,search_text: telework,title_returned: AFI 36-816 Civilian Telework Program,,,, +40,intelligent_search_thumbs_up,2021-09-08T23:54:40.935Z,12345,search_text: military,title_returned: DoDI 1332.45 Retention Determinations for Non-Deployable Service Members,,,, +20,qa_thumbs_down,2021-05-27T16:23:01.079Z,12345,question: who is the secretary of defense?,QA answer: David L. Norquist,,,, +1,intelligent_search_thumbs_up,2021-04-05T18:16:55.146Z,12345,title_returned: CJCSM 3500.03E Joint Training Manual for the Armed Forces of the United States,search_text: cyber range,,,, +23,intelligent_search_thumbs_up,2021-06-24T19:45:57.054Z,12345,title_returned: DoDI 1000.01 Identification (ID) Cards Required by the Geneva Conventions,search_text: who is sergeant major of the army,,,, +15,intelligent_search_thumbs_up,2021-05-21T16:25:13.292Z,12345,title_returned: DoDD 5105.60 National Geospatial-Intelligence Agency (NGA),search_text: geospatial,,,, +27,intelligent_search_thumbs_up,2021-07-21T13:48:49.176Z,12345,"search_text: ""use of alcohol"" and events",title_returned: AFI 34-219 ALCOHOLIC BEVERAGE PROGRAM,,,, +42,intelligent_search_thumbs_up,2021-09-23T16:11:31.850Z,12345,search_text: physical fitness,title_returned: MCO 1700.39 MARINE CORPS RECREATION PROGRAMS,,,, +35,intelligent_search_thumbs_up,2021-08-13T03:16:11.860Z,12345,search_text: pizza,title_returned: MISC PUBS GREECE,,,, +31,intelligent_search_thumbs_up,2021-08-03T19:02:06.140Z,12345,search_text: navy,title_returned: OPNAVINST 4650.17 UNUSUALLY ARDUOUS SEA DUTY FOR TRAVEL AND TRANSPORTATION ENTITLEMENTS,,,, +5,intelligent_search_thumbs_up,2021-04-15T11:42:48.684Z,12345,title_returned: DoDD 3000.06 Combat Support Agencies (CSAs),search_text: CSA,,,, +10,intelligent_search_thumbs_up,2021-05-11T19:38:12.825Z,12345,title_returned: SECNAVINST 5100.10K DEPARTMENT OF THE NAVY SAFETY PROGRAM,search_text: safety and occupational health,,,, +37,intelligent_search_thumbs_up,2021-08-25T18:08:27.541Z,12345,search_text: control system automation,title_returned: CIM 11000.7 FACILITIES ENERGY MANUAL,,,, +25,qa_thumbs_up,2021-07-15T14:36:45.517Z,12345,question: what is the mission of the national institute of health?,"QA answer: increase research in the field of viral disease causes, prevention, and treatment",,,, +30,intelligent_search_thumbs_up,2021-08-03T19:00:42.759Z,12345,search_text: telework,title_returned: AFI 36-816 Civilian Telework Program,,,, +4,intelligent_search_thumbs_up,2021-04-14T18:18:26.068Z,12345,title_returned: DoDD 5105.77 National Guard Bureau (NGB),"search_text: ""National Guard"" and NGB",,,, +29,intelligent_search_thumbs_up,2021-08-03T12:50:35.464Z,12345,search_text: International Cooperative Administrative Support Services (ICASS),title_returned: DoDI 7060.06 International Cooperative Administrative Support Services (ICASS),,,, +0,intelligent_search_thumbs_up,2021-03-30T13:13:04.319Z,12345,title_returned: CJCSI 3207.01C Department of Defense Support to Humanitarian Mine Action,search_text: USAR,,,, +13,intelligent_search_thumbs_up,2021-05-18T15:35:06.388Z,12345,title_returned: CJCSI 5123.01H Charter of the Joint Requirements Oversight Council (JROC) and the Implementation of the Joint Capabilities Integration and Development System,search_text: interoperability,,,, +14,qa_thumbs_up,2021-05-21T00:09:46.558Z,12345,question: who is the sergeant major of the army?,QA answer: Sergeant Major of the Army will serve as the senior enlisted assistant and advisor to the Chief of Staff,,,, +19,qa_thumbs_up,2021-05-26T15:15:29.828Z,12345,question: what is jadc2?,QA answer: MDC2 is renamed Joint All Domain Command and Control ( JADC2 ). 2. ( U ) The JROC acknowledges that the campaign plan is a living document that will evolve as experiments and exercises shape common understanding of JADC2,,,, +17,qa_thumbs_down,2021-05-25T16:53:38.305Z,12345,question: what is the mission of dcma?,"QA answer: The mission of Headquarters, US Army Western Command is to serve as the Army component to CINCPAC for the Pacific Command",,,, +32,intelligent_search_thumbs_up,2021-08-09T14:02:18.534Z,12345,"search_text: ""synchronizer"" and ""intelligence""",title_returned: DoDD 5143.01 Under Secretary of Defense for Intelligence and Security (USD(I&S)),,,, +7,intelligent_search_thumbs_up,2021-04-21T13:13:18.303Z,12345,title_returned: SECNAVINST 1752.4C SEXUAL ASSAULT PREVENTION AND RESPONSE PROGRAM PROCEDURES,search_text: sexual assault prevention,,,, \ No newline at end of file diff --git a/gamechangerml/data/test_data/SearchPDFMapping_TEST.csv b/gamechangerml/data/test_data/SearchPDFMapping_TEST.csv new file mode 100644 index 00000000..2a0e886d --- /dev/null +++ b/gamechangerml/data/test_data/SearchPDFMapping_TEST.csv @@ -0,0 +1,107 @@ +,index,idvisit,idaction_name,document,documenttime,search_cat,search,searchtime,clone_name +0,45792,369564,40360,,,GAMECHANGER_gamechanger_combined,equipment storage,2021-09-02T12:44:47.000Z, +1,10686,302108,34784,EO 13603.pdf,2021-07-27T17:42:52.000Z,GAMECHANGER_gamechanger_combined,EO 13603,2021-07-27T17:42:43.000Z,gamechanger +2,55601,466750,49795,,,GAMECHANGER_gamechanger_combined,Airfield Resource Protection,2021-12-01T04:35:30.000Z, +3,12176,329363,37004,TC 3-04.15.pdf,2021-08-09T16:43:02.000Z,GAMECHANGER_gamechanger_combined,aeronautical navigation aafif,2021-08-09T16:42:11.000Z,gamechanger +4,28616,240862,3061,,,GAMECHANGER_gamechanger_combined,5143.01,2021-05-14T16:37:09.000Z, +5,8885,284864,32452,DoDI 5000.75 CH 2.pdf,2021-07-14T17:56:46.000Z,GAMECHANGER_gamechanger_combined,BCAC approval,2021-07-14T17:56:01.000Z, +6,2363,237417,3972,"DEP SEC DEF Memo, Total Force Manpower Governance for OSD, Defense Agencies, and DoD Field Activities, 6 6 2017 OCR.pdf",2021-05-12T16:33:03.000Z,GAMECHANGER_gamechanger_combined,manpower,2021-05-12T16:29:52.000Z, +7,26165,509984,55222,DoDM 4140.26 Volume 3.pdf,2022-01-25T12:48:28.000Z,GAMECHANGER_gamechanger_combined,DOD 4140.26,2022-01-25T12:48:07.000Z,gamechanger +8,23656,480871,52182,AFI 10-201.pdf,2021-12-15T15:35:45.000Z,GAMECHANGER_gamechanger_combined,"employed in place",2021-12-15T15:35:19.000Z,gamechanger +9,55890,468906,3214,,,GAMECHANGER_gamechanger_combined,enterprise architecture,2021-12-02T15:55:26.000Z, +10,54607,459930,48942,,,GAMECHANGER_gamechanger_combined,functional flight check,2021-11-23T01:42:21.000Z, +11,39460,295625,14175,,,GAMECHANGER_gamechanger_combined,matomo,2021-07-23T01:36:08.000Z, +12,10866,304881,35069,DoDI 4515.13 CH 5.pdf,2021-07-28T17:13:12.000Z,GAMECHANGER_gamechanger_combined,TWCF,2021-07-28T17:10:30.000Z,gamechanger +13,28520,239494,4361,,,GAMECHANGER_gamechanger_combined,navy,2021-05-13T17:40:32.000Z, +14,44234,350570,38645,,,GAMECHANGER_gamechanger_combined,beef diet,2021-08-18T17:03:25.000Z, +15,12509,334963,37434,DoDI 1322.29 CH 1.pdf,2021-08-11T13:45:54.000Z,GAMECHANGER_gamechanger_combined,DoDI 1322.29,2021-08-11T13:45:46.000Z,gamechanger +16,20084,448895,47872,CFR-2021-title6-vol1.pdf,2021-11-16T13:21:50.000Z,GAMECHANGER_gamechanger_combined,undersecretary of defense for intelligence and security,2021-11-16T13:21:42.000Z,gamechanger +17,4384,259015,27048,AFI 90-201.pdf,2021-06-08T13:55:23.000Z,GAMECHANGER_gamechanger,"workflow" "air force",2021-06-08T13:55:00.000Z, +18,27141,219787,21819,,,GAMECHANGER_gamechanger_combined,SORN,2021-04-30T16:41:46.000Z, +19,42173,326855,7568,,,GAMECHANGER_gamechanger_combined,nga,2021-08-06T22:21:21.000Z, +20,27727,231206,23094,,,GAMECHANGER_gamechanger_combined,DFAS Instruction 8000.01,2021-05-06T19:27:09.000Z, +21,57111,474752,51055,,,GAMECHANGER_gamechanger_combined,awarded badge,2021-12-08T16:27:41.000Z, +22,37880,286364,32684,,,GAMECHANGER_gamechanger_combined,haircut,2021-07-15T23:08:05.000Z, +23,3531,251238,25787,2019,2021-05-26T14:07:23.000Z,GAMECHANGER_gamechanger_combined,climate change design standards,2021-05-26T14:07:02.000Z,GAO +24,11730,320452,36313,QTP 24-3-C355.pdf,2021-08-04T19:28:07.000Z,GAMECHANGER_gamechanger_combined,36-1-191,2021-08-04T19:27:36.000Z,gamechanger +25,59404,487143,52904,,,GAMECHANGER_gamechanger_combined,defense intelligence enterprise manager,2021-12-28T17:34:58.000Z, +26,48086,390026,42738,,,GAMECHANGER_gamechanger_combined,government shutdown,2021-09-29T11:41:26.000Z, +27,40614,306326,35326,,,GAMECHANGER_gamechanger_combined,SPeD,2021-07-29T10:53:43.000Z, +28,18864,423045,46164,AFMAN 48-146.pdf,2021-10-28T17:58:15.000Z,GAMECHANGER_gamechanger_combined,medical Testing for DoD Civilian Workers,2021-10-28T17:57:37.000Z,gamechanger +29,41654,320323,36309,,,GAMECHANGER_gamechanger_combined,administrative grievance,2021-08-04T19:27:52.000Z, +30,54847,461166,49142,,,GAMECHANGER_gamechanger_combined,Combatant Command (COCOM),2021-11-23T18:29:37.000Z, +31,58118,478450,51795,,,GAMECHANGER_gamechanger_combined,"child care",2021-12-13T15:29:41.000Z, +32,17735,402210,44507,DoDD 5240.06 CH 3.pdf,2021-10-14T16:04:21.000Z,GAMECHANGER_gamechanger_combined,5240.06,2021-10-14T16:04:10.000Z,gamechanger +33,13121,344143,38043,DoDI 8510.01 CH 3.pdf,2021-08-16T13:29:24.000Z,GAMECHANGER_gamechanger_combined,DoDI 8510.01 Risk Management Framework (RMF) for DoD Information Technology (IT),2021-08-16T13:28:40.000Z,gamechanger +34,60444,494809,3440,,,GAMECHANGER_gamechanger_combined,telework,2022-01-11T15:24:34.000Z, +35,565,215541,21497,"FRM 10, 1 1 2020 OCR.pdf",2021-04-28T21:49:46.000Z,GAMECHANGER_gamechanger_combined,JTRU,2021-04-28T21:49:20.000Z, +36,4675,261268,27387,AR 135-175.pdf,2021-06-10T12:39:14.000Z,GAMECHANGER_gamechanger,SUSPENSION OF FAVORABLE PERSONNEL ACTIONS (FLAG) officer retirement,2021-06-10T12:37:14.000Z, +37,15975,384767,41918,DoDM 3305.09 CH 2.pdf,2021-09-21T19:38:58.000Z,GAMECHANGER_gamechanger_combined,"combatant command " and "intelligence",2021-09-21T19:37:23.000Z,gamechanger +38,44049,348440,31120,,,GAMECHANGER_gamechanger_combined,ZBR,2021-08-17T18:56:00.000Z, +39,59791,490253,49516,,,GAMECHANGER_gamechanger_combined,"Program of REcord",2022-01-05T13:15:24.000Z, +40,20285,452039,48126,"DEP SEC DEF Memo, Space Organization and Management Tasks, 9 10 2018 OCR.pdf",2021-11-17T16:10:49.000Z,GAMECHANGER_gamechanger_combined,"space governance committee",2021-11-17T16:10:23.000Z,gamechanger +41,51881,424019,46305,,,GAMECHANGER_gamechanger_combined,DECO,2021-10-29T11:33:02.000Z, +42,3681,252998,26007,CJCSM 3265.01A.pdf,2021-05-28T10:52:59.000Z,GAMECHANGER_gamechanger_combined,jcsfl mission partner,2021-05-28T10:52:52.000Z, +43,50399,409949,45060,,,GAMECHANGER_gamechanger_combined,EO 12196,2021-10-20T20:26:16.000Z, +44,49778,401613,44466,,,GAMECHANGER_gamechanger_combined,collection and "reporting crimes",2021-10-14T10:54:46.000Z, +45,14341,363308,39660,AFMAN 91-203.pdf,2021-08-26T20:51:45.000Z,GAMECHANGER_gamechanger_combined,fire extinguisher inspection,2021-08-26T20:48:56.000Z,gamechanger +46,38814,290291,33616,,,GAMECHANGER_gamechanger_combined,SNCOA,2021-07-20T20:43:57.000Z, +47,53201,444839,47638,,,GAMECHANGER_gamechanger_combined,Federal Acquisition Regulation 52.245-1,2021-11-10T22:23:57.000Z, +48,32741,264516,27981,,,GAMECHANGER_gamechanger,intelligent command,2021-06-14T20:26:09.000Z, +49,15152,375088,40854,H.R 21 IH 117th.pdf,2021-09-09T18:56:29.000Z,GAMECHANGER_gamechanger_combined,"CLOUD Security",2021-09-09T18:55:06.000Z,gamechanger +50,34979,273832,30080,,,GAMECHANGER_gamechanger_combined,Disaster response force,2021-06-28T15:43:18.000Z, +51,40524,305598,35257,,,GAMECHANGER_gamechanger_combined,Government Purchase Card (GPC) for Material,2021-07-28T20:23:53.000Z, +52,20470,455391,48441,DoDI 8320.04 CH 3.pdf,2021-11-18T20:21:19.000Z,GAMECHANGER_gamechanger_combined,operational contractor support,2021-11-18T20:20:58.000Z,gamechanger +53,16544,389766,42689,Defense Acquisition Services.pdf,2021-09-28T19:31:50.000Z,GAMECHANGER_gamechanger_combined,"S-CAT",2021-09-28T19:31:14.000Z,gamechanger +54,29303,247423,2554,,,GAMECHANGER_gamechanger_combined,awards,2021-05-21T00:08:53.000Z, +55,22810,475517,51130,AFI 36-815.pdf,2021-12-08T22:40:02.000Z,GAMECHANGER_gamechanger_combined,rest and recuperation,2021-12-08T22:39:38.000Z,gamechanger +56,48363,391546,42980,,,GAMECHANGER_gamechanger_combined,"intelligence mission",2021-09-30T16:09:32.000Z, +57,53413,448709,47865,,,GAMECHANGER_gamechanger_combined,"authority to operate" +spectrum,2021-11-16T12:36:51.000Z, +58,6640,272439,29771,DoD Support for the National Security Commission on Artificial Intelligence.pdf,2021-06-24T19:08:35.000Z,GAMECHANGER_gamechanger_combined,'artificial intelligence',2021-06-24T19:07:47.000Z, +59,46272,375746,25296,,,GAMECHANGER_gamechanger_combined,helicopter,2021-09-10T15:10:02.000Z, +60,52128,426795,46533,,,GAMECHANGER_gamechanger_combined,dodd 5230.25 withholding of unclassified technical data from public disclosure,2021-11-01T16:33:12.000Z, +61,11767,321104,16860,AFMAN 11-2MC-130HV3CL-5.pdf,2021-08-05T01:42:28.000Z,GAMECHANGER_gamechanger_combined,farp,2021-08-05T01:40:11.000Z,gamechanger +62,41189,313980,35893,,,GAMECHANGER_gamechanger,Reliability Centered Maintenance (RCM),2021-08-02T19:30:58.000Z, +63,24811,492811,10427,SECNAVINST 5300.28F.pdf,2022-01-07T16:29:02.000Z,GAMECHANGER_gamechanger_combined,ncis,2022-01-07T16:25:24.000Z,gamechanger +64,1488,225852,22782,Framework for Risk Categorization for Use During Independent Technical Risk Assessments.pdf,2021-05-04T22:24:40.000Z,GAMECHANGER_gamechanger_combined,risk matrix,2021-05-04T22:24:29.000Z, +65,52629,434726,47035,,,GAMECHANGER_gamechanger_combined,Expeditionary Combat Readiness Center Missions,2021-11-04T15:53:40.000Z, +66,17952,404934,39222,DAFMAN 10-703.pdf,2021-10-18T16:53:57.000Z,GAMECHANGER_gamechanger_combined,DoDI 5530.03 International Agreements,2021-10-18T16:53:42.000Z,gamechanger +67,9970,292745,33904,AFI 36-2903.pdf,2021-07-22T00:56:25.000Z,GAMECHANGER_gamechanger_combined,"majcom patch, ocp",2021-07-22T00:55:47.000Z,gamechanger +68,33351,267728,26120,,,GAMECHANGER_gamechanger,ethical behaviour,2021-06-17T19:22:56.000Z, +69,17948,404858,44739,DoDFMR V2BCH13.pdf,2021-10-18T16:32:48.000Z,GAMECHANGER_gamechanger_combined,restoration design phases,2021-10-18T16:32:34.000Z,gamechanger +70,18894,423320,46224,AFI 31-118.pdf,2021-10-28T19:02:17.000Z,GAMECHANGER_gamechanger_combined,"active shooter",2021-10-28T18:59:41.000Z,gamechanger +71,57118,474761,51056,,,GAMECHANGER_gamechanger_combined,11-402,2021-12-08T16:27:47.000Z, +72,57020,474387,50975,,,GAMECHANGER_gamechanger_combined,piercing,2021-12-08T14:03:21.000Z, +73,60408,494648,41548,,,GAMECHANGER_gamechanger_combined,hearing aids,2022-01-11T14:18:36.000Z, +74,51600,422104,1315,,,GAMECHANGER_gamechanger_combined,COVID,2021-10-28T13:30:57.000Z, +75,43415,340192,37862,,,GAMECHANGER_gamechanger_combined,Work Place Hostility,2021-08-12T22:22:49.000Z, +76,61106,500777,54648,,,GAMECHANGER_gamechanger_combined,overlapping period of performances,2022-01-19T01:43:49.000Z, +77,43471,340569,37895,,,GAMECHANGER_gamechanger_combined,tenant security plan,2021-08-13T06:28:57.000Z, +78,18691,420380,45769,Title 43,2021-10-27T17:37:29.000Z,GAMECHANGER_gamechanger_combined,title 10 section 2222,2021-10-27T17:33:02.000Z,Public Lands.pdf +79,60351,494264,19934,,,GAMECHANGER_gamechanger_combined,mil-std,2022-01-10T20:59:20.000Z, +80,15943,384242,41873,AFI 10-403.pdf,2021-09-21T14:43:02.000Z,GAMECHANGER_gamechanger_combined,deployed manpower agency,2021-09-21T14:38:59.000Z,gamechanger +81,6886,274147,30155,H.R 2003 IH 117th.pdf,2021-06-28T19:49:49.000Z,GAMECHANGER_gamechanger_combined,The Mexico City Policy,2021-06-28T19:47:11.000Z, +82,21460,465513,49653,"DEP SEC DEF Memo, Public-Private Talent Exchange, 7 19 2018 OCR.pdf",2021-11-30T14:50:43.000Z,GAMECHANGER_gamechanger_combined,open source software memo,2021-11-30T14:42:38.000Z,gamechanger +83,7913,279574,31193,AR 381-10.pdf,2021-07-07T20:45:29.000Z,GAMECHANGER_gamechanger_combined,"DoDD 5200.27",2021-07-07T20:09:26.000Z, +84,19434,435878,46601,AFMAN 32-1007.pdf,2021-11-04T23:17:18.000Z,GAMECHANGER_gamechanger_combined,bioenvironmental,2021-11-04T23:11:33.000Z,gamechanger +85,5579,266352,28385,"DEP SEC DEF Memo, (UFOUO) Facilitating Unity of Effort to Advance Countering Weapons of Mass Destruction Objectives, 4 9 2018 OCR.pdfnull",2021-06-16T15:46:57.000Z,GAMECHANGER_gamechanger,CWMD unity of effort,2021-06-16T15:45:35.000Z, +86,29100,246409,3151,,,GAMECHANGER_gamechanger_combined,GPC,2021-05-20T13:35:29.000Z, +87,22987,476463,39913,H.R. 5412 RH 117th.pdf,2021-12-09T18:01:58.000Z,GAMECHANGER_gamechanger_combined,"intelligence and security",2021-12-09T18:01:34.000Z,gamechanger +88,4487,259519,15003,DoDD 5205.12 CH 2.pdf,2021-06-08T18:08:00.000Z,GAMECHANGER_gamechanger,mip,2021-06-08T18:07:54.000Z, +89,22998,476516,51468,AFH 1.pdf,2021-12-09T18:33:48.000Z,GAMECHANGER_gamechanger_combined,36-2623,2021-12-09T18:30:46.000Z,gamechanger +90,20850,459805,12605,DoDI 5000.74.pdf,2021-11-22T22:20:21.000Z,GAMECHANGER_gamechanger_combined,cloud computing,2021-11-22T22:19:58.000Z,gamechanger +91,54539,459568,3047,,,GAMECHANGER_gamechanger_combined,advana,2021-11-22T20:14:45.000Z, +92,11728,320405,36307,AR 600-8-22.pdf,2021-08-04T19:24:44.000Z,GAMECHANGER_gamechanger_combined,downgrade award appeal,2021-08-04T19:24:19.000Z,gamechanger +93,13461,349742,38586,CFETP 3DXXX.pdf,2021-08-18T13:09:38.000Z,GAMECHANGER_gamechanger_combined,time in training 3DXXX,2021-08-18T13:09:22.000Z,gamechanger +94,38519,288571,11127,,,GAMECHANGER_gamechanger_combined,physical fitness,2021-07-19T19:06:46.000Z, +95,28479,239408,24245,,,GAMECHANGER_gamechanger_combined,checks and balances,2021-05-13T17:11:11.000Z, +96,8116,280857,31593,CFETP 3E5X1.pdf,2021-07-09T12:46:15.000Z,GAMECHANGER_gamechanger_combined,CFETP 3e5x1,2021-07-09T12:46:03.000Z, +97,41515,318713,36191,,,GAMECHANGER_gamechanger_combined,plain language IC,2021-08-04T13:19:11.000Z, +98,45364,365974,39984,,,GAMECHANGER_gamechanger_combined,span of control,2021-08-30T18:23:10.000Z, +99,39640,298459,34419,,,GAMECHANGER_gamechanger_combined,VA san antonoio,2021-07-26T13:06:37.000Z, +100,18641,420270,13918,EO 13962.pdf,2021-10-27T17:24:38.000Z,GAMECHANGER_gamechanger_combined,covid 19,2021-10-27T17:24:14.000Z,gamechanger +101,18642,420270,13918,EO 13962.pdf,2021-10-27T17:24:37.000Z,GAMECHANGER_gamechanger_combined,covid 19,2021-10-27T17:24:14.000Z,gamechanger +102,22265,472452,3440,AFI 31-118.pdf,2021-12-06T21:55:47.000Z,GAMECHANGER_gamechanger_combined,telework,2021-12-06T21:54:07.000Z,gamechanger +103,22266,472452,3440,AFI 31-118.pdf,2021-12-06T21:55:47.000Z,GAMECHANGER_gamechanger_combined,telework,2021-12-06T21:54:07.000Z,gamechanger +104,22622,474563,3440,AFI 31-118.pdf,2021-12-08T15:16:38.000Z,GAMECHANGER_gamechanger_combined,telework,2021-12-08T15:09:13.000Z,gamechanger +105,22623,474563,3440,AFI 31-118.pdf,2021-12-08T15:16:38.000Z,GAMECHANGER_gamechanger_combined,telework,2021-12-08T15:09:13.000Z,gamechanger diff --git a/gamechangerml/data/test_data/test_validation/any/intelligent_search_data.json b/gamechangerml/data/test_data/test_validation/any/intelligent_search_data.json new file mode 100644 index 00000000..30f24e7a --- /dev/null +++ b/gamechangerml/data/test_data/test_validation/any/intelligent_search_data.json @@ -0,0 +1 @@ +"{\"queries\": {\"S0000000\": \"restoration design phases\", \"S0000001\": \"5240.06\", \"S0000002\": \"synchronizer and intelligence\", \"S0000003\": \"national guard and ngb\", \"S0000004\": \"jtru\", \"S0000005\": \"navy\", \"S0000006\": \"bioenvironmental\", \"S0000007\": \"36-2623\", \"S0000008\": \"geospatial\", \"S0000009\": \"majcom patch, ocp\", \"S0000010\": \"mip\", \"S0000011\": \"telework\", \"S0000012\": \"undersecretary of defense for intelligence and security\", \"S0000013\": \"cyber range\", \"S0000014\": \"ncis\", \"S0000015\": \"downgrade award appeal\", \"S0000016\": \"time in training 3dxxx\", \"S0000017\": \"bcac approval\", \"S0000018\": \"covid 19\", \"S0000019\": \"aeronautical navigation aafif\", \"S0000020\": \"safety and occupational health\", \"S0000021\": \"medical testing for dod civilian workers\", \"S0000022\": \"jcsfl mission partner\", \"S0000023\": \"dodi 1322.29\", \"S0000024\": \"employed in place\", \"S0000025\": \"manpower\", \"S0000026\": \"sexual assault prevention\", \"S0000027\": \"cwmd unity of effort\", \"S0000028\": \"fire extinguisher inspection\", \"S0000029\": \"deployed manpower agency\", \"S0000030\": \"international cooperative administrative support services (icass)\", \"S0000031\": \"suspension of favorable personnel actions (flag) officer retirement\", \"S0000032\": \"artificial intelligence\", \"S0000033\": \"cloud computing\", \"S0000034\": \"wet wing\", \"S0000035\": \"space governance committee\", \"S0000036\": \"s-cat\", \"S0000037\": \"cfetp 3e5x1\", \"S0000038\": \"risk matrix\", \"S0000039\": \"csa\", \"S0000040\": \"36-1-191\", \"S0000041\": \"who is sergeant major of the army\", \"S0000042\": \"use of alcohol and events\", \"S0000043\": \"interoperability\", \"S0000044\": \"twcf\", \"S0000045\": \"control system automation\", \"S0000046\": \"farp\", \"S0000047\": \"dcma\", \"S0000048\": \"military\", \"S0000049\": \"the mexico city policy\", \"S0000050\": \"title 10 section 2222\", \"S0000051\": \"dodd 5200.27\", \"S0000052\": \"dod 4140.26\", \"S0000053\": \"dodi 8510.01 risk management framework (rmf) for dod information technology (it)\", \"S0000054\": \"combatant command and intelligence\", \"S0000055\": \"open source software memo\", \"S0000056\": \"rest and recuperation\", \"S0000057\": \"physical fitness\", \"S0000058\": \"workflow air force\", \"S0000059\": \"climate change design standards\", \"S0000060\": \"active shooter\", \"S0000061\": \"intelligence and security\", \"S0000062\": \"cloud security\", \"S0000063\": \"dodi 5530.03 international agreements\", \"S0000064\": \"senior accountable official\", \"S0000065\": \"usar\", \"S0000066\": \"eo 13603\", \"S0000067\": \"plain language policy\", \"S0000068\": \"security cooperation\", \"S0000069\": \"operational contractor support\"}, \"collection\": {\"R0000000\": \" CJCSM 3500.03E Joint Training Manual for the Armed Forces of the United States\", \"R0000001\": \"EO 13962\", \"R0000002\": \"CFR-2021-title6-vol1\", \"R0000003\": \"TC 3-04.15\", \"R0000004\": \"CJCSM 3265.01A\", \"R0000005\": \"DEP SEC DEF Memo, Space Organization and Management Tasks, 9 10 2018 OCR\", \"R0000006\": \"Framework for Risk Categorization for Use During Independent Technical Risk Assessments\", \"R0000007\": \"AFI 36-2903\", \"R0000008\": \"H.R 21 IH 117th\", \"R0000009\": \"H.R. 5412 RH 117th\", \"R0000010\": \"DoDD 5205.12 CH 2\", \"R0000011\": \"AFI 36-815\", \"R0000012\": \"AFI 10-201\", \"R0000013\": \" DoDD 3000.06 Combat Support Agencies (CSAs)\", \"R0000014\": \"2019\", \"R0000015\": \" SECNAVINST 5100.10K DEPARTMENT OF THE NAVY SAFETY PROGRAM\", \"R0000016\": \"DAFMAN 10-703\", \"R0000017\": \"AR 135-175\", \"R0000018\": \"Defense Acquisition Services\", \"R0000019\": \"DoDI 8510.01 CH 3\", \"R0000020\": \"QTP 24-3-C355\", \"R0000021\": \"DoDI 5000.74\", \"R0000022\": \"AFI 90-201\", \"R0000023\": \" CJCSI 5123.01H Charter of the Joint Requirements Oversight Council (JROC) and the Implementation of the Joint Capabilities Integration and Development System\", \"R0000024\": \"AR 381-10\", \"R0000025\": \"FRM 10, 1 1 2020 OCR\", \"R0000026\": \" Memo 2015 - CNAS - Security Cooperation and Assistance\", \"R0000027\": \" DoDD 5105.77 National Guard Bureau (NGB)\", \"R0000028\": \" AFI 34-219 ALCOHOLIC BEVERAGE PROGRAM\", \"R0000029\": \"DoDM 3305.09 CH 2\", \"R0000030\": \"SECNAVINST 5300.28F\", \"R0000031\": \" DoDD 5143.01 Under Secretary of Defense for Intelligence and Security (USD(I&S))\", \"R0000032\": \"EO 13603\", \"R0000033\": \"AFMAN 48-146\", \"R0000034\": \"DoD Support for the National Security Commission on Artificial Intelligence\", \"R0000035\": \" OPNAVINST 4650.17 UNUSUALLY ARDUOUS SEA DUTY FOR TRAVEL AND TRANSPORTATION ENTITLEMENTS\", \"R0000036\": \" DoDI 1000.01 Identification (ID) Cards Required by the Geneva Conventions\", \"R0000037\": \"DoDI 4515.13 CH 5\", \"R0000038\": \"DoDI 8320.04 CH 3\", \"R0000039\": \" MCO 1700.39 MARINE CORPS RECREATION PROGRAMS\", \"R0000040\": \"AFI 10-403\", \"R0000041\": \" DoDI 5010.40 Managers' Internal Control Program Procedures\", \"R0000042\": \" DoDD 5105.60 National Geospatial-Intelligence Agency (NGA)\", \"R0000043\": \"DoDD 5240.06 CH 3\", \"R0000044\": \"Title 43\", \"R0000045\": \" DoDI 7060.06 International Cooperative Administrative Support Services (ICASS)\", \"R0000046\": \"DoDI 5000.75 CH 2\", \"R0000047\": \" DoDD 5110.04 Washington Headquarters Services (WHS)\", \"R0000048\": \"DoDI 1322.29 CH 1\", \"R0000049\": \"AFI 31-118\", \"R0000050\": \" Memo Joint Officer Handbook - Staffing and Action Guide (2011)\", \"R0000051\": \"AFH 1\", \"R0000052\": \"CFETP 3DXXX\", \"R0000053\": \"CFETP 3E5X1\", \"R0000054\": \"H.R 2003 IH 117th\", \"R0000055\": \" DoDI 1332.45 Retention Determinations for Non-Deployable Service Members\", \"R0000056\": \" MISC PUBS GREECE\", \"R0000057\": \"DEP SEC DEF Memo, (UFOUO) Facilitating Unity of Effort to Advance Countering Weapons of Mass Destruction Objectives, 4 9 2018 OCR\", \"R0000058\": \"AFMAN 11-2MC-130HV3CL-5\", \"R0000059\": \"AR 600-8-22\", \"R0000060\": \"AFMAN 91-203\", \"R0000061\": \"AFMAN 32-1007\", \"R0000062\": \" AFI 11-235 SPECIALIZED REFUELING OPERATIONS\", \"R0000063\": \"DoDM 4140.26 Volume 3\", \"R0000064\": \"DEP SEC DEF Memo, Public-Private Talent Exchange, 7 19 2018 OCR\", \"R0000065\": \"DEP SEC DEF Memo, Total Force Manpower Governance for OSD, Defense Agencies, and DoD Field Activities, 6 6 2017 OCR\", \"R0000066\": \" CIM 11000.7 FACILITIES ENERGY MANUAL\", \"R0000067\": \" SECNAVINST 1752.4C SEXUAL ASSAULT PREVENTION AND RESPONSE PROGRAM PROCEDURES\", \"R0000068\": \" AFI 36-816 Civilian Telework Program\", \"R0000069\": \" CJCSI 3207.01C Department of Defense Support to Humanitarian Mine Action\", \"R0000070\": \"DoDFMR V2BCH13\"}, \"meta_relations\": {\"S0000000\": {\"R0000070\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-10-18T16:32:48.000Z\", \"exact_matches\": [{\"exact_query\": \"restoration design phases\", \"exact_result\": \"DoDFMR V2BCH13\", \"source\": \"user_history\", \"date\": \"2021-10-18T16:32:48.000Z\"}], \"times_matched\": 1}}, \"S0000001\": {\"R0000043\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-10-14T16:04:21.000Z\", \"exact_matches\": [{\"exact_query\": \"5240.06\", \"exact_result\": \"DoDD 5240.06 CH 3\", \"source\": \"user_history\", \"date\": \"2021-10-14T16:04:21.000Z\"}], \"times_matched\": 1}}, \"S0000002\": {\"R0000031\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-08-09T14:02:18.534Z\", \"exact_matches\": [{\"exact_query\": \" \\\"synchronizer\\\" and \\\"intelligence\\\"\", \"exact_result\": \" DoDD 5143.01 Under Secretary of Defense for Intelligence and Security (USD(I&S))\", \"source\": \"matamo\", \"date\": \"2021-08-09T14:02:18.534Z\"}], \"times_matched\": 1}}, \"S0000003\": {\"R0000027\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-04-14T18:18:26.068Z\", \"exact_matches\": [{\"exact_query\": \" \\\"National Guard\\\" and NGB\", \"exact_result\": \" DoDD 5105.77 National Guard Bureau (NGB)\", \"source\": \"matamo\", \"date\": \"2021-04-14T18:18:26.068Z\"}], \"times_matched\": 1}}, \"S0000004\": {\"R0000025\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-04-28T21:49:46.000Z\", \"exact_matches\": [{\"exact_query\": \"jtru\", \"exact_result\": \"FRM 10, 1 1 2020 OCR\", \"source\": \"user_history\", \"date\": \"2021-04-28T21:49:46.000Z\"}], \"times_matched\": 1}}, \"S0000005\": {\"R0000035\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-08-03T19:02:06.140Z\", \"exact_matches\": [{\"exact_query\": \" navy\", \"exact_result\": \" OPNAVINST 4650.17 UNUSUALLY ARDUOUS SEA DUTY FOR TRAVEL AND TRANSPORTATION ENTITLEMENTS\", \"source\": \"matamo\", \"date\": \"2021-08-03T19:02:06.140Z\"}], \"times_matched\": 1}}, \"S0000006\": {\"R0000061\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-11-04T23:17:18.000Z\", \"exact_matches\": [{\"exact_query\": \"bioenvironmental\", \"exact_result\": \"AFMAN 32-1007\", \"source\": \"user_history\", \"date\": \"2021-11-04T23:17:18.000Z\"}], \"times_matched\": 1}}, \"S0000007\": {\"R0000051\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-12-09T18:33:48.000Z\", \"exact_matches\": [{\"exact_query\": \"36-2623\", \"exact_result\": \"AFH 1\", \"source\": \"user_history\", \"date\": \"2021-12-09T18:33:48.000Z\"}], \"times_matched\": 1}}, \"S0000008\": {\"R0000042\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-05-21T16:25:13.292Z\", \"exact_matches\": [{\"exact_query\": \" geospatial\", \"exact_result\": \" DoDD 5105.60 National Geospatial-Intelligence Agency (NGA)\", \"source\": \"matamo\", \"date\": \"2021-05-21T16:25:13.292Z\"}], \"times_matched\": 1}}, \"S0000009\": {\"R0000007\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-07-22T00:56:25.000Z\", \"exact_matches\": [{\"exact_query\": \"majcom patch, ocp\", \"exact_result\": \"AFI 36-2903\", \"source\": \"user_history\", \"date\": \"2021-07-22T00:56:25.000Z\"}], \"times_matched\": 1}}, \"S0000010\": {\"R0000010\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-06-08T18:08:00.000Z\", \"exact_matches\": [{\"exact_query\": \"mip\", \"exact_result\": \"DoDD 5205.12 CH 2\", \"source\": \"user_history\", \"date\": \"2021-06-08T18:08:00.000Z\"}], \"times_matched\": 1}}, \"S0000011\": {\"R0000068\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-08-03T19:00:42.759Z\", \"exact_matches\": [{\"exact_query\": \" telework\", \"exact_result\": \" AFI 36-816 Civilian Telework Program\", \"source\": \"matamo\", \"date\": \"2021-08-03T19:00:42.759Z\"}, {\"exact_query\": \" telework\", \"exact_result\": \" AFI 36-816 Civilian Telework Program\", \"source\": \"matamo\", \"date\": \"2021-09-23T14:41:50.557Z\"}], \"times_matched\": 2}, \"R0000049\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-08-03T19:00:42.759Z\", \"exact_matches\": [{\"exact_query\": \"telework\", \"exact_result\": \"AFI 31-118\", \"source\": \"user_history\", \"date\": \"2021-12-06T21:55:47.000Z\"}, {\"exact_query\": \"telework\", \"exact_result\": \"AFI 31-118\", \"source\": \"user_history\", \"date\": \"2021-12-08T15:16:38.000Z\"}], \"times_matched\": 2}}, \"S0000012\": {\"R0000002\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-11-16T13:21:50.000Z\", \"exact_matches\": [{\"exact_query\": \"undersecretary of defense for intelligence and security\", \"exact_result\": \"CFR-2021-title6-vol1\", \"source\": \"user_history\", \"date\": \"2021-11-16T13:21:50.000Z\"}], \"times_matched\": 1}}, \"S0000013\": {\"R0000000\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-04-05T18:16:55.146Z\", \"exact_matches\": [{\"exact_query\": \" cyber range\", \"exact_result\": \" CJCSM 3500.03E Joint Training Manual for the Armed Forces of the United States\", \"source\": \"matamo\", \"date\": \"2021-04-05T18:16:55.146Z\"}], \"times_matched\": 1}}, \"S0000014\": {\"R0000030\": {\"correct_match\": \"true\", \"last_match_date\": \"2022-01-07T16:29:02.000Z\", \"exact_matches\": [{\"exact_query\": \"ncis\", \"exact_result\": \"SECNAVINST 5300.28F\", \"source\": \"user_history\", \"date\": \"2022-01-07T16:29:02.000Z\"}], \"times_matched\": 1}}, \"S0000015\": {\"R0000059\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-08-04T19:24:44.000Z\", \"exact_matches\": [{\"exact_query\": \"downgrade award appeal\", \"exact_result\": \"AR 600-8-22\", \"source\": \"user_history\", \"date\": \"2021-08-04T19:24:44.000Z\"}], \"times_matched\": 1}}, \"S0000016\": {\"R0000052\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-08-18T13:09:38.000Z\", \"exact_matches\": [{\"exact_query\": \"time in training 3dxxx\", \"exact_result\": \"CFETP 3DXXX\", \"source\": \"user_history\", \"date\": \"2021-08-18T13:09:38.000Z\"}], \"times_matched\": 1}}, \"S0000017\": {\"R0000046\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-07-14T17:56:46.000Z\", \"exact_matches\": [{\"exact_query\": \"bcac approval\", \"exact_result\": \"DoDI 5000.75 CH 2\", \"source\": \"user_history\", \"date\": \"2021-07-14T17:56:46.000Z\"}], \"times_matched\": 1}}, \"S0000018\": {\"R0000001\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-10-27T17:24:38.000Z\", \"exact_matches\": [{\"exact_query\": \"covid 19\", \"exact_result\": \"EO 13962\", \"source\": \"user_history\", \"date\": \"2021-10-27T17:24:38.000Z\"}], \"times_matched\": 1}}, \"S0000019\": {\"R0000003\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-08-09T16:43:02.000Z\", \"exact_matches\": [{\"exact_query\": \"aeronautical navigation aafif\", \"exact_result\": \"TC 3-04.15\", \"source\": \"user_history\", \"date\": \"2021-08-09T16:43:02.000Z\"}], \"times_matched\": 1}}, \"S0000020\": {\"R0000015\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-05-11T19:38:12.825Z\", \"exact_matches\": [{\"exact_query\": \" safety and occupational health\", \"exact_result\": \" SECNAVINST 5100.10K DEPARTMENT OF THE NAVY SAFETY PROGRAM\", \"source\": \"matamo\", \"date\": \"2021-05-11T19:38:12.825Z\"}], \"times_matched\": 1}}, \"S0000021\": {\"R0000033\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-10-28T17:58:15.000Z\", \"exact_matches\": [{\"exact_query\": \"medical testing for dod civilian workers\", \"exact_result\": \"AFMAN 48-146\", \"source\": \"user_history\", \"date\": \"2021-10-28T17:58:15.000Z\"}], \"times_matched\": 1}}, \"S0000022\": {\"R0000004\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-05-28T10:52:59.000Z\", \"exact_matches\": [{\"exact_query\": \"jcsfl mission partner\", \"exact_result\": \"CJCSM 3265.01A\", \"source\": \"user_history\", \"date\": \"2021-05-28T10:52:59.000Z\"}], \"times_matched\": 1}}, \"S0000023\": {\"R0000048\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-08-11T13:45:54.000Z\", \"exact_matches\": [{\"exact_query\": \"dodi 1322.29\", \"exact_result\": \"DoDI 1322.29 CH 1\", \"source\": \"user_history\", \"date\": \"2021-08-11T13:45:54.000Z\"}], \"times_matched\": 1}}, \"S0000024\": {\"R0000012\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-12-15T15:35:45.000Z\", \"exact_matches\": [{\"exact_query\": \"'employed in place'\", \"exact_result\": \"AFI 10-201\", \"source\": \"user_history\", \"date\": \"2021-12-15T15:35:45.000Z\"}], \"times_matched\": 1}}, \"S0000025\": {\"R0000065\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-05-12T16:33:03.000Z\", \"exact_matches\": [{\"exact_query\": \"manpower\", \"exact_result\": \"DEP SEC DEF Memo, Total Force Manpower Governance for OSD, Defense Agencies, and DoD Field Activities, 6 6 2017 OCR\", \"source\": \"user_history\", \"date\": \"2021-05-12T16:33:03.000Z\"}], \"times_matched\": 1}}, \"S0000026\": {\"R0000067\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-04-21T13:13:18.303Z\", \"exact_matches\": [{\"exact_query\": \" sexual assault prevention\", \"exact_result\": \" SECNAVINST 1752.4C SEXUAL ASSAULT PREVENTION AND RESPONSE PROGRAM PROCEDURES\", \"source\": \"matamo\", \"date\": \"2021-04-21T13:13:18.303Z\"}], \"times_matched\": 1}}, \"S0000027\": {\"R0000057\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-06-16T15:46:57.000Z\", \"exact_matches\": [{\"exact_query\": \"cwmd unity of effort\", \"exact_result\": \"DEP SEC DEF Memo, (UFOUO) Facilitating Unity of Effort to Advance Countering Weapons of Mass Destruction Objectives, 4 9 2018 OCR\", \"source\": \"user_history\", \"date\": \"2021-06-16T15:46:57.000Z\"}], \"times_matched\": 1}}, \"S0000028\": {\"R0000060\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-08-26T20:51:45.000Z\", \"exact_matches\": [{\"exact_query\": \"fire extinguisher inspection\", \"exact_result\": \"AFMAN 91-203\", \"source\": \"user_history\", \"date\": \"2021-08-26T20:51:45.000Z\"}], \"times_matched\": 1}}, \"S0000029\": {\"R0000040\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-09-21T14:43:02.000Z\", \"exact_matches\": [{\"exact_query\": \"deployed manpower agency\", \"exact_result\": \"AFI 10-403\", \"source\": \"user_history\", \"date\": \"2021-09-21T14:43:02.000Z\"}], \"times_matched\": 1}}, \"S0000030\": {\"R0000045\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-08-03T12:50:35.464Z\", \"exact_matches\": [{\"exact_query\": \" International Cooperative Administrative Support Services (ICASS)\", \"exact_result\": \" DoDI 7060.06 International Cooperative Administrative Support Services (ICASS)\", \"source\": \"matamo\", \"date\": \"2021-08-03T12:50:35.464Z\"}], \"times_matched\": 1}}, \"S0000031\": {\"R0000017\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-06-10T12:39:14.000Z\", \"exact_matches\": [{\"exact_query\": \"suspension of favorable personnel actions (flag) officer retirement\", \"exact_result\": \"AR 135-175\", \"source\": \"user_history\", \"date\": \"2021-06-10T12:39:14.000Z\"}], \"times_matched\": 1}}, \"S0000032\": {\"R0000034\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-06-24T19:08:35.000Z\", \"exact_matches\": [{\"exact_query\": \"'artificial intelligence'\", \"exact_result\": \"DoD Support for the National Security Commission on Artificial Intelligence\", \"source\": \"user_history\", \"date\": \"2021-06-24T19:08:35.000Z\"}], \"times_matched\": 1}}, \"S0000033\": {\"R0000021\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-11-22T22:20:21.000Z\", \"exact_matches\": [{\"exact_query\": \"cloud computing\", \"exact_result\": \"DoDI 5000.74\", \"source\": \"user_history\", \"date\": \"2021-11-22T22:20:21.000Z\"}], \"times_matched\": 1}}, \"S0000034\": {\"R0000062\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-04-22T12:30:23.340Z\", \"exact_matches\": [{\"exact_query\": \" wet wing\", \"exact_result\": \" AFI 11-235 SPECIALIZED REFUELING OPERATIONS\", \"source\": \"matamo\", \"date\": \"2021-04-22T12:30:23.340Z\"}], \"times_matched\": 1}}, \"S0000035\": {\"R0000005\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-11-17T16:10:49.000Z\", \"exact_matches\": [{\"exact_query\": \"'space governance committee'\", \"exact_result\": \"DEP SEC DEF Memo, Space Organization and Management Tasks, 9 10 2018 OCR\", \"source\": \"user_history\", \"date\": \"2021-11-17T16:10:49.000Z\"}], \"times_matched\": 1}}, \"S0000036\": {\"R0000018\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-09-28T19:31:50.000Z\", \"exact_matches\": [{\"exact_query\": \"'s-cat'\", \"exact_result\": \"Defense Acquisition Services\", \"source\": \"user_history\", \"date\": \"2021-09-28T19:31:50.000Z\"}], \"times_matched\": 1}}, \"S0000037\": {\"R0000053\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-07-09T12:46:15.000Z\", \"exact_matches\": [{\"exact_query\": \"cfetp 3e5x1\", \"exact_result\": \"CFETP 3E5X1\", \"source\": \"user_history\", \"date\": \"2021-07-09T12:46:15.000Z\"}], \"times_matched\": 1}}, \"S0000038\": {\"R0000006\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-05-04T22:24:40.000Z\", \"exact_matches\": [{\"exact_query\": \"risk matrix\", \"exact_result\": \"Framework for Risk Categorization for Use During Independent Technical Risk Assessments\", \"source\": \"user_history\", \"date\": \"2021-05-04T22:24:40.000Z\"}], \"times_matched\": 1}}, \"S0000039\": {\"R0000013\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-04-15T11:42:48.684Z\", \"exact_matches\": [{\"exact_query\": \" CSA\", \"exact_result\": \" DoDD 3000.06 Combat Support Agencies (CSAs)\", \"source\": \"matamo\", \"date\": \"2021-04-15T11:42:48.684Z\"}], \"times_matched\": 1}}, \"S0000040\": {\"R0000020\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-08-04T19:28:07.000Z\", \"exact_matches\": [{\"exact_query\": \"36-1-191\", \"exact_result\": \"QTP 24-3-C355\", \"source\": \"user_history\", \"date\": \"2021-08-04T19:28:07.000Z\"}], \"times_matched\": 1}}, \"S0000041\": {\"R0000036\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-06-24T19:45:57.054Z\", \"exact_matches\": [{\"exact_query\": \" who is sergeant major of the army\", \"exact_result\": \" DoDI 1000.01 Identification (ID) Cards Required by the Geneva Conventions\", \"source\": \"matamo\", \"date\": \"2021-06-24T19:45:57.054Z\"}], \"times_matched\": 1}}, \"S0000042\": {\"R0000028\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-07-21T13:48:49.176Z\", \"exact_matches\": [{\"exact_query\": \" \\\"use of alcohol\\\" and events\", \"exact_result\": \" AFI 34-219 ALCOHOLIC BEVERAGE PROGRAM\", \"source\": \"matamo\", \"date\": \"2021-07-21T13:48:49.176Z\"}], \"times_matched\": 1}}, \"S0000043\": {\"R0000023\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-05-18T15:35:06.388Z\", \"exact_matches\": [{\"exact_query\": \" interoperability\", \"exact_result\": \" CJCSI 5123.01H Charter of the Joint Requirements Oversight Council (JROC) and the Implementation of the Joint Capabilities Integration and Development System\", \"source\": \"matamo\", \"date\": \"2021-05-18T15:35:06.388Z\"}], \"times_matched\": 1}}, \"S0000044\": {\"R0000037\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-07-28T17:13:12.000Z\", \"exact_matches\": [{\"exact_query\": \"twcf\", \"exact_result\": \"DoDI 4515.13 CH 5\", \"source\": \"user_history\", \"date\": \"2021-07-28T17:13:12.000Z\"}], \"times_matched\": 1}}, \"S0000045\": {\"R0000066\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-08-25T18:08:27.541Z\", \"exact_matches\": [{\"exact_query\": \" control system automation\", \"exact_result\": \" CIM 11000.7 FACILITIES ENERGY MANUAL\", \"source\": \"matamo\", \"date\": \"2021-08-25T18:08:27.541Z\"}], \"times_matched\": 1}}, \"S0000046\": {\"R0000058\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-08-05T01:42:28.000Z\", \"exact_matches\": [{\"exact_query\": \"farp\", \"exact_result\": \"AFMAN 11-2MC-130HV3CL-5\", \"source\": \"user_history\", \"date\": \"2021-08-05T01:42:28.000Z\"}], \"times_matched\": 1}}, \"S0000047\": {\"R0000050\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-08-31T17:14:00.347Z\", \"exact_matches\": [{\"exact_query\": \" dcma\", \"exact_result\": \" Memo Joint Officer Handbook - Staffing and Action Guide (2011)\", \"source\": \"matamo\", \"date\": \"2021-08-31T17:14:00.347Z\"}], \"times_matched\": 1}}, \"S0000048\": {\"R0000055\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-09-08T23:54:40.935Z\", \"exact_matches\": [{\"exact_query\": \" military\", \"exact_result\": \" DoDI 1332.45 Retention Determinations for Non-Deployable Service Members\", \"source\": \"matamo\", \"date\": \"2021-09-08T23:54:40.935Z\"}], \"times_matched\": 1}}, \"S0000049\": {\"R0000054\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-06-28T19:49:49.000Z\", \"exact_matches\": [{\"exact_query\": \"the mexico city policy\", \"exact_result\": \"H.R 2003 IH 117th\", \"source\": \"user_history\", \"date\": \"2021-06-28T19:49:49.000Z\"}], \"times_matched\": 1}}, \"S0000050\": {\"R0000044\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-10-27T17:37:29.000Z\", \"exact_matches\": [{\"exact_query\": \"title 10 section 2222\", \"exact_result\": \"Title 43\", \"source\": \"user_history\", \"date\": \"2021-10-27T17:37:29.000Z\"}], \"times_matched\": 1}}, \"S0000051\": {\"R0000024\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-07-07T20:45:29.000Z\", \"exact_matches\": [{\"exact_query\": \"'dodd 5200.27'\", \"exact_result\": \"AR 381-10\", \"source\": \"user_history\", \"date\": \"2021-07-07T20:45:29.000Z\"}], \"times_matched\": 1}}, \"S0000052\": {\"R0000063\": {\"correct_match\": \"true\", \"last_match_date\": \"2022-01-25T12:48:28.000Z\", \"exact_matches\": [{\"exact_query\": \"dod 4140.26\", \"exact_result\": \"DoDM 4140.26 Volume 3\", \"source\": \"user_history\", \"date\": \"2022-01-25T12:48:28.000Z\"}], \"times_matched\": 1}}, \"S0000053\": {\"R0000019\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-08-16T13:29:24.000Z\", \"exact_matches\": [{\"exact_query\": \"dodi 8510.01 risk management framework (rmf) for dod information technology (it)\", \"exact_result\": \"DoDI 8510.01 CH 3\", \"source\": \"user_history\", \"date\": \"2021-08-16T13:29:24.000Z\"}], \"times_matched\": 1}}, \"S0000054\": {\"R0000029\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-09-21T19:38:58.000Z\", \"exact_matches\": [{\"exact_query\": \"'combatant command ' and 'intelligence'\", \"exact_result\": \"DoDM 3305.09 CH 2\", \"source\": \"user_history\", \"date\": \"2021-09-21T19:38:58.000Z\"}], \"times_matched\": 1}}, \"S0000055\": {\"R0000064\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-11-30T14:50:43.000Z\", \"exact_matches\": [{\"exact_query\": \"open source software memo\", \"exact_result\": \"DEP SEC DEF Memo, Public-Private Talent Exchange, 7 19 2018 OCR\", \"source\": \"user_history\", \"date\": \"2021-11-30T14:50:43.000Z\"}], \"times_matched\": 1}}, \"S0000056\": {\"R0000011\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-12-08T22:40:02.000Z\", \"exact_matches\": [{\"exact_query\": \"rest and recuperation\", \"exact_result\": \"AFI 36-815\", \"source\": \"user_history\", \"date\": \"2021-12-08T22:40:02.000Z\"}], \"times_matched\": 1}}, \"S0000057\": {\"R0000039\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-09-23T16:11:31.850Z\", \"exact_matches\": [{\"exact_query\": \" physical fitness\", \"exact_result\": \" MCO 1700.39 MARINE CORPS RECREATION PROGRAMS\", \"source\": \"matamo\", \"date\": \"2021-09-23T16:11:31.850Z\"}], \"times_matched\": 1}}, \"S0000058\": {\"R0000022\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-06-08T13:55:23.000Z\", \"exact_matches\": [{\"exact_query\": \"'workflow' 'air force'\", \"exact_result\": \"AFI 90-201\", \"source\": \"user_history\", \"date\": \"2021-06-08T13:55:23.000Z\"}], \"times_matched\": 1}}, \"S0000059\": {\"R0000014\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-05-26T14:07:23.000Z\", \"exact_matches\": [{\"exact_query\": \"climate change design standards\", \"exact_result\": \"2019\", \"source\": \"user_history\", \"date\": \"2021-05-26T14:07:23.000Z\"}], \"times_matched\": 1}}, \"S0000060\": {\"R0000049\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-10-28T19:02:17.000Z\", \"exact_matches\": [{\"exact_query\": \"'active shooter'\", \"exact_result\": \"AFI 31-118\", \"source\": \"user_history\", \"date\": \"2021-10-28T19:02:17.000Z\"}], \"times_matched\": 1}}, \"S0000061\": {\"R0000009\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-12-09T18:01:58.000Z\", \"exact_matches\": [{\"exact_query\": \"'intelligence and security'\", \"exact_result\": \"H.R. 5412 RH 117th\", \"source\": \"user_history\", \"date\": \"2021-12-09T18:01:58.000Z\"}], \"times_matched\": 1}}, \"S0000062\": {\"R0000008\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-09-09T18:56:29.000Z\", \"exact_matches\": [{\"exact_query\": \"'cloud security'\", \"exact_result\": \"H.R 21 IH 117th\", \"source\": \"user_history\", \"date\": \"2021-09-09T18:56:29.000Z\"}], \"times_matched\": 1}}, \"S0000063\": {\"R0000016\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-10-18T16:53:57.000Z\", \"exact_matches\": [{\"exact_query\": \"dodi 5530.03 international agreements\", \"exact_result\": \"DAFMAN 10-703\", \"source\": \"user_history\", \"date\": \"2021-10-18T16:53:57.000Z\"}], \"times_matched\": 1}}, \"S0000064\": {\"R0000041\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-08-27T16:39:26.347Z\", \"exact_matches\": [{\"exact_query\": \" senior accountable official\", \"exact_result\": \" DoDI 5010.40 Managers' Internal Control Program Procedures\", \"source\": \"matamo\", \"date\": \"2021-08-27T16:39:26.347Z\"}], \"times_matched\": 1}}, \"S0000065\": {\"R0000069\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-03-30T13:13:04.319Z\", \"exact_matches\": [{\"exact_query\": \" USAR\", \"exact_result\": \" CJCSI 3207.01C Department of Defense Support to Humanitarian Mine Action\", \"source\": \"matamo\", \"date\": \"2021-03-30T13:13:04.319Z\"}], \"times_matched\": 1}}, \"S0000066\": {\"R0000032\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-07-27T17:42:52.000Z\", \"exact_matches\": [{\"exact_query\": \"eo 13603\", \"exact_result\": \"EO 13603\", \"source\": \"user_history\", \"date\": \"2021-07-27T17:42:52.000Z\"}], \"times_matched\": 1}}, \"S0000067\": {\"R0000047\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-04-05T18:19:15.673Z\", \"exact_matches\": [{\"exact_query\": \" plain language policy\", \"exact_result\": \" DoDD 5110.04 Washington Headquarters Services (WHS)\", \"source\": \"matamo\", \"date\": \"2021-04-05T18:19:15.673Z\"}], \"times_matched\": 1}}, \"S0000068\": {\"R0000026\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-08-13T00:56:55.410Z\", \"exact_matches\": [{\"exact_query\": \" security cooperation\", \"exact_result\": \" Memo 2015 - CNAS - Security Cooperation and Assistance\", \"source\": \"matamo\", \"date\": \"2021-08-13T00:56:55.410Z\"}], \"times_matched\": 1}}, \"S0000069\": {\"R0000038\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-11-18T20:21:19.000Z\", \"exact_matches\": [{\"exact_query\": \"operational contractor support\", \"exact_result\": \"DoDI 8320.04 CH 3\", \"source\": \"user_history\", \"date\": \"2021-11-18T20:21:19.000Z\"}], \"times_matched\": 1}}}, \"correct\": {\"S0000000\": [\"R0000070\"], \"S0000001\": [\"R0000043\"], \"S0000002\": [\"R0000031\"], \"S0000003\": [\"R0000027\"], \"S0000004\": [\"R0000025\"], \"S0000005\": [\"R0000035\"], \"S0000006\": [\"R0000061\"], \"S0000007\": [\"R0000051\"], \"S0000008\": [\"R0000042\"], \"S0000009\": [\"R0000007\"], \"S0000010\": [\"R0000010\"], \"S0000011\": [\"R0000068\", \"R0000049\"], \"S0000012\": [\"R0000002\"], \"S0000013\": [\"R0000000\"], \"S0000014\": [\"R0000030\"], \"S0000015\": [\"R0000059\"], \"S0000016\": [\"R0000052\"], \"S0000017\": [\"R0000046\"], \"S0000018\": [\"R0000001\"], \"S0000019\": [\"R0000003\"], \"S0000020\": [\"R0000015\"], \"S0000021\": [\"R0000033\"], \"S0000022\": [\"R0000004\"], \"S0000023\": [\"R0000048\"], \"S0000024\": [\"R0000012\"], \"S0000025\": [\"R0000065\"], \"S0000026\": [\"R0000067\"], \"S0000027\": [\"R0000057\"], \"S0000028\": [\"R0000060\"], \"S0000029\": [\"R0000040\"], \"S0000030\": [\"R0000045\"], \"S0000031\": [\"R0000017\"], \"S0000032\": [\"R0000034\"], \"S0000033\": [\"R0000021\"], \"S0000034\": [\"R0000062\"], \"S0000035\": [\"R0000005\"], \"S0000036\": [\"R0000018\"], \"S0000037\": [\"R0000053\"], \"S0000038\": [\"R0000006\"], \"S0000039\": [\"R0000013\"], \"S0000040\": [\"R0000020\"], \"S0000041\": [\"R0000036\"], \"S0000042\": [\"R0000028\"], \"S0000043\": [\"R0000023\"], \"S0000044\": [\"R0000037\"], \"S0000045\": [\"R0000066\"], \"S0000046\": [\"R0000058\"], \"S0000047\": [\"R0000050\"], \"S0000048\": [\"R0000055\"], \"S0000049\": [\"R0000054\"], \"S0000050\": [\"R0000044\"], \"S0000051\": [\"R0000024\"], \"S0000052\": [\"R0000063\"], \"S0000053\": [\"R0000019\"], \"S0000054\": [\"R0000029\"], \"S0000055\": [\"R0000064\"], \"S0000056\": [\"R0000011\"], \"S0000057\": [\"R0000039\"], \"S0000058\": [\"R0000022\"], \"S0000059\": [\"R0000014\"], \"S0000060\": [\"R0000049\"], \"S0000061\": [\"R0000009\"], \"S0000062\": [\"R0000008\"], \"S0000063\": [\"R0000016\"], \"S0000064\": [\"R0000041\"], \"S0000065\": [\"R0000069\"], \"S0000066\": [\"R0000032\"], \"S0000067\": [\"R0000047\"], \"S0000068\": [\"R0000026\"], \"S0000069\": [\"R0000038\"]}, \"incorrect\": {}, \"correct_vals\": {\"restoration design phases\": [\"DoDFMR V2BCH13\"], \"5240.06\": [\"DoDD 5240.06 CH 3\"], \"synchronizer and intelligence\": [\" DoDD 5143.01 Under Secretary of Defense for Intelligence and Security (USD(I&S))\"], \"national guard and ngb\": [\" DoDD 5105.77 National Guard Bureau (NGB)\"], \"jtru\": [\"FRM 10, 1 1 2020 OCR\"], \"navy\": [\" OPNAVINST 4650.17 UNUSUALLY ARDUOUS SEA DUTY FOR TRAVEL AND TRANSPORTATION ENTITLEMENTS\"], \"bioenvironmental\": [\"AFMAN 32-1007\"], \"36-2623\": [\"AFH 1\"], \"geospatial\": [\" DoDD 5105.60 National Geospatial-Intelligence Agency (NGA)\"], \"majcom patch, ocp\": [\"AFI 36-2903\"], \"mip\": [\"DoDD 5205.12 CH 2\"], \"telework\": [\" AFI 36-816 Civilian Telework Program\", \"AFI 31-118\"], \"undersecretary of defense for intelligence and security\": [\"CFR-2021-title6-vol1\"], \"cyber range\": [\" CJCSM 3500.03E Joint Training Manual for the Armed Forces of the United States\"], \"ncis\": [\"SECNAVINST 5300.28F\"], \"downgrade award appeal\": [\"AR 600-8-22\"], \"time in training 3dxxx\": [\"CFETP 3DXXX\"], \"bcac approval\": [\"DoDI 5000.75 CH 2\"], \"covid 19\": [\"EO 13962\"], \"aeronautical navigation aafif\": [\"TC 3-04.15\"], \"safety and occupational health\": [\" SECNAVINST 5100.10K DEPARTMENT OF THE NAVY SAFETY PROGRAM\"], \"medical testing for dod civilian workers\": [\"AFMAN 48-146\"], \"jcsfl mission partner\": [\"CJCSM 3265.01A\"], \"dodi 1322.29\": [\"DoDI 1322.29 CH 1\"], \"employed in place\": [\"AFI 10-201\"], \"manpower\": [\"DEP SEC DEF Memo, Total Force Manpower Governance for OSD, Defense Agencies, and DoD Field Activities, 6 6 2017 OCR\"], \"sexual assault prevention\": [\" SECNAVINST 1752.4C SEXUAL ASSAULT PREVENTION AND RESPONSE PROGRAM PROCEDURES\"], \"cwmd unity of effort\": [\"DEP SEC DEF Memo, (UFOUO) Facilitating Unity of Effort to Advance Countering Weapons of Mass Destruction Objectives, 4 9 2018 OCR\"], \"fire extinguisher inspection\": [\"AFMAN 91-203\"], \"deployed manpower agency\": [\"AFI 10-403\"], \"international cooperative administrative support services (icass)\": [\" DoDI 7060.06 International Cooperative Administrative Support Services (ICASS)\"], \"suspension of favorable personnel actions (flag) officer retirement\": [\"AR 135-175\"], \"artificial intelligence\": [\"DoD Support for the National Security Commission on Artificial Intelligence\"], \"cloud computing\": [\"DoDI 5000.74\"], \"wet wing\": [\" AFI 11-235 SPECIALIZED REFUELING OPERATIONS\"], \"space governance committee\": [\"DEP SEC DEF Memo, Space Organization and Management Tasks, 9 10 2018 OCR\"], \"s-cat\": [\"Defense Acquisition Services\"], \"cfetp 3e5x1\": [\"CFETP 3E5X1\"], \"risk matrix\": [\"Framework for Risk Categorization for Use During Independent Technical Risk Assessments\"], \"csa\": [\" DoDD 3000.06 Combat Support Agencies (CSAs)\"], \"36-1-191\": [\"QTP 24-3-C355\"], \"who is sergeant major of the army\": [\" DoDI 1000.01 Identification (ID) Cards Required by the Geneva Conventions\"], \"use of alcohol and events\": [\" AFI 34-219 ALCOHOLIC BEVERAGE PROGRAM\"], \"interoperability\": [\" CJCSI 5123.01H Charter of the Joint Requirements Oversight Council (JROC) and the Implementation of the Joint Capabilities Integration and Development System\"], \"twcf\": [\"DoDI 4515.13 CH 5\"], \"control system automation\": [\" CIM 11000.7 FACILITIES ENERGY MANUAL\"], \"farp\": [\"AFMAN 11-2MC-130HV3CL-5\"], \"dcma\": [\" Memo Joint Officer Handbook - Staffing and Action Guide (2011)\"], \"military\": [\" DoDI 1332.45 Retention Determinations for Non-Deployable Service Members\"], \"the mexico city policy\": [\"H.R 2003 IH 117th\"], \"title 10 section 2222\": [\"Title 43\"], \"dodd 5200.27\": [\"AR 381-10\"], \"dod 4140.26\": [\"DoDM 4140.26 Volume 3\"], \"dodi 8510.01 risk management framework (rmf) for dod information technology (it)\": [\"DoDI 8510.01 CH 3\"], \"combatant command and intelligence\": [\"DoDM 3305.09 CH 2\"], \"open source software memo\": [\"DEP SEC DEF Memo, Public-Private Talent Exchange, 7 19 2018 OCR\"], \"rest and recuperation\": [\"AFI 36-815\"], \"physical fitness\": [\" MCO 1700.39 MARINE CORPS RECREATION PROGRAMS\"], \"workflow air force\": [\"AFI 90-201\"], \"climate change design standards\": [\"2019\"], \"active shooter\": [\"AFI 31-118\"], \"intelligence and security\": [\"H.R. 5412 RH 117th\"], \"cloud security\": [\"H.R 21 IH 117th\"], \"dodi 5530.03 international agreements\": [\"DAFMAN 10-703\"], \"senior accountable official\": [\" DoDI 5010.40 Managers' Internal Control Program Procedures\"], \"usar\": [\" CJCSI 3207.01C Department of Defense Support to Humanitarian Mine Action\"], \"eo 13603\": [\"EO 13603\"], \"plain language policy\": [\" DoDD 5110.04 Washington Headquarters Services (WHS)\"], \"security cooperation\": [\" Memo 2015 - CNAS - Security Cooperation and Assistance\"], \"operational contractor support\": [\"DoDI 8320.04 CH 3\"]}, \"incorrect_vals\": {}}" \ No newline at end of file diff --git a/gamechangerml/data/test_data/test_validation/any/intelligent_search_metadata.json b/gamechangerml/data/test_data/test_validation/any/intelligent_search_metadata.json new file mode 100644 index 00000000..3340adca --- /dev/null +++ b/gamechangerml/data/test_data/test_validation/any/intelligent_search_metadata.json @@ -0,0 +1 @@ +{"date_created": "2022-03-09", "level": "any", "number_queries": 70, "number_documents": 71, "number_correct": 70, "number_incorrect": 0, "start_date": "2020-12-01", "end_date": "2025-12-01", "exclude_searches": ["pizza", "shark"], "min_correct_matches": 0, "max_results": 100, "filter_queries": "False"} \ No newline at end of file diff --git a/gamechangerml/data/test_data/test_validation/gold/intelligent_search_data.json b/gamechangerml/data/test_data/test_validation/gold/intelligent_search_data.json new file mode 100644 index 00000000..f4bd8cad --- /dev/null +++ b/gamechangerml/data/test_data/test_validation/gold/intelligent_search_data.json @@ -0,0 +1 @@ +"{\"queries\": {\"S0000000\": \"restoration design phases\", \"S0000001\": \"5240.06\", \"S0000002\": \"synchronizer and intelligence\", \"S0000003\": \"national guard and ngb\", \"S0000004\": \"jtru\", \"S0000005\": \"navy\", \"S0000006\": \"bioenvironmental\", \"S0000007\": \"36-2623\", \"S0000008\": \"geospatial\", \"S0000009\": \"majcom patch, ocp\", \"S0000010\": \"mip\", \"S0000011\": \"telework\", \"S0000012\": \"undersecretary of defense for intelligence and security\", \"S0000013\": \"cyber range\", \"S0000014\": \"ncis\", \"S0000015\": \"downgrade award appeal\", \"S0000016\": \"time in training 3dxxx\", \"S0000017\": \"bcac approval\", \"S0000018\": \"covid 19\", \"S0000019\": \"aeronautical navigation aafif\", \"S0000020\": \"safety and occupational health\", \"S0000021\": \"medical testing for dod civilian workers\", \"S0000022\": \"jcsfl mission partner\", \"S0000023\": \"dodi 1322.29\", \"S0000024\": \"employed in place\", \"S0000025\": \"manpower\", \"S0000026\": \"sexual assault prevention\", \"S0000027\": \"cwmd unity of effort\", \"S0000028\": \"fire extinguisher inspection\", \"S0000029\": \"deployed manpower agency\", \"S0000030\": \"international cooperative administrative support services (icass)\", \"S0000031\": \"suspension of favorable personnel actions (flag) officer retirement\", \"S0000032\": \"artificial intelligence\", \"S0000033\": \"cloud computing\", \"S0000034\": \"wet wing\", \"S0000035\": \"space governance committee\", \"S0000036\": \"s-cat\", \"S0000037\": \"cfetp 3e5x1\", \"S0000038\": \"risk matrix\", \"S0000039\": \"csa\", \"S0000040\": \"36-1-191\", \"S0000041\": \"who is sergeant major of the army\", \"S0000042\": \"use of alcohol and events\", \"S0000043\": \"interoperability\", \"S0000044\": \"twcf\", \"S0000045\": \"control system automation\", \"S0000046\": \"farp\", \"S0000047\": \"dcma\", \"S0000048\": \"military\", \"S0000049\": \"the mexico city policy\", \"S0000050\": \"title 10 section 2222\", \"S0000051\": \"dodd 5200.27\", \"S0000052\": \"dod 4140.26\", \"S0000053\": \"dodi 8510.01 risk management framework (rmf) for dod information technology (it)\", \"S0000054\": \"combatant command and intelligence\", \"S0000055\": \"open source software memo\", \"S0000056\": \"rest and recuperation\", \"S0000057\": \"physical fitness\", \"S0000058\": \"workflow air force\", \"S0000059\": \"climate change design standards\", \"S0000060\": \"active shooter\", \"S0000061\": \"intelligence and security\", \"S0000062\": \"cloud security\", \"S0000063\": \"dodi 5530.03 international agreements\", \"S0000064\": \"senior accountable official\", \"S0000065\": \"usar\", \"S0000066\": \"eo 13603\", \"S0000067\": \"plain language policy\", \"S0000068\": \"security cooperation\", \"S0000069\": \"operational contractor support\"}, \"collection\": {\"R0000000\": \" CJCSM 3500.03E Joint Training Manual for the Armed Forces of the United States\", \"R0000001\": \"EO 13962\", \"R0000002\": \"CFR-2021-title6-vol1\", \"R0000003\": \"TC 3-04.15\", \"R0000004\": \"CJCSM 3265.01A\", \"R0000005\": \"DEP SEC DEF Memo, Space Organization and Management Tasks, 9 10 2018 OCR\", \"R0000006\": \"Framework for Risk Categorization for Use During Independent Technical Risk Assessments\", \"R0000007\": \"AFI 36-2903\", \"R0000008\": \"H.R 21 IH 117th\", \"R0000009\": \"H.R. 5412 RH 117th\", \"R0000010\": \"DoDD 5205.12 CH 2\", \"R0000011\": \"AFI 36-815\", \"R0000012\": \"AFI 10-201\", \"R0000013\": \" DoDD 3000.06 Combat Support Agencies (CSAs)\", \"R0000014\": \"2019\", \"R0000015\": \" SECNAVINST 5100.10K DEPARTMENT OF THE NAVY SAFETY PROGRAM\", \"R0000016\": \"DAFMAN 10-703\", \"R0000017\": \"AR 135-175\", \"R0000018\": \"Defense Acquisition Services\", \"R0000019\": \"DoDI 8510.01 CH 3\", \"R0000020\": \"QTP 24-3-C355\", \"R0000021\": \"DoDI 5000.74\", \"R0000022\": \"AFI 90-201\", \"R0000023\": \" CJCSI 5123.01H Charter of the Joint Requirements Oversight Council (JROC) and the Implementation of the Joint Capabilities Integration and Development System\", \"R0000024\": \"AR 381-10\", \"R0000025\": \"FRM 10, 1 1 2020 OCR\", \"R0000026\": \" Memo 2015 - CNAS - Security Cooperation and Assistance\", \"R0000027\": \" DoDD 5105.77 National Guard Bureau (NGB)\", \"R0000028\": \" AFI 34-219 ALCOHOLIC BEVERAGE PROGRAM\", \"R0000029\": \"DoDM 3305.09 CH 2\", \"R0000030\": \"SECNAVINST 5300.28F\", \"R0000031\": \" DoDD 5143.01 Under Secretary of Defense for Intelligence and Security (USD(I&S))\", \"R0000032\": \"EO 13603\", \"R0000033\": \"AFMAN 48-146\", \"R0000034\": \"DoD Support for the National Security Commission on Artificial Intelligence\", \"R0000035\": \" OPNAVINST 4650.17 UNUSUALLY ARDUOUS SEA DUTY FOR TRAVEL AND TRANSPORTATION ENTITLEMENTS\", \"R0000036\": \" DoDI 1000.01 Identification (ID) Cards Required by the Geneva Conventions\", \"R0000037\": \"DoDI 4515.13 CH 5\", \"R0000038\": \"DoDI 8320.04 CH 3\", \"R0000039\": \" MCO 1700.39 MARINE CORPS RECREATION PROGRAMS\", \"R0000040\": \"AFI 10-403\", \"R0000041\": \" DoDI 5010.40 Managers' Internal Control Program Procedures\", \"R0000042\": \" DoDD 5105.60 National Geospatial-Intelligence Agency (NGA)\", \"R0000043\": \"DoDD 5240.06 CH 3\", \"R0000044\": \"Title 43\", \"R0000045\": \" DoDI 7060.06 International Cooperative Administrative Support Services (ICASS)\", \"R0000046\": \"DoDI 5000.75 CH 2\", \"R0000047\": \" DoDD 5110.04 Washington Headquarters Services (WHS)\", \"R0000048\": \"DoDI 1322.29 CH 1\", \"R0000049\": \"AFI 31-118\", \"R0000050\": \" Memo Joint Officer Handbook - Staffing and Action Guide (2011)\", \"R0000051\": \"AFH 1\", \"R0000052\": \"CFETP 3DXXX\", \"R0000053\": \"CFETP 3E5X1\", \"R0000054\": \"H.R 2003 IH 117th\", \"R0000055\": \" DoDI 1332.45 Retention Determinations for Non-Deployable Service Members\", \"R0000056\": \" MISC PUBS GREECE\", \"R0000057\": \"DEP SEC DEF Memo, (UFOUO) Facilitating Unity of Effort to Advance Countering Weapons of Mass Destruction Objectives, 4 9 2018 OCR\", \"R0000058\": \"AFMAN 11-2MC-130HV3CL-5\", \"R0000059\": \"AR 600-8-22\", \"R0000060\": \"AFMAN 91-203\", \"R0000061\": \"AFMAN 32-1007\", \"R0000062\": \" AFI 11-235 SPECIALIZED REFUELING OPERATIONS\", \"R0000063\": \"DoDM 4140.26 Volume 3\", \"R0000064\": \"DEP SEC DEF Memo, Public-Private Talent Exchange, 7 19 2018 OCR\", \"R0000065\": \"DEP SEC DEF Memo, Total Force Manpower Governance for OSD, Defense Agencies, and DoD Field Activities, 6 6 2017 OCR\", \"R0000066\": \" CIM 11000.7 FACILITIES ENERGY MANUAL\", \"R0000067\": \" SECNAVINST 1752.4C SEXUAL ASSAULT PREVENTION AND RESPONSE PROGRAM PROCEDURES\", \"R0000068\": \" AFI 36-816 Civilian Telework Program\", \"R0000069\": \" CJCSI 3207.01C Department of Defense Support to Humanitarian Mine Action\", \"R0000070\": \"DoDFMR V2BCH13\"}, \"meta_relations\": {\"S0000000\": {\"R0000070\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-10-18T16:32:48.000Z\", \"exact_matches\": [{\"exact_query\": \"restoration design phases\", \"exact_result\": \"DoDFMR V2BCH13\", \"source\": \"user_history\", \"date\": \"2021-10-18T16:32:48.000Z\"}], \"times_matched\": 1}}, \"S0000001\": {\"R0000043\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-10-14T16:04:21.000Z\", \"exact_matches\": [{\"exact_query\": \"5240.06\", \"exact_result\": \"DoDD 5240.06 CH 3\", \"source\": \"user_history\", \"date\": \"2021-10-14T16:04:21.000Z\"}], \"times_matched\": 1}}, \"S0000002\": {\"R0000031\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-08-09T14:02:18.534Z\", \"exact_matches\": [{\"exact_query\": \" \\\"synchronizer\\\" and \\\"intelligence\\\"\", \"exact_result\": \" DoDD 5143.01 Under Secretary of Defense for Intelligence and Security (USD(I&S))\", \"source\": \"matamo\", \"date\": \"2021-08-09T14:02:18.534Z\"}], \"times_matched\": 1}}, \"S0000003\": {\"R0000027\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-04-14T18:18:26.068Z\", \"exact_matches\": [{\"exact_query\": \" \\\"National Guard\\\" and NGB\", \"exact_result\": \" DoDD 5105.77 National Guard Bureau (NGB)\", \"source\": \"matamo\", \"date\": \"2021-04-14T18:18:26.068Z\"}], \"times_matched\": 1}}, \"S0000004\": {\"R0000025\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-04-28T21:49:46.000Z\", \"exact_matches\": [{\"exact_query\": \"jtru\", \"exact_result\": \"FRM 10, 1 1 2020 OCR\", \"source\": \"user_history\", \"date\": \"2021-04-28T21:49:46.000Z\"}], \"times_matched\": 1}}, \"S0000005\": {\"R0000035\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-08-03T19:02:06.140Z\", \"exact_matches\": [{\"exact_query\": \" navy\", \"exact_result\": \" OPNAVINST 4650.17 UNUSUALLY ARDUOUS SEA DUTY FOR TRAVEL AND TRANSPORTATION ENTITLEMENTS\", \"source\": \"matamo\", \"date\": \"2021-08-03T19:02:06.140Z\"}], \"times_matched\": 1}}, \"S0000006\": {\"R0000061\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-11-04T23:17:18.000Z\", \"exact_matches\": [{\"exact_query\": \"bioenvironmental\", \"exact_result\": \"AFMAN 32-1007\", \"source\": \"user_history\", \"date\": \"2021-11-04T23:17:18.000Z\"}], \"times_matched\": 1}}, \"S0000007\": {\"R0000051\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-12-09T18:33:48.000Z\", \"exact_matches\": [{\"exact_query\": \"36-2623\", \"exact_result\": \"AFH 1\", \"source\": \"user_history\", \"date\": \"2021-12-09T18:33:48.000Z\"}], \"times_matched\": 1}}, \"S0000008\": {\"R0000042\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-05-21T16:25:13.292Z\", \"exact_matches\": [{\"exact_query\": \" geospatial\", \"exact_result\": \" DoDD 5105.60 National Geospatial-Intelligence Agency (NGA)\", \"source\": \"matamo\", \"date\": \"2021-05-21T16:25:13.292Z\"}], \"times_matched\": 1}}, \"S0000009\": {\"R0000007\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-07-22T00:56:25.000Z\", \"exact_matches\": [{\"exact_query\": \"majcom patch, ocp\", \"exact_result\": \"AFI 36-2903\", \"source\": \"user_history\", \"date\": \"2021-07-22T00:56:25.000Z\"}], \"times_matched\": 1}}, \"S0000010\": {\"R0000010\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-06-08T18:08:00.000Z\", \"exact_matches\": [{\"exact_query\": \"mip\", \"exact_result\": \"DoDD 5205.12 CH 2\", \"source\": \"user_history\", \"date\": \"2021-06-08T18:08:00.000Z\"}], \"times_matched\": 1}}, \"S0000011\": {\"R0000068\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-08-03T19:00:42.759Z\", \"exact_matches\": [{\"exact_query\": \" telework\", \"exact_result\": \" AFI 36-816 Civilian Telework Program\", \"source\": \"matamo\", \"date\": \"2021-08-03T19:00:42.759Z\"}, {\"exact_query\": \" telework\", \"exact_result\": \" AFI 36-816 Civilian Telework Program\", \"source\": \"matamo\", \"date\": \"2021-09-23T14:41:50.557Z\"}], \"times_matched\": 2}, \"R0000049\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-08-03T19:00:42.759Z\", \"exact_matches\": [{\"exact_query\": \"telework\", \"exact_result\": \"AFI 31-118\", \"source\": \"user_history\", \"date\": \"2021-12-06T21:55:47.000Z\"}, {\"exact_query\": \"telework\", \"exact_result\": \"AFI 31-118\", \"source\": \"user_history\", \"date\": \"2021-12-08T15:16:38.000Z\"}], \"times_matched\": 2}}, \"S0000012\": {\"R0000002\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-11-16T13:21:50.000Z\", \"exact_matches\": [{\"exact_query\": \"undersecretary of defense for intelligence and security\", \"exact_result\": \"CFR-2021-title6-vol1\", \"source\": \"user_history\", \"date\": \"2021-11-16T13:21:50.000Z\"}], \"times_matched\": 1}}, \"S0000013\": {\"R0000000\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-04-05T18:16:55.146Z\", \"exact_matches\": [{\"exact_query\": \" cyber range\", \"exact_result\": \" CJCSM 3500.03E Joint Training Manual for the Armed Forces of the United States\", \"source\": \"matamo\", \"date\": \"2021-04-05T18:16:55.146Z\"}], \"times_matched\": 1}}, \"S0000014\": {\"R0000030\": {\"correct_match\": \"true\", \"last_match_date\": \"2022-01-07T16:29:02.000Z\", \"exact_matches\": [{\"exact_query\": \"ncis\", \"exact_result\": \"SECNAVINST 5300.28F\", \"source\": \"user_history\", \"date\": \"2022-01-07T16:29:02.000Z\"}], \"times_matched\": 1}}, \"S0000015\": {\"R0000059\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-08-04T19:24:44.000Z\", \"exact_matches\": [{\"exact_query\": \"downgrade award appeal\", \"exact_result\": \"AR 600-8-22\", \"source\": \"user_history\", \"date\": \"2021-08-04T19:24:44.000Z\"}], \"times_matched\": 1}}, \"S0000016\": {\"R0000052\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-08-18T13:09:38.000Z\", \"exact_matches\": [{\"exact_query\": \"time in training 3dxxx\", \"exact_result\": \"CFETP 3DXXX\", \"source\": \"user_history\", \"date\": \"2021-08-18T13:09:38.000Z\"}], \"times_matched\": 1}}, \"S0000017\": {\"R0000046\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-07-14T17:56:46.000Z\", \"exact_matches\": [{\"exact_query\": \"bcac approval\", \"exact_result\": \"DoDI 5000.75 CH 2\", \"source\": \"user_history\", \"date\": \"2021-07-14T17:56:46.000Z\"}], \"times_matched\": 1}}, \"S0000018\": {\"R0000001\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-10-27T17:24:38.000Z\", \"exact_matches\": [{\"exact_query\": \"covid 19\", \"exact_result\": \"EO 13962\", \"source\": \"user_history\", \"date\": \"2021-10-27T17:24:38.000Z\"}], \"times_matched\": 1}}, \"S0000019\": {\"R0000003\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-08-09T16:43:02.000Z\", \"exact_matches\": [{\"exact_query\": \"aeronautical navigation aafif\", \"exact_result\": \"TC 3-04.15\", \"source\": \"user_history\", \"date\": \"2021-08-09T16:43:02.000Z\"}], \"times_matched\": 1}}, \"S0000020\": {\"R0000015\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-05-11T19:38:12.825Z\", \"exact_matches\": [{\"exact_query\": \" safety and occupational health\", \"exact_result\": \" SECNAVINST 5100.10K DEPARTMENT OF THE NAVY SAFETY PROGRAM\", \"source\": \"matamo\", \"date\": \"2021-05-11T19:38:12.825Z\"}], \"times_matched\": 1}}, \"S0000021\": {\"R0000033\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-10-28T17:58:15.000Z\", \"exact_matches\": [{\"exact_query\": \"medical testing for dod civilian workers\", \"exact_result\": \"AFMAN 48-146\", \"source\": \"user_history\", \"date\": \"2021-10-28T17:58:15.000Z\"}], \"times_matched\": 1}}, \"S0000022\": {\"R0000004\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-05-28T10:52:59.000Z\", \"exact_matches\": [{\"exact_query\": \"jcsfl mission partner\", \"exact_result\": \"CJCSM 3265.01A\", \"source\": \"user_history\", \"date\": \"2021-05-28T10:52:59.000Z\"}], \"times_matched\": 1}}, \"S0000023\": {\"R0000048\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-08-11T13:45:54.000Z\", \"exact_matches\": [{\"exact_query\": \"dodi 1322.29\", \"exact_result\": \"DoDI 1322.29 CH 1\", \"source\": \"user_history\", \"date\": \"2021-08-11T13:45:54.000Z\"}], \"times_matched\": 1}}, \"S0000024\": {\"R0000012\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-12-15T15:35:45.000Z\", \"exact_matches\": [{\"exact_query\": \"'employed in place'\", \"exact_result\": \"AFI 10-201\", \"source\": \"user_history\", \"date\": \"2021-12-15T15:35:45.000Z\"}], \"times_matched\": 1}}, \"S0000025\": {\"R0000065\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-05-12T16:33:03.000Z\", \"exact_matches\": [{\"exact_query\": \"manpower\", \"exact_result\": \"DEP SEC DEF Memo, Total Force Manpower Governance for OSD, Defense Agencies, and DoD Field Activities, 6 6 2017 OCR\", \"source\": \"user_history\", \"date\": \"2021-05-12T16:33:03.000Z\"}], \"times_matched\": 1}}, \"S0000026\": {\"R0000067\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-04-21T13:13:18.303Z\", \"exact_matches\": [{\"exact_query\": \" sexual assault prevention\", \"exact_result\": \" SECNAVINST 1752.4C SEXUAL ASSAULT PREVENTION AND RESPONSE PROGRAM PROCEDURES\", \"source\": \"matamo\", \"date\": \"2021-04-21T13:13:18.303Z\"}], \"times_matched\": 1}}, \"S0000027\": {\"R0000057\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-06-16T15:46:57.000Z\", \"exact_matches\": [{\"exact_query\": \"cwmd unity of effort\", \"exact_result\": \"DEP SEC DEF Memo, (UFOUO) Facilitating Unity of Effort to Advance Countering Weapons of Mass Destruction Objectives, 4 9 2018 OCR\", \"source\": \"user_history\", \"date\": \"2021-06-16T15:46:57.000Z\"}], \"times_matched\": 1}}, \"S0000028\": {\"R0000060\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-08-26T20:51:45.000Z\", \"exact_matches\": [{\"exact_query\": \"fire extinguisher inspection\", \"exact_result\": \"AFMAN 91-203\", \"source\": \"user_history\", \"date\": \"2021-08-26T20:51:45.000Z\"}], \"times_matched\": 1}}, \"S0000029\": {\"R0000040\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-09-21T14:43:02.000Z\", \"exact_matches\": [{\"exact_query\": \"deployed manpower agency\", \"exact_result\": \"AFI 10-403\", \"source\": \"user_history\", \"date\": \"2021-09-21T14:43:02.000Z\"}], \"times_matched\": 1}}, \"S0000030\": {\"R0000045\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-08-03T12:50:35.464Z\", \"exact_matches\": [{\"exact_query\": \" International Cooperative Administrative Support Services (ICASS)\", \"exact_result\": \" DoDI 7060.06 International Cooperative Administrative Support Services (ICASS)\", \"source\": \"matamo\", \"date\": \"2021-08-03T12:50:35.464Z\"}], \"times_matched\": 1}}, \"S0000031\": {\"R0000017\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-06-10T12:39:14.000Z\", \"exact_matches\": [{\"exact_query\": \"suspension of favorable personnel actions (flag) officer retirement\", \"exact_result\": \"AR 135-175\", \"source\": \"user_history\", \"date\": \"2021-06-10T12:39:14.000Z\"}], \"times_matched\": 1}}, \"S0000032\": {\"R0000034\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-06-24T19:08:35.000Z\", \"exact_matches\": [{\"exact_query\": \"'artificial intelligence'\", \"exact_result\": \"DoD Support for the National Security Commission on Artificial Intelligence\", \"source\": \"user_history\", \"date\": \"2021-06-24T19:08:35.000Z\"}], \"times_matched\": 1}}, \"S0000033\": {\"R0000021\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-11-22T22:20:21.000Z\", \"exact_matches\": [{\"exact_query\": \"cloud computing\", \"exact_result\": \"DoDI 5000.74\", \"source\": \"user_history\", \"date\": \"2021-11-22T22:20:21.000Z\"}], \"times_matched\": 1}}, \"S0000034\": {\"R0000062\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-04-22T12:30:23.340Z\", \"exact_matches\": [{\"exact_query\": \" wet wing\", \"exact_result\": \" AFI 11-235 SPECIALIZED REFUELING OPERATIONS\", \"source\": \"matamo\", \"date\": \"2021-04-22T12:30:23.340Z\"}], \"times_matched\": 1}}, \"S0000035\": {\"R0000005\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-11-17T16:10:49.000Z\", \"exact_matches\": [{\"exact_query\": \"'space governance committee'\", \"exact_result\": \"DEP SEC DEF Memo, Space Organization and Management Tasks, 9 10 2018 OCR\", \"source\": \"user_history\", \"date\": \"2021-11-17T16:10:49.000Z\"}], \"times_matched\": 1}}, \"S0000036\": {\"R0000018\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-09-28T19:31:50.000Z\", \"exact_matches\": [{\"exact_query\": \"'s-cat'\", \"exact_result\": \"Defense Acquisition Services\", \"source\": \"user_history\", \"date\": \"2021-09-28T19:31:50.000Z\"}], \"times_matched\": 1}}, \"S0000037\": {\"R0000053\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-07-09T12:46:15.000Z\", \"exact_matches\": [{\"exact_query\": \"cfetp 3e5x1\", \"exact_result\": \"CFETP 3E5X1\", \"source\": \"user_history\", \"date\": \"2021-07-09T12:46:15.000Z\"}], \"times_matched\": 1}}, \"S0000038\": {\"R0000006\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-05-04T22:24:40.000Z\", \"exact_matches\": [{\"exact_query\": \"risk matrix\", \"exact_result\": \"Framework for Risk Categorization for Use During Independent Technical Risk Assessments\", \"source\": \"user_history\", \"date\": \"2021-05-04T22:24:40.000Z\"}], \"times_matched\": 1}}, \"S0000039\": {\"R0000013\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-04-15T11:42:48.684Z\", \"exact_matches\": [{\"exact_query\": \" CSA\", \"exact_result\": \" DoDD 3000.06 Combat Support Agencies (CSAs)\", \"source\": \"matamo\", \"date\": \"2021-04-15T11:42:48.684Z\"}], \"times_matched\": 1}}, \"S0000040\": {\"R0000020\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-08-04T19:28:07.000Z\", \"exact_matches\": [{\"exact_query\": \"36-1-191\", \"exact_result\": \"QTP 24-3-C355\", \"source\": \"user_history\", \"date\": \"2021-08-04T19:28:07.000Z\"}], \"times_matched\": 1}}, \"S0000041\": {\"R0000036\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-06-24T19:45:57.054Z\", \"exact_matches\": [{\"exact_query\": \" who is sergeant major of the army\", \"exact_result\": \" DoDI 1000.01 Identification (ID) Cards Required by the Geneva Conventions\", \"source\": \"matamo\", \"date\": \"2021-06-24T19:45:57.054Z\"}], \"times_matched\": 1}}, \"S0000042\": {\"R0000028\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-07-21T13:48:49.176Z\", \"exact_matches\": [{\"exact_query\": \" \\\"use of alcohol\\\" and events\", \"exact_result\": \" AFI 34-219 ALCOHOLIC BEVERAGE PROGRAM\", \"source\": \"matamo\", \"date\": \"2021-07-21T13:48:49.176Z\"}], \"times_matched\": 1}}, \"S0000043\": {\"R0000023\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-05-18T15:35:06.388Z\", \"exact_matches\": [{\"exact_query\": \" interoperability\", \"exact_result\": \" CJCSI 5123.01H Charter of the Joint Requirements Oversight Council (JROC) and the Implementation of the Joint Capabilities Integration and Development System\", \"source\": \"matamo\", \"date\": \"2021-05-18T15:35:06.388Z\"}], \"times_matched\": 1}}, \"S0000044\": {\"R0000037\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-07-28T17:13:12.000Z\", \"exact_matches\": [{\"exact_query\": \"twcf\", \"exact_result\": \"DoDI 4515.13 CH 5\", \"source\": \"user_history\", \"date\": \"2021-07-28T17:13:12.000Z\"}], \"times_matched\": 1}}, \"S0000045\": {\"R0000066\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-08-25T18:08:27.541Z\", \"exact_matches\": [{\"exact_query\": \" control system automation\", \"exact_result\": \" CIM 11000.7 FACILITIES ENERGY MANUAL\", \"source\": \"matamo\", \"date\": \"2021-08-25T18:08:27.541Z\"}], \"times_matched\": 1}}, \"S0000046\": {\"R0000058\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-08-05T01:42:28.000Z\", \"exact_matches\": [{\"exact_query\": \"farp\", \"exact_result\": \"AFMAN 11-2MC-130HV3CL-5\", \"source\": \"user_history\", \"date\": \"2021-08-05T01:42:28.000Z\"}], \"times_matched\": 1}}, \"S0000047\": {\"R0000050\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-08-31T17:14:00.347Z\", \"exact_matches\": [{\"exact_query\": \" dcma\", \"exact_result\": \" Memo Joint Officer Handbook - Staffing and Action Guide (2011)\", \"source\": \"matamo\", \"date\": \"2021-08-31T17:14:00.347Z\"}], \"times_matched\": 1}}, \"S0000048\": {\"R0000055\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-09-08T23:54:40.935Z\", \"exact_matches\": [{\"exact_query\": \" military\", \"exact_result\": \" DoDI 1332.45 Retention Determinations for Non-Deployable Service Members\", \"source\": \"matamo\", \"date\": \"2021-09-08T23:54:40.935Z\"}], \"times_matched\": 1}}, \"S0000049\": {\"R0000054\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-06-28T19:49:49.000Z\", \"exact_matches\": [{\"exact_query\": \"the mexico city policy\", \"exact_result\": \"H.R 2003 IH 117th\", \"source\": \"user_history\", \"date\": \"2021-06-28T19:49:49.000Z\"}], \"times_matched\": 1}}, \"S0000050\": {\"R0000044\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-10-27T17:37:29.000Z\", \"exact_matches\": [{\"exact_query\": \"title 10 section 2222\", \"exact_result\": \"Title 43\", \"source\": \"user_history\", \"date\": \"2021-10-27T17:37:29.000Z\"}], \"times_matched\": 1}}, \"S0000051\": {\"R0000024\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-07-07T20:45:29.000Z\", \"exact_matches\": [{\"exact_query\": \"'dodd 5200.27'\", \"exact_result\": \"AR 381-10\", \"source\": \"user_history\", \"date\": \"2021-07-07T20:45:29.000Z\"}], \"times_matched\": 1}}, \"S0000052\": {\"R0000063\": {\"correct_match\": \"true\", \"last_match_date\": \"2022-01-25T12:48:28.000Z\", \"exact_matches\": [{\"exact_query\": \"dod 4140.26\", \"exact_result\": \"DoDM 4140.26 Volume 3\", \"source\": \"user_history\", \"date\": \"2022-01-25T12:48:28.000Z\"}], \"times_matched\": 1}}, \"S0000053\": {\"R0000019\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-08-16T13:29:24.000Z\", \"exact_matches\": [{\"exact_query\": \"dodi 8510.01 risk management framework (rmf) for dod information technology (it)\", \"exact_result\": \"DoDI 8510.01 CH 3\", \"source\": \"user_history\", \"date\": \"2021-08-16T13:29:24.000Z\"}], \"times_matched\": 1}}, \"S0000054\": {\"R0000029\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-09-21T19:38:58.000Z\", \"exact_matches\": [{\"exact_query\": \"'combatant command ' and 'intelligence'\", \"exact_result\": \"DoDM 3305.09 CH 2\", \"source\": \"user_history\", \"date\": \"2021-09-21T19:38:58.000Z\"}], \"times_matched\": 1}}, \"S0000055\": {\"R0000064\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-11-30T14:50:43.000Z\", \"exact_matches\": [{\"exact_query\": \"open source software memo\", \"exact_result\": \"DEP SEC DEF Memo, Public-Private Talent Exchange, 7 19 2018 OCR\", \"source\": \"user_history\", \"date\": \"2021-11-30T14:50:43.000Z\"}], \"times_matched\": 1}}, \"S0000056\": {\"R0000011\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-12-08T22:40:02.000Z\", \"exact_matches\": [{\"exact_query\": \"rest and recuperation\", \"exact_result\": \"AFI 36-815\", \"source\": \"user_history\", \"date\": \"2021-12-08T22:40:02.000Z\"}], \"times_matched\": 1}}, \"S0000057\": {\"R0000039\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-09-23T16:11:31.850Z\", \"exact_matches\": [{\"exact_query\": \" physical fitness\", \"exact_result\": \" MCO 1700.39 MARINE CORPS RECREATION PROGRAMS\", \"source\": \"matamo\", \"date\": \"2021-09-23T16:11:31.850Z\"}], \"times_matched\": 1}}, \"S0000058\": {\"R0000022\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-06-08T13:55:23.000Z\", \"exact_matches\": [{\"exact_query\": \"'workflow' 'air force'\", \"exact_result\": \"AFI 90-201\", \"source\": \"user_history\", \"date\": \"2021-06-08T13:55:23.000Z\"}], \"times_matched\": 1}}, \"S0000059\": {\"R0000014\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-05-26T14:07:23.000Z\", \"exact_matches\": [{\"exact_query\": \"climate change design standards\", \"exact_result\": \"2019\", \"source\": \"user_history\", \"date\": \"2021-05-26T14:07:23.000Z\"}], \"times_matched\": 1}}, \"S0000060\": {\"R0000049\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-10-28T19:02:17.000Z\", \"exact_matches\": [{\"exact_query\": \"'active shooter'\", \"exact_result\": \"AFI 31-118\", \"source\": \"user_history\", \"date\": \"2021-10-28T19:02:17.000Z\"}], \"times_matched\": 1}}, \"S0000061\": {\"R0000009\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-12-09T18:01:58.000Z\", \"exact_matches\": [{\"exact_query\": \"'intelligence and security'\", \"exact_result\": \"H.R. 5412 RH 117th\", \"source\": \"user_history\", \"date\": \"2021-12-09T18:01:58.000Z\"}], \"times_matched\": 1}}, \"S0000062\": {\"R0000008\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-09-09T18:56:29.000Z\", \"exact_matches\": [{\"exact_query\": \"'cloud security'\", \"exact_result\": \"H.R 21 IH 117th\", \"source\": \"user_history\", \"date\": \"2021-09-09T18:56:29.000Z\"}], \"times_matched\": 1}}, \"S0000063\": {\"R0000016\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-10-18T16:53:57.000Z\", \"exact_matches\": [{\"exact_query\": \"dodi 5530.03 international agreements\", \"exact_result\": \"DAFMAN 10-703\", \"source\": \"user_history\", \"date\": \"2021-10-18T16:53:57.000Z\"}], \"times_matched\": 1}}, \"S0000064\": {\"R0000041\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-08-27T16:39:26.347Z\", \"exact_matches\": [{\"exact_query\": \" senior accountable official\", \"exact_result\": \" DoDI 5010.40 Managers' Internal Control Program Procedures\", \"source\": \"matamo\", \"date\": \"2021-08-27T16:39:26.347Z\"}], \"times_matched\": 1}}, \"S0000065\": {\"R0000069\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-03-30T13:13:04.319Z\", \"exact_matches\": [{\"exact_query\": \" USAR\", \"exact_result\": \" CJCSI 3207.01C Department of Defense Support to Humanitarian Mine Action\", \"source\": \"matamo\", \"date\": \"2021-03-30T13:13:04.319Z\"}], \"times_matched\": 1}}, \"S0000066\": {\"R0000032\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-07-27T17:42:52.000Z\", \"exact_matches\": [{\"exact_query\": \"eo 13603\", \"exact_result\": \"EO 13603\", \"source\": \"user_history\", \"date\": \"2021-07-27T17:42:52.000Z\"}], \"times_matched\": 1}}, \"S0000067\": {\"R0000047\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-04-05T18:19:15.673Z\", \"exact_matches\": [{\"exact_query\": \" plain language policy\", \"exact_result\": \" DoDD 5110.04 Washington Headquarters Services (WHS)\", \"source\": \"matamo\", \"date\": \"2021-04-05T18:19:15.673Z\"}], \"times_matched\": 1}}, \"S0000068\": {\"R0000026\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-08-13T00:56:55.410Z\", \"exact_matches\": [{\"exact_query\": \" security cooperation\", \"exact_result\": \" Memo 2015 - CNAS - Security Cooperation and Assistance\", \"source\": \"matamo\", \"date\": \"2021-08-13T00:56:55.410Z\"}], \"times_matched\": 1}}, \"S0000069\": {\"R0000038\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-11-18T20:21:19.000Z\", \"exact_matches\": [{\"exact_query\": \"operational contractor support\", \"exact_result\": \"DoDI 8320.04 CH 3\", \"source\": \"user_history\", \"date\": \"2021-11-18T20:21:19.000Z\"}], \"times_matched\": 1}}}, \"correct\": {\"S0000002\": [\"R0000031\"], \"S0000003\": [\"R0000027\"], \"S0000005\": [\"R0000035\"], \"S0000008\": [\"R0000042\"], \"S0000011\": [\"R0000068\"], \"S0000013\": [\"R0000000\"], \"S0000020\": [\"R0000015\"], \"S0000026\": [\"R0000067\"], \"S0000030\": [\"R0000045\"], \"S0000034\": [\"R0000062\"], \"S0000039\": [\"R0000013\"], \"S0000041\": [\"R0000036\"], \"S0000042\": [\"R0000028\"], \"S0000043\": [\"R0000023\"], \"S0000045\": [\"R0000066\"], \"S0000047\": [\"R0000050\"], \"S0000048\": [\"R0000055\"], \"S0000057\": [\"R0000039\"], \"S0000064\": [\"R0000041\"], \"S0000065\": [\"R0000069\"], \"S0000067\": [\"R0000047\"], \"S0000068\": [\"R0000026\"]}, \"incorrect\": {}, \"correct_vals\": {\"synchronizer and intelligence\": [\" DoDD 5143.01 Under Secretary of Defense for Intelligence and Security (USD(I&S))\"], \"national guard and ngb\": [\" DoDD 5105.77 National Guard Bureau (NGB)\"], \"navy\": [\" OPNAVINST 4650.17 UNUSUALLY ARDUOUS SEA DUTY FOR TRAVEL AND TRANSPORTATION ENTITLEMENTS\"], \"geospatial\": [\" DoDD 5105.60 National Geospatial-Intelligence Agency (NGA)\"], \"telework\": [\" AFI 36-816 Civilian Telework Program\"], \"cyber range\": [\" CJCSM 3500.03E Joint Training Manual for the Armed Forces of the United States\"], \"safety and occupational health\": [\" SECNAVINST 5100.10K DEPARTMENT OF THE NAVY SAFETY PROGRAM\"], \"sexual assault prevention\": [\" SECNAVINST 1752.4C SEXUAL ASSAULT PREVENTION AND RESPONSE PROGRAM PROCEDURES\"], \"international cooperative administrative support services (icass)\": [\" DoDI 7060.06 International Cooperative Administrative Support Services (ICASS)\"], \"wet wing\": [\" AFI 11-235 SPECIALIZED REFUELING OPERATIONS\"], \"csa\": [\" DoDD 3000.06 Combat Support Agencies (CSAs)\"], \"who is sergeant major of the army\": [\" DoDI 1000.01 Identification (ID) Cards Required by the Geneva Conventions\"], \"use of alcohol and events\": [\" AFI 34-219 ALCOHOLIC BEVERAGE PROGRAM\"], \"interoperability\": [\" CJCSI 5123.01H Charter of the Joint Requirements Oversight Council (JROC) and the Implementation of the Joint Capabilities Integration and Development System\"], \"control system automation\": [\" CIM 11000.7 FACILITIES ENERGY MANUAL\"], \"dcma\": [\" Memo Joint Officer Handbook - Staffing and Action Guide (2011)\"], \"military\": [\" DoDI 1332.45 Retention Determinations for Non-Deployable Service Members\"], \"physical fitness\": [\" MCO 1700.39 MARINE CORPS RECREATION PROGRAMS\"], \"senior accountable official\": [\" DoDI 5010.40 Managers' Internal Control Program Procedures\"], \"usar\": [\" CJCSI 3207.01C Department of Defense Support to Humanitarian Mine Action\"], \"plain language policy\": [\" DoDD 5110.04 Washington Headquarters Services (WHS)\"], \"security cooperation\": [\" Memo 2015 - CNAS - Security Cooperation and Assistance\"]}, \"incorrect_vals\": {}}" \ No newline at end of file diff --git a/gamechangerml/data/test_data/test_validation/gold/intelligent_search_metadata.json b/gamechangerml/data/test_data/test_validation/gold/intelligent_search_metadata.json new file mode 100644 index 00000000..5e22a06d --- /dev/null +++ b/gamechangerml/data/test_data/test_validation/gold/intelligent_search_metadata.json @@ -0,0 +1 @@ +{"date_created": "2022-03-09", "level": "gold", "number_queries": 70, "number_documents": 71, "number_correct": 22, "number_incorrect": 0, "start_date": "2020-12-01", "end_date": "2025-12-01", "exclude_searches": ["pizza", "shark"], "min_correct_matches": 3, "max_results": 7, "filter_queries": "False"} \ No newline at end of file diff --git a/gamechangerml/data/test_data/test_validation/silver/intelligent_search_data.json b/gamechangerml/data/test_data/test_validation/silver/intelligent_search_data.json new file mode 100644 index 00000000..1d57654e --- /dev/null +++ b/gamechangerml/data/test_data/test_validation/silver/intelligent_search_data.json @@ -0,0 +1 @@ +"{\"queries\": {\"S0000000\": \"restoration design phases\", \"S0000001\": \"5240.06\", \"S0000002\": \"synchronizer and intelligence\", \"S0000003\": \"national guard and ngb\", \"S0000004\": \"jtru\", \"S0000005\": \"navy\", \"S0000006\": \"bioenvironmental\", \"S0000007\": \"36-2623\", \"S0000008\": \"geospatial\", \"S0000009\": \"majcom patch, ocp\", \"S0000010\": \"mip\", \"S0000011\": \"telework\", \"S0000012\": \"undersecretary of defense for intelligence and security\", \"S0000013\": \"cyber range\", \"S0000014\": \"ncis\", \"S0000015\": \"downgrade award appeal\", \"S0000016\": \"time in training 3dxxx\", \"S0000017\": \"bcac approval\", \"S0000018\": \"covid 19\", \"S0000019\": \"aeronautical navigation aafif\", \"S0000020\": \"safety and occupational health\", \"S0000021\": \"medical testing for dod civilian workers\", \"S0000022\": \"jcsfl mission partner\", \"S0000023\": \"dodi 1322.29\", \"S0000024\": \"employed in place\", \"S0000025\": \"manpower\", \"S0000026\": \"sexual assault prevention\", \"S0000027\": \"cwmd unity of effort\", \"S0000028\": \"fire extinguisher inspection\", \"S0000029\": \"deployed manpower agency\", \"S0000030\": \"international cooperative administrative support services (icass)\", \"S0000031\": \"suspension of favorable personnel actions (flag) officer retirement\", \"S0000032\": \"artificial intelligence\", \"S0000033\": \"cloud computing\", \"S0000034\": \"wet wing\", \"S0000035\": \"space governance committee\", \"S0000036\": \"s-cat\", \"S0000037\": \"cfetp 3e5x1\", \"S0000038\": \"risk matrix\", \"S0000039\": \"csa\", \"S0000040\": \"36-1-191\", \"S0000041\": \"who is sergeant major of the army\", \"S0000042\": \"use of alcohol and events\", \"S0000043\": \"interoperability\", \"S0000044\": \"twcf\", \"S0000045\": \"control system automation\", \"S0000046\": \"farp\", \"S0000047\": \"dcma\", \"S0000048\": \"military\", \"S0000049\": \"the mexico city policy\", \"S0000050\": \"title 10 section 2222\", \"S0000051\": \"dodd 5200.27\", \"S0000052\": \"dod 4140.26\", \"S0000053\": \"dodi 8510.01 risk management framework (rmf) for dod information technology (it)\", \"S0000054\": \"combatant command and intelligence\", \"S0000055\": \"open source software memo\", \"S0000056\": \"rest and recuperation\", \"S0000057\": \"physical fitness\", \"S0000058\": \"workflow air force\", \"S0000059\": \"climate change design standards\", \"S0000060\": \"active shooter\", \"S0000061\": \"intelligence and security\", \"S0000062\": \"cloud security\", \"S0000063\": \"dodi 5530.03 international agreements\", \"S0000064\": \"senior accountable official\", \"S0000065\": \"usar\", \"S0000066\": \"eo 13603\", \"S0000067\": \"plain language policy\", \"S0000068\": \"security cooperation\", \"S0000069\": \"operational contractor support\"}, \"collection\": {\"R0000000\": \" CJCSM 3500.03E Joint Training Manual for the Armed Forces of the United States\", \"R0000001\": \"EO 13962\", \"R0000002\": \"CFR-2021-title6-vol1\", \"R0000003\": \"TC 3-04.15\", \"R0000004\": \"CJCSM 3265.01A\", \"R0000005\": \"DEP SEC DEF Memo, Space Organization and Management Tasks, 9 10 2018 OCR\", \"R0000006\": \"Framework for Risk Categorization for Use During Independent Technical Risk Assessments\", \"R0000007\": \"AFI 36-2903\", \"R0000008\": \"H.R 21 IH 117th\", \"R0000009\": \"H.R. 5412 RH 117th\", \"R0000010\": \"DoDD 5205.12 CH 2\", \"R0000011\": \"AFI 36-815\", \"R0000012\": \"AFI 10-201\", \"R0000013\": \" DoDD 3000.06 Combat Support Agencies (CSAs)\", \"R0000014\": \"2019\", \"R0000015\": \" SECNAVINST 5100.10K DEPARTMENT OF THE NAVY SAFETY PROGRAM\", \"R0000016\": \"DAFMAN 10-703\", \"R0000017\": \"AR 135-175\", \"R0000018\": \"Defense Acquisition Services\", \"R0000019\": \"DoDI 8510.01 CH 3\", \"R0000020\": \"QTP 24-3-C355\", \"R0000021\": \"DoDI 5000.74\", \"R0000022\": \"AFI 90-201\", \"R0000023\": \" CJCSI 5123.01H Charter of the Joint Requirements Oversight Council (JROC) and the Implementation of the Joint Capabilities Integration and Development System\", \"R0000024\": \"AR 381-10\", \"R0000025\": \"FRM 10, 1 1 2020 OCR\", \"R0000026\": \" Memo 2015 - CNAS - Security Cooperation and Assistance\", \"R0000027\": \" DoDD 5105.77 National Guard Bureau (NGB)\", \"R0000028\": \" AFI 34-219 ALCOHOLIC BEVERAGE PROGRAM\", \"R0000029\": \"DoDM 3305.09 CH 2\", \"R0000030\": \"SECNAVINST 5300.28F\", \"R0000031\": \" DoDD 5143.01 Under Secretary of Defense for Intelligence and Security (USD(I&S))\", \"R0000032\": \"EO 13603\", \"R0000033\": \"AFMAN 48-146\", \"R0000034\": \"DoD Support for the National Security Commission on Artificial Intelligence\", \"R0000035\": \" OPNAVINST 4650.17 UNUSUALLY ARDUOUS SEA DUTY FOR TRAVEL AND TRANSPORTATION ENTITLEMENTS\", \"R0000036\": \" DoDI 1000.01 Identification (ID) Cards Required by the Geneva Conventions\", \"R0000037\": \"DoDI 4515.13 CH 5\", \"R0000038\": \"DoDI 8320.04 CH 3\", \"R0000039\": \" MCO 1700.39 MARINE CORPS RECREATION PROGRAMS\", \"R0000040\": \"AFI 10-403\", \"R0000041\": \" DoDI 5010.40 Managers' Internal Control Program Procedures\", \"R0000042\": \" DoDD 5105.60 National Geospatial-Intelligence Agency (NGA)\", \"R0000043\": \"DoDD 5240.06 CH 3\", \"R0000044\": \"Title 43\", \"R0000045\": \" DoDI 7060.06 International Cooperative Administrative Support Services (ICASS)\", \"R0000046\": \"DoDI 5000.75 CH 2\", \"R0000047\": \" DoDD 5110.04 Washington Headquarters Services (WHS)\", \"R0000048\": \"DoDI 1322.29 CH 1\", \"R0000049\": \"AFI 31-118\", \"R0000050\": \" Memo Joint Officer Handbook - Staffing and Action Guide (2011)\", \"R0000051\": \"AFH 1\", \"R0000052\": \"CFETP 3DXXX\", \"R0000053\": \"CFETP 3E5X1\", \"R0000054\": \"H.R 2003 IH 117th\", \"R0000055\": \" DoDI 1332.45 Retention Determinations for Non-Deployable Service Members\", \"R0000056\": \" MISC PUBS GREECE\", \"R0000057\": \"DEP SEC DEF Memo, (UFOUO) Facilitating Unity of Effort to Advance Countering Weapons of Mass Destruction Objectives, 4 9 2018 OCR\", \"R0000058\": \"AFMAN 11-2MC-130HV3CL-5\", \"R0000059\": \"AR 600-8-22\", \"R0000060\": \"AFMAN 91-203\", \"R0000061\": \"AFMAN 32-1007\", \"R0000062\": \" AFI 11-235 SPECIALIZED REFUELING OPERATIONS\", \"R0000063\": \"DoDM 4140.26 Volume 3\", \"R0000064\": \"DEP SEC DEF Memo, Public-Private Talent Exchange, 7 19 2018 OCR\", \"R0000065\": \"DEP SEC DEF Memo, Total Force Manpower Governance for OSD, Defense Agencies, and DoD Field Activities, 6 6 2017 OCR\", \"R0000066\": \" CIM 11000.7 FACILITIES ENERGY MANUAL\", \"R0000067\": \" SECNAVINST 1752.4C SEXUAL ASSAULT PREVENTION AND RESPONSE PROGRAM PROCEDURES\", \"R0000068\": \" AFI 36-816 Civilian Telework Program\", \"R0000069\": \" CJCSI 3207.01C Department of Defense Support to Humanitarian Mine Action\", \"R0000070\": \"DoDFMR V2BCH13\"}, \"meta_relations\": {\"S0000000\": {\"R0000070\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-10-18T16:32:48.000Z\", \"exact_matches\": [{\"exact_query\": \"restoration design phases\", \"exact_result\": \"DoDFMR V2BCH13\", \"source\": \"user_history\", \"date\": \"2021-10-18T16:32:48.000Z\"}], \"times_matched\": 1}}, \"S0000001\": {\"R0000043\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-10-14T16:04:21.000Z\", \"exact_matches\": [{\"exact_query\": \"5240.06\", \"exact_result\": \"DoDD 5240.06 CH 3\", \"source\": \"user_history\", \"date\": \"2021-10-14T16:04:21.000Z\"}], \"times_matched\": 1}}, \"S0000002\": {\"R0000031\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-08-09T14:02:18.534Z\", \"exact_matches\": [{\"exact_query\": \" \\\"synchronizer\\\" and \\\"intelligence\\\"\", \"exact_result\": \" DoDD 5143.01 Under Secretary of Defense for Intelligence and Security (USD(I&S))\", \"source\": \"matamo\", \"date\": \"2021-08-09T14:02:18.534Z\"}], \"times_matched\": 1}}, \"S0000003\": {\"R0000027\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-04-14T18:18:26.068Z\", \"exact_matches\": [{\"exact_query\": \" \\\"National Guard\\\" and NGB\", \"exact_result\": \" DoDD 5105.77 National Guard Bureau (NGB)\", \"source\": \"matamo\", \"date\": \"2021-04-14T18:18:26.068Z\"}], \"times_matched\": 1}}, \"S0000004\": {\"R0000025\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-04-28T21:49:46.000Z\", \"exact_matches\": [{\"exact_query\": \"jtru\", \"exact_result\": \"FRM 10, 1 1 2020 OCR\", \"source\": \"user_history\", \"date\": \"2021-04-28T21:49:46.000Z\"}], \"times_matched\": 1}}, \"S0000005\": {\"R0000035\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-08-03T19:02:06.140Z\", \"exact_matches\": [{\"exact_query\": \" navy\", \"exact_result\": \" OPNAVINST 4650.17 UNUSUALLY ARDUOUS SEA DUTY FOR TRAVEL AND TRANSPORTATION ENTITLEMENTS\", \"source\": \"matamo\", \"date\": \"2021-08-03T19:02:06.140Z\"}], \"times_matched\": 1}}, \"S0000006\": {\"R0000061\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-11-04T23:17:18.000Z\", \"exact_matches\": [{\"exact_query\": \"bioenvironmental\", \"exact_result\": \"AFMAN 32-1007\", \"source\": \"user_history\", \"date\": \"2021-11-04T23:17:18.000Z\"}], \"times_matched\": 1}}, \"S0000007\": {\"R0000051\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-12-09T18:33:48.000Z\", \"exact_matches\": [{\"exact_query\": \"36-2623\", \"exact_result\": \"AFH 1\", \"source\": \"user_history\", \"date\": \"2021-12-09T18:33:48.000Z\"}], \"times_matched\": 1}}, \"S0000008\": {\"R0000042\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-05-21T16:25:13.292Z\", \"exact_matches\": [{\"exact_query\": \" geospatial\", \"exact_result\": \" DoDD 5105.60 National Geospatial-Intelligence Agency (NGA)\", \"source\": \"matamo\", \"date\": \"2021-05-21T16:25:13.292Z\"}], \"times_matched\": 1}}, \"S0000009\": {\"R0000007\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-07-22T00:56:25.000Z\", \"exact_matches\": [{\"exact_query\": \"majcom patch, ocp\", \"exact_result\": \"AFI 36-2903\", \"source\": \"user_history\", \"date\": \"2021-07-22T00:56:25.000Z\"}], \"times_matched\": 1}}, \"S0000010\": {\"R0000010\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-06-08T18:08:00.000Z\", \"exact_matches\": [{\"exact_query\": \"mip\", \"exact_result\": \"DoDD 5205.12 CH 2\", \"source\": \"user_history\", \"date\": \"2021-06-08T18:08:00.000Z\"}], \"times_matched\": 1}}, \"S0000011\": {\"R0000068\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-08-03T19:00:42.759Z\", \"exact_matches\": [{\"exact_query\": \" telework\", \"exact_result\": \" AFI 36-816 Civilian Telework Program\", \"source\": \"matamo\", \"date\": \"2021-08-03T19:00:42.759Z\"}, {\"exact_query\": \" telework\", \"exact_result\": \" AFI 36-816 Civilian Telework Program\", \"source\": \"matamo\", \"date\": \"2021-09-23T14:41:50.557Z\"}], \"times_matched\": 2}, \"R0000049\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-08-03T19:00:42.759Z\", \"exact_matches\": [{\"exact_query\": \"telework\", \"exact_result\": \"AFI 31-118\", \"source\": \"user_history\", \"date\": \"2021-12-06T21:55:47.000Z\"}, {\"exact_query\": \"telework\", \"exact_result\": \"AFI 31-118\", \"source\": \"user_history\", \"date\": \"2021-12-08T15:16:38.000Z\"}], \"times_matched\": 2}}, \"S0000012\": {\"R0000002\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-11-16T13:21:50.000Z\", \"exact_matches\": [{\"exact_query\": \"undersecretary of defense for intelligence and security\", \"exact_result\": \"CFR-2021-title6-vol1\", \"source\": \"user_history\", \"date\": \"2021-11-16T13:21:50.000Z\"}], \"times_matched\": 1}}, \"S0000013\": {\"R0000000\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-04-05T18:16:55.146Z\", \"exact_matches\": [{\"exact_query\": \" cyber range\", \"exact_result\": \" CJCSM 3500.03E Joint Training Manual for the Armed Forces of the United States\", \"source\": \"matamo\", \"date\": \"2021-04-05T18:16:55.146Z\"}], \"times_matched\": 1}}, \"S0000014\": {\"R0000030\": {\"correct_match\": \"true\", \"last_match_date\": \"2022-01-07T16:29:02.000Z\", \"exact_matches\": [{\"exact_query\": \"ncis\", \"exact_result\": \"SECNAVINST 5300.28F\", \"source\": \"user_history\", \"date\": \"2022-01-07T16:29:02.000Z\"}], \"times_matched\": 1}}, \"S0000015\": {\"R0000059\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-08-04T19:24:44.000Z\", \"exact_matches\": [{\"exact_query\": \"downgrade award appeal\", \"exact_result\": \"AR 600-8-22\", \"source\": \"user_history\", \"date\": \"2021-08-04T19:24:44.000Z\"}], \"times_matched\": 1}}, \"S0000016\": {\"R0000052\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-08-18T13:09:38.000Z\", \"exact_matches\": [{\"exact_query\": \"time in training 3dxxx\", \"exact_result\": \"CFETP 3DXXX\", \"source\": \"user_history\", \"date\": \"2021-08-18T13:09:38.000Z\"}], \"times_matched\": 1}}, \"S0000017\": {\"R0000046\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-07-14T17:56:46.000Z\", \"exact_matches\": [{\"exact_query\": \"bcac approval\", \"exact_result\": \"DoDI 5000.75 CH 2\", \"source\": \"user_history\", \"date\": \"2021-07-14T17:56:46.000Z\"}], \"times_matched\": 1}}, \"S0000018\": {\"R0000001\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-10-27T17:24:38.000Z\", \"exact_matches\": [{\"exact_query\": \"covid 19\", \"exact_result\": \"EO 13962\", \"source\": \"user_history\", \"date\": \"2021-10-27T17:24:38.000Z\"}], \"times_matched\": 1}}, \"S0000019\": {\"R0000003\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-08-09T16:43:02.000Z\", \"exact_matches\": [{\"exact_query\": \"aeronautical navigation aafif\", \"exact_result\": \"TC 3-04.15\", \"source\": \"user_history\", \"date\": \"2021-08-09T16:43:02.000Z\"}], \"times_matched\": 1}}, \"S0000020\": {\"R0000015\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-05-11T19:38:12.825Z\", \"exact_matches\": [{\"exact_query\": \" safety and occupational health\", \"exact_result\": \" SECNAVINST 5100.10K DEPARTMENT OF THE NAVY SAFETY PROGRAM\", \"source\": \"matamo\", \"date\": \"2021-05-11T19:38:12.825Z\"}], \"times_matched\": 1}}, \"S0000021\": {\"R0000033\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-10-28T17:58:15.000Z\", \"exact_matches\": [{\"exact_query\": \"medical testing for dod civilian workers\", \"exact_result\": \"AFMAN 48-146\", \"source\": \"user_history\", \"date\": \"2021-10-28T17:58:15.000Z\"}], \"times_matched\": 1}}, \"S0000022\": {\"R0000004\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-05-28T10:52:59.000Z\", \"exact_matches\": [{\"exact_query\": \"jcsfl mission partner\", \"exact_result\": \"CJCSM 3265.01A\", \"source\": \"user_history\", \"date\": \"2021-05-28T10:52:59.000Z\"}], \"times_matched\": 1}}, \"S0000023\": {\"R0000048\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-08-11T13:45:54.000Z\", \"exact_matches\": [{\"exact_query\": \"dodi 1322.29\", \"exact_result\": \"DoDI 1322.29 CH 1\", \"source\": \"user_history\", \"date\": \"2021-08-11T13:45:54.000Z\"}], \"times_matched\": 1}}, \"S0000024\": {\"R0000012\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-12-15T15:35:45.000Z\", \"exact_matches\": [{\"exact_query\": \"'employed in place'\", \"exact_result\": \"AFI 10-201\", \"source\": \"user_history\", \"date\": \"2021-12-15T15:35:45.000Z\"}], \"times_matched\": 1}}, \"S0000025\": {\"R0000065\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-05-12T16:33:03.000Z\", \"exact_matches\": [{\"exact_query\": \"manpower\", \"exact_result\": \"DEP SEC DEF Memo, Total Force Manpower Governance for OSD, Defense Agencies, and DoD Field Activities, 6 6 2017 OCR\", \"source\": \"user_history\", \"date\": \"2021-05-12T16:33:03.000Z\"}], \"times_matched\": 1}}, \"S0000026\": {\"R0000067\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-04-21T13:13:18.303Z\", \"exact_matches\": [{\"exact_query\": \" sexual assault prevention\", \"exact_result\": \" SECNAVINST 1752.4C SEXUAL ASSAULT PREVENTION AND RESPONSE PROGRAM PROCEDURES\", \"source\": \"matamo\", \"date\": \"2021-04-21T13:13:18.303Z\"}], \"times_matched\": 1}}, \"S0000027\": {\"R0000057\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-06-16T15:46:57.000Z\", \"exact_matches\": [{\"exact_query\": \"cwmd unity of effort\", \"exact_result\": \"DEP SEC DEF Memo, (UFOUO) Facilitating Unity of Effort to Advance Countering Weapons of Mass Destruction Objectives, 4 9 2018 OCR\", \"source\": \"user_history\", \"date\": \"2021-06-16T15:46:57.000Z\"}], \"times_matched\": 1}}, \"S0000028\": {\"R0000060\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-08-26T20:51:45.000Z\", \"exact_matches\": [{\"exact_query\": \"fire extinguisher inspection\", \"exact_result\": \"AFMAN 91-203\", \"source\": \"user_history\", \"date\": \"2021-08-26T20:51:45.000Z\"}], \"times_matched\": 1}}, \"S0000029\": {\"R0000040\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-09-21T14:43:02.000Z\", \"exact_matches\": [{\"exact_query\": \"deployed manpower agency\", \"exact_result\": \"AFI 10-403\", \"source\": \"user_history\", \"date\": \"2021-09-21T14:43:02.000Z\"}], \"times_matched\": 1}}, \"S0000030\": {\"R0000045\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-08-03T12:50:35.464Z\", \"exact_matches\": [{\"exact_query\": \" International Cooperative Administrative Support Services (ICASS)\", \"exact_result\": \" DoDI 7060.06 International Cooperative Administrative Support Services (ICASS)\", \"source\": \"matamo\", \"date\": \"2021-08-03T12:50:35.464Z\"}], \"times_matched\": 1}}, \"S0000031\": {\"R0000017\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-06-10T12:39:14.000Z\", \"exact_matches\": [{\"exact_query\": \"suspension of favorable personnel actions (flag) officer retirement\", \"exact_result\": \"AR 135-175\", \"source\": \"user_history\", \"date\": \"2021-06-10T12:39:14.000Z\"}], \"times_matched\": 1}}, \"S0000032\": {\"R0000034\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-06-24T19:08:35.000Z\", \"exact_matches\": [{\"exact_query\": \"'artificial intelligence'\", \"exact_result\": \"DoD Support for the National Security Commission on Artificial Intelligence\", \"source\": \"user_history\", \"date\": \"2021-06-24T19:08:35.000Z\"}], \"times_matched\": 1}}, \"S0000033\": {\"R0000021\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-11-22T22:20:21.000Z\", \"exact_matches\": [{\"exact_query\": \"cloud computing\", \"exact_result\": \"DoDI 5000.74\", \"source\": \"user_history\", \"date\": \"2021-11-22T22:20:21.000Z\"}], \"times_matched\": 1}}, \"S0000034\": {\"R0000062\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-04-22T12:30:23.340Z\", \"exact_matches\": [{\"exact_query\": \" wet wing\", \"exact_result\": \" AFI 11-235 SPECIALIZED REFUELING OPERATIONS\", \"source\": \"matamo\", \"date\": \"2021-04-22T12:30:23.340Z\"}], \"times_matched\": 1}}, \"S0000035\": {\"R0000005\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-11-17T16:10:49.000Z\", \"exact_matches\": [{\"exact_query\": \"'space governance committee'\", \"exact_result\": \"DEP SEC DEF Memo, Space Organization and Management Tasks, 9 10 2018 OCR\", \"source\": \"user_history\", \"date\": \"2021-11-17T16:10:49.000Z\"}], \"times_matched\": 1}}, \"S0000036\": {\"R0000018\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-09-28T19:31:50.000Z\", \"exact_matches\": [{\"exact_query\": \"'s-cat'\", \"exact_result\": \"Defense Acquisition Services\", \"source\": \"user_history\", \"date\": \"2021-09-28T19:31:50.000Z\"}], \"times_matched\": 1}}, \"S0000037\": {\"R0000053\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-07-09T12:46:15.000Z\", \"exact_matches\": [{\"exact_query\": \"cfetp 3e5x1\", \"exact_result\": \"CFETP 3E5X1\", \"source\": \"user_history\", \"date\": \"2021-07-09T12:46:15.000Z\"}], \"times_matched\": 1}}, \"S0000038\": {\"R0000006\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-05-04T22:24:40.000Z\", \"exact_matches\": [{\"exact_query\": \"risk matrix\", \"exact_result\": \"Framework for Risk Categorization for Use During Independent Technical Risk Assessments\", \"source\": \"user_history\", \"date\": \"2021-05-04T22:24:40.000Z\"}], \"times_matched\": 1}}, \"S0000039\": {\"R0000013\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-04-15T11:42:48.684Z\", \"exact_matches\": [{\"exact_query\": \" CSA\", \"exact_result\": \" DoDD 3000.06 Combat Support Agencies (CSAs)\", \"source\": \"matamo\", \"date\": \"2021-04-15T11:42:48.684Z\"}], \"times_matched\": 1}}, \"S0000040\": {\"R0000020\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-08-04T19:28:07.000Z\", \"exact_matches\": [{\"exact_query\": \"36-1-191\", \"exact_result\": \"QTP 24-3-C355\", \"source\": \"user_history\", \"date\": \"2021-08-04T19:28:07.000Z\"}], \"times_matched\": 1}}, \"S0000041\": {\"R0000036\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-06-24T19:45:57.054Z\", \"exact_matches\": [{\"exact_query\": \" who is sergeant major of the army\", \"exact_result\": \" DoDI 1000.01 Identification (ID) Cards Required by the Geneva Conventions\", \"source\": \"matamo\", \"date\": \"2021-06-24T19:45:57.054Z\"}], \"times_matched\": 1}}, \"S0000042\": {\"R0000028\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-07-21T13:48:49.176Z\", \"exact_matches\": [{\"exact_query\": \" \\\"use of alcohol\\\" and events\", \"exact_result\": \" AFI 34-219 ALCOHOLIC BEVERAGE PROGRAM\", \"source\": \"matamo\", \"date\": \"2021-07-21T13:48:49.176Z\"}], \"times_matched\": 1}}, \"S0000043\": {\"R0000023\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-05-18T15:35:06.388Z\", \"exact_matches\": [{\"exact_query\": \" interoperability\", \"exact_result\": \" CJCSI 5123.01H Charter of the Joint Requirements Oversight Council (JROC) and the Implementation of the Joint Capabilities Integration and Development System\", \"source\": \"matamo\", \"date\": \"2021-05-18T15:35:06.388Z\"}], \"times_matched\": 1}}, \"S0000044\": {\"R0000037\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-07-28T17:13:12.000Z\", \"exact_matches\": [{\"exact_query\": \"twcf\", \"exact_result\": \"DoDI 4515.13 CH 5\", \"source\": \"user_history\", \"date\": \"2021-07-28T17:13:12.000Z\"}], \"times_matched\": 1}}, \"S0000045\": {\"R0000066\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-08-25T18:08:27.541Z\", \"exact_matches\": [{\"exact_query\": \" control system automation\", \"exact_result\": \" CIM 11000.7 FACILITIES ENERGY MANUAL\", \"source\": \"matamo\", \"date\": \"2021-08-25T18:08:27.541Z\"}], \"times_matched\": 1}}, \"S0000046\": {\"R0000058\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-08-05T01:42:28.000Z\", \"exact_matches\": [{\"exact_query\": \"farp\", \"exact_result\": \"AFMAN 11-2MC-130HV3CL-5\", \"source\": \"user_history\", \"date\": \"2021-08-05T01:42:28.000Z\"}], \"times_matched\": 1}}, \"S0000047\": {\"R0000050\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-08-31T17:14:00.347Z\", \"exact_matches\": [{\"exact_query\": \" dcma\", \"exact_result\": \" Memo Joint Officer Handbook - Staffing and Action Guide (2011)\", \"source\": \"matamo\", \"date\": \"2021-08-31T17:14:00.347Z\"}], \"times_matched\": 1}}, \"S0000048\": {\"R0000055\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-09-08T23:54:40.935Z\", \"exact_matches\": [{\"exact_query\": \" military\", \"exact_result\": \" DoDI 1332.45 Retention Determinations for Non-Deployable Service Members\", \"source\": \"matamo\", \"date\": \"2021-09-08T23:54:40.935Z\"}], \"times_matched\": 1}}, \"S0000049\": {\"R0000054\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-06-28T19:49:49.000Z\", \"exact_matches\": [{\"exact_query\": \"the mexico city policy\", \"exact_result\": \"H.R 2003 IH 117th\", \"source\": \"user_history\", \"date\": \"2021-06-28T19:49:49.000Z\"}], \"times_matched\": 1}}, \"S0000050\": {\"R0000044\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-10-27T17:37:29.000Z\", \"exact_matches\": [{\"exact_query\": \"title 10 section 2222\", \"exact_result\": \"Title 43\", \"source\": \"user_history\", \"date\": \"2021-10-27T17:37:29.000Z\"}], \"times_matched\": 1}}, \"S0000051\": {\"R0000024\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-07-07T20:45:29.000Z\", \"exact_matches\": [{\"exact_query\": \"'dodd 5200.27'\", \"exact_result\": \"AR 381-10\", \"source\": \"user_history\", \"date\": \"2021-07-07T20:45:29.000Z\"}], \"times_matched\": 1}}, \"S0000052\": {\"R0000063\": {\"correct_match\": \"true\", \"last_match_date\": \"2022-01-25T12:48:28.000Z\", \"exact_matches\": [{\"exact_query\": \"dod 4140.26\", \"exact_result\": \"DoDM 4140.26 Volume 3\", \"source\": \"user_history\", \"date\": \"2022-01-25T12:48:28.000Z\"}], \"times_matched\": 1}}, \"S0000053\": {\"R0000019\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-08-16T13:29:24.000Z\", \"exact_matches\": [{\"exact_query\": \"dodi 8510.01 risk management framework (rmf) for dod information technology (it)\", \"exact_result\": \"DoDI 8510.01 CH 3\", \"source\": \"user_history\", \"date\": \"2021-08-16T13:29:24.000Z\"}], \"times_matched\": 1}}, \"S0000054\": {\"R0000029\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-09-21T19:38:58.000Z\", \"exact_matches\": [{\"exact_query\": \"'combatant command ' and 'intelligence'\", \"exact_result\": \"DoDM 3305.09 CH 2\", \"source\": \"user_history\", \"date\": \"2021-09-21T19:38:58.000Z\"}], \"times_matched\": 1}}, \"S0000055\": {\"R0000064\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-11-30T14:50:43.000Z\", \"exact_matches\": [{\"exact_query\": \"open source software memo\", \"exact_result\": \"DEP SEC DEF Memo, Public-Private Talent Exchange, 7 19 2018 OCR\", \"source\": \"user_history\", \"date\": \"2021-11-30T14:50:43.000Z\"}], \"times_matched\": 1}}, \"S0000056\": {\"R0000011\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-12-08T22:40:02.000Z\", \"exact_matches\": [{\"exact_query\": \"rest and recuperation\", \"exact_result\": \"AFI 36-815\", \"source\": \"user_history\", \"date\": \"2021-12-08T22:40:02.000Z\"}], \"times_matched\": 1}}, \"S0000057\": {\"R0000039\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-09-23T16:11:31.850Z\", \"exact_matches\": [{\"exact_query\": \" physical fitness\", \"exact_result\": \" MCO 1700.39 MARINE CORPS RECREATION PROGRAMS\", \"source\": \"matamo\", \"date\": \"2021-09-23T16:11:31.850Z\"}], \"times_matched\": 1}}, \"S0000058\": {\"R0000022\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-06-08T13:55:23.000Z\", \"exact_matches\": [{\"exact_query\": \"'workflow' 'air force'\", \"exact_result\": \"AFI 90-201\", \"source\": \"user_history\", \"date\": \"2021-06-08T13:55:23.000Z\"}], \"times_matched\": 1}}, \"S0000059\": {\"R0000014\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-05-26T14:07:23.000Z\", \"exact_matches\": [{\"exact_query\": \"climate change design standards\", \"exact_result\": \"2019\", \"source\": \"user_history\", \"date\": \"2021-05-26T14:07:23.000Z\"}], \"times_matched\": 1}}, \"S0000060\": {\"R0000049\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-10-28T19:02:17.000Z\", \"exact_matches\": [{\"exact_query\": \"'active shooter'\", \"exact_result\": \"AFI 31-118\", \"source\": \"user_history\", \"date\": \"2021-10-28T19:02:17.000Z\"}], \"times_matched\": 1}}, \"S0000061\": {\"R0000009\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-12-09T18:01:58.000Z\", \"exact_matches\": [{\"exact_query\": \"'intelligence and security'\", \"exact_result\": \"H.R. 5412 RH 117th\", \"source\": \"user_history\", \"date\": \"2021-12-09T18:01:58.000Z\"}], \"times_matched\": 1}}, \"S0000062\": {\"R0000008\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-09-09T18:56:29.000Z\", \"exact_matches\": [{\"exact_query\": \"'cloud security'\", \"exact_result\": \"H.R 21 IH 117th\", \"source\": \"user_history\", \"date\": \"2021-09-09T18:56:29.000Z\"}], \"times_matched\": 1}}, \"S0000063\": {\"R0000016\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-10-18T16:53:57.000Z\", \"exact_matches\": [{\"exact_query\": \"dodi 5530.03 international agreements\", \"exact_result\": \"DAFMAN 10-703\", \"source\": \"user_history\", \"date\": \"2021-10-18T16:53:57.000Z\"}], \"times_matched\": 1}}, \"S0000064\": {\"R0000041\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-08-27T16:39:26.347Z\", \"exact_matches\": [{\"exact_query\": \" senior accountable official\", \"exact_result\": \" DoDI 5010.40 Managers' Internal Control Program Procedures\", \"source\": \"matamo\", \"date\": \"2021-08-27T16:39:26.347Z\"}], \"times_matched\": 1}}, \"S0000065\": {\"R0000069\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-03-30T13:13:04.319Z\", \"exact_matches\": [{\"exact_query\": \" USAR\", \"exact_result\": \" CJCSI 3207.01C Department of Defense Support to Humanitarian Mine Action\", \"source\": \"matamo\", \"date\": \"2021-03-30T13:13:04.319Z\"}], \"times_matched\": 1}}, \"S0000066\": {\"R0000032\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-07-27T17:42:52.000Z\", \"exact_matches\": [{\"exact_query\": \"eo 13603\", \"exact_result\": \"EO 13603\", \"source\": \"user_history\", \"date\": \"2021-07-27T17:42:52.000Z\"}], \"times_matched\": 1}}, \"S0000067\": {\"R0000047\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-04-05T18:19:15.673Z\", \"exact_matches\": [{\"exact_query\": \" plain language policy\", \"exact_result\": \" DoDD 5110.04 Washington Headquarters Services (WHS)\", \"source\": \"matamo\", \"date\": \"2021-04-05T18:19:15.673Z\"}], \"times_matched\": 1}}, \"S0000068\": {\"R0000026\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-08-13T00:56:55.410Z\", \"exact_matches\": [{\"exact_query\": \" security cooperation\", \"exact_result\": \" Memo 2015 - CNAS - Security Cooperation and Assistance\", \"source\": \"matamo\", \"date\": \"2021-08-13T00:56:55.410Z\"}], \"times_matched\": 1}}, \"S0000069\": {\"R0000038\": {\"correct_match\": \"true\", \"last_match_date\": \"2021-11-18T20:21:19.000Z\", \"exact_matches\": [{\"exact_query\": \"operational contractor support\", \"exact_result\": \"DoDI 8320.04 CH 3\", \"source\": \"user_history\", \"date\": \"2021-11-18T20:21:19.000Z\"}], \"times_matched\": 1}}}, \"correct\": {\"S0000002\": [\"R0000031\"], \"S0000003\": [\"R0000027\"], \"S0000005\": [\"R0000035\"], \"S0000008\": [\"R0000042\"], \"S0000011\": [\"R0000068\", \"R0000049\"], \"S0000013\": [\"R0000000\"], \"S0000020\": [\"R0000015\"], \"S0000026\": [\"R0000067\"], \"S0000030\": [\"R0000045\"], \"S0000034\": [\"R0000062\"], \"S0000039\": [\"R0000013\"], \"S0000041\": [\"R0000036\"], \"S0000042\": [\"R0000028\"], \"S0000043\": [\"R0000023\"], \"S0000045\": [\"R0000066\"], \"S0000047\": [\"R0000050\"], \"S0000048\": [\"R0000055\"], \"S0000057\": [\"R0000039\"], \"S0000064\": [\"R0000041\"], \"S0000065\": [\"R0000069\"], \"S0000067\": [\"R0000047\"], \"S0000068\": [\"R0000026\"]}, \"incorrect\": {}, \"correct_vals\": {\"synchronizer and intelligence\": [\" DoDD 5143.01 Under Secretary of Defense for Intelligence and Security (USD(I&S))\"], \"national guard and ngb\": [\" DoDD 5105.77 National Guard Bureau (NGB)\"], \"navy\": [\" OPNAVINST 4650.17 UNUSUALLY ARDUOUS SEA DUTY FOR TRAVEL AND TRANSPORTATION ENTITLEMENTS\"], \"geospatial\": [\" DoDD 5105.60 National Geospatial-Intelligence Agency (NGA)\"], \"telework\": [\" AFI 36-816 Civilian Telework Program\", \"AFI 31-118\"], \"cyber range\": [\" CJCSM 3500.03E Joint Training Manual for the Armed Forces of the United States\"], \"safety and occupational health\": [\" SECNAVINST 5100.10K DEPARTMENT OF THE NAVY SAFETY PROGRAM\"], \"sexual assault prevention\": [\" SECNAVINST 1752.4C SEXUAL ASSAULT PREVENTION AND RESPONSE PROGRAM PROCEDURES\"], \"international cooperative administrative support services (icass)\": [\" DoDI 7060.06 International Cooperative Administrative Support Services (ICASS)\"], \"wet wing\": [\" AFI 11-235 SPECIALIZED REFUELING OPERATIONS\"], \"csa\": [\" DoDD 3000.06 Combat Support Agencies (CSAs)\"], \"who is sergeant major of the army\": [\" DoDI 1000.01 Identification (ID) Cards Required by the Geneva Conventions\"], \"use of alcohol and events\": [\" AFI 34-219 ALCOHOLIC BEVERAGE PROGRAM\"], \"interoperability\": [\" CJCSI 5123.01H Charter of the Joint Requirements Oversight Council (JROC) and the Implementation of the Joint Capabilities Integration and Development System\"], \"control system automation\": [\" CIM 11000.7 FACILITIES ENERGY MANUAL\"], \"dcma\": [\" Memo Joint Officer Handbook - Staffing and Action Guide (2011)\"], \"military\": [\" DoDI 1332.45 Retention Determinations for Non-Deployable Service Members\"], \"physical fitness\": [\" MCO 1700.39 MARINE CORPS RECREATION PROGRAMS\"], \"senior accountable official\": [\" DoDI 5010.40 Managers' Internal Control Program Procedures\"], \"usar\": [\" CJCSI 3207.01C Department of Defense Support to Humanitarian Mine Action\"], \"plain language policy\": [\" DoDD 5110.04 Washington Headquarters Services (WHS)\"], \"security cooperation\": [\" Memo 2015 - CNAS - Security Cooperation and Assistance\"]}, \"incorrect_vals\": {}}" \ No newline at end of file diff --git a/gamechangerml/data/test_data/test_validation/silver/intelligent_search_metadata.json b/gamechangerml/data/test_data/test_validation/silver/intelligent_search_metadata.json new file mode 100644 index 00000000..f2419933 --- /dev/null +++ b/gamechangerml/data/test_data/test_validation/silver/intelligent_search_metadata.json @@ -0,0 +1 @@ +{"date_created": "2022-03-09", "level": "silver", "number_queries": 70, "number_documents": 71, "number_correct": 22, "number_incorrect": 0, "start_date": "2020-12-01", "end_date": "2025-12-01", "exclude_searches": ["pizza", "shark"], "min_correct_matches": 2, "max_results": 10, "filter_queries": "False"} \ No newline at end of file diff --git a/gamechangerml/scripts/make_test_corpus.py b/gamechangerml/scripts/make_test_corpus.py deleted file mode 100644 index ba4b73e9..00000000 --- a/gamechangerml/scripts/make_test_corpus.py +++ /dev/null @@ -1,57 +0,0 @@ -from os import listdir -from os.path import isfile, join -import random -import argparse -from gamechangerml.src.utilities.test_utils import (check_file_size, check_directory, open_json, save_json) -from gamechangerml.api.utils.logger import logger - -def main(test_size, corpus_directory, save_directory, include_ids=None, max_file_size=100000): - '''Makes a small test corpus for checking validation''' - all_files = [f.split('.json')[0] + '.json' for f in listdir(corpus_directory) if isfile(join(corpus_directory, f))] - if include_ids: - - print(include_ids) - include_ids = [f.split('.json')[0] + '.json' for f in include_ids] - subset = list(set(all_files).intersection(include_ids)) - other = [i for i in all_files if i not in include_ids] - else: - subset = [] - other = all_files - for i in range(int(test_size) - len(subset)): - print(i) - filesize = 1000000 - while filesize > max_file_size: # filter out large files - random_index = random.randint(0,len(other)-1) - file = other[random_index] - filesize = check_file_size(file, corpus_directory) - subset.append(file) - subset = list(set(subset)) # remove duplicates - - save_directory = check_directory(save_directory) - for x in subset: - f = open_json(x, corpus_directory) - save_json(x, save_directory, f) - - return - -if __name__ == "__main__": - - parser = argparse.ArgumentParser(description="Profile Corpus") - - parser.add_argument("--test-size", "-ts", dest="test_size", required=True, help="size of test corpus to generate") - parser.add_argument("--corpus-directory", "-c", dest="corpus_directory", required=True, help="path to full corpus") - parser.add_argument("--save-directory", "-s", dest="save_directory", required=True, help="path for saving test corpus") - parser.add_argument("--include-ids", "-i", dest="include_ids", nargs="+", required=False, help="list of docids/filenames to include in the corpus") - parser.add_argument("--max-file-size", "-m", dest="max_file_size", required=False, help="max size (in bytes) of file to save to test_corpus") - - args = parser.parse_args() - if not args.include_ids: - args.include_ids = None - if not args.max_file_size: - args.max_file_size = 100000 - - logger.info("Creating a new test corpus of {} docs; saving to {}.".format(args.test_size, args.save_directory)) - - main(test_size=args.test_size, corpus_directory=args.corpus_directory, save_directory=args.save_directory, include_ids=args.include_ids, max_file_size=args.max_file_size) - - logger.info("Finished making test corpus.") \ No newline at end of file diff --git a/gamechangerml/scripts/make_training_data.py b/gamechangerml/scripts/make_training_data.py index 52ab6baa..e0b84b4a 100644 --- a/gamechangerml/scripts/make_training_data.py +++ b/gamechangerml/scripts/make_training_data.py @@ -4,8 +4,6 @@ import json from datetime import date from typing import List, Union, Dict, Tuple -import spacy - from gamechangerml.configs.config import ( TrainingConfig, @@ -13,14 +11,15 @@ SimilarityConfig, ) from gamechangerml.src.search.sent_transformer.model import SentenceSearcher +from gamechangerml.src.model_testing.query_es import * from gamechangerml.src.utilities.text_utils import normalize_query from gamechangerml.src.utilities.test_utils import * from gamechangerml.api.utils.logger import logger from gamechangerml.api.utils.pathselect import get_model_paths from gamechangerml.scripts.update_eval_data import make_tiered_eval_data -from gamechangerml.src.text_handling.corpus import LocalCorpus from gensim.utils import simple_preprocess from gamechangerml import DATA_PATH, CORPUS_PATH +from gamechangerml.src.utilities import gc_web_api, es_utils model_path_dict = get_model_paths() random.seed(42) @@ -34,170 +33,186 @@ ValidationConfig.DATA_ARGS["retriever_gc"]["gold_standard"], ) -CORPUS_DIR = CORPUS_PATH corpus_docs = [] try: corpus_docs = [ i.split(".json")[0] - for i in os.listdir(CORPUS_DIR) - if os.path.isfile(os.path.join(CORPUS_DIR, i)) + for i in os.listdir(CORPUS_PATH) + if os.path.isfile(os.path.join(CORPUS_PATH, i)) ] except Exception as e: logger.error(e) -def get_sample_paragraphs(pars, par_limit=100, min_length=150): - """Collect sample paragraphs longer than min_length (char), up to par_limit paragraphs""" +scores = { + "strong_match": 0.95, + "weak_match": 0.75, + "neutral": 0.5, + "negative": -0.95 +} - count = 0 - collected_pars = [] - for i in pars: - if count < par_limit: - if len(i["par_raw_text_t"]) >= min_length: - count += 1 - collected_pars.append( - {"text": i["par_raw_text_t"], "id": i["id"]}) - else: - break +gcClient = gc_web_api.GCWebClient() +esu = es_utils.ESUtils() - return collected_pars +def clean_id(id_1: str) -> str: + """Normalizes doc ids to compare""" + return id_1.split('.pdf')[0].upper().strip().lstrip() +def get_matching_es_result(query, doc): -def get_best_paragraphs( - query: str, doc_id: str, nlp, n_returns, min_score: float = 0.60 -) -> List[Dict[str, str]]: - """Retrieves the best paragraphs for expected doc using similarity model - Args: - data [pd.DataFrame]: data df with processed text at paragraph_id level for sent_index - query [str]: query - doc_id [str]: doc_id of the expected document to show up with the query - nlp: spacy nlp model for similarity reranking - Returns: - [List[Dict[str,str]]]: List of dictionaries of paragraph matches - """ - logger.info( - f"Retrieving matches for query: {query}, expected doc: {doc_id}") - pars = [] - doc1 = nlp(query) - if doc_id not in corpus_docs: - logger.warning(f"---Did not find {doc_id} in the corpus") - - json = open_json(doc_id + ".json", CORPUS_DIR) - paragraphs = json["paragraphs"] - sents = get_sample_paragraphs(paragraphs)[:n_returns] # get top n_returns - for sent in sents: - processed = " ".join(simple_preprocess( - sent["text"], min_len=2, max_len=100)) - pars.append({"id": sent["id"], "text": processed}) - - ranked = [] try: - if len(pars) == 0: - logger.info("---No paragraphs retrieved for this expected doc") - elif len(pars) == 1: - ranked = [{"score": "na", "id": pars[0] - ["id"], "text": pars[0]["text"]}] - else: - comparisons = [] - for par in pars: - doc2 = nlp(par["text"]) - sim = doc1.similarity(doc2) - if sim >= min_score: - record = {"score": sim, - "id": par["id"], "text": par["text"]} - comparisons.append(record) - else: - pass - ranked = sorted( - comparisons, key=lambda z: z["score"], reverse=True) - logger.info( - f"*** Collected {str(len(ranked))} / {str(len(pars))} paragraphs (passing sim threshold) retrieved for {doc_id}" - ) + docid = doc + ".pdf_0" + search_query = make_query_one_doc(query, docid) + r = esu.client.search(index=esu.es_index, body=dict(search_query)) + return r except Exception as e: - logger.info(f"---Could not re-rank the paragraphs for {query}") + logger.warning("Failed to get ES results") logger.warning(e) - # if no paragraphs are returned, get the title - if len(ranked) == 0: - clean_title = " ".join(simple_preprocess( - json["title"], min_len=2, max_len=100)) - ranked.append({"score": 1, "id": doc_id + - ".pdf_0", "text": clean_title}) +def get_any_es_result(query): - return ranked + try: + search_query = make_query(query) + r = esu.client.search(index=esu.es_index, body=dict(search_query)) + return r + except Exception as e: + logger.warning("Failed to get ES results") + logger.warning(e) +def get_paragraph_results(resp): + """Get list of paragraph texts for each search result""" + + texts = [] + if resp["hits"]["total"]["value"] > 0: + docs = resp["hits"]["hits"] + for doc in docs: + doc_id = "_".join(doc["fields"]["id"][0].split("_")[:-1]) + hits = doc["inner_hits"]["paragraphs"]["hits"]["hits"] + for par in hits: + par_id = doc_id + "_" + str(par["_nested"]["offset"]) + par_text = par["fields"]["paragraphs.par_raw_text_t"][0] + processed = ' '.join(simple_preprocess(par_text, min_len=2, max_len=100)) + texts.append({"par_id": par_id, "par_text": processed}) + + return texts + +def format_matching_paragraphs(query, doc, uid, score): + """Retrieve & format matching positive/negative paragraphs from ES""" + found = {} + not_found = {} + try: + matches = get_matching_es_result(query, doc) + results = get_paragraph_results(matches) + for r in results: + offset = r['par_id'].split('_')[-1] + uid = uid + "_" + offset + found[uid] = { + "query": query, + "doc": r['par_id'], + "paragraph": r['par_text'], + "label": score + } -def check_no_match(id_1: str, id_2: str) -> bool: - """Checks if paragraph ID matches the expected doc ID""" - if ( - id_1.split(".pdf")[0].upper().strip().lstrip() - == id_2.split(".pdf")[0].upper().strip().lstrip() - ): - return False - else: - return True - - -def get_negative_paragraphs( - data: pd.DataFrame, - query: str, - doc_id: str, - retriever, - n_returns: int, - any_matches: Dict[str, str], -) -> List[Dict[str, str]]: - """Looks up negative (not matching) paragraphs for each query - Args: - data [pd.DataFrame]: data df with processed text at paragraph_id level for sent_index - query [str]: query - doc_id [str]: doc_id of the expected document to show up with the query - retriever: SentenceSearcher class - n_returns [int]: number of negative samples to retrieve for each query - label [int]: label to assign paragraphs (1=correct, 0=neutral, -1=confirmed nonmatch) - Returns: - [List[Dict[str,str]]]: list of dictionaries of negative sample paragraphs - """ + except Exception as e: + logger.error(f"Could not get results for {query} / {doc}") + logger.error(e) + not_found[uid] = {"query": query, "doc": doc, "label": score} + + return found, not_found - checked_results = [] - try: - single_matching_docs = [ - i for i in any_matches[query] if check_no_match(i, doc_id) - ] - except: - single_matching_docs = [] +def format_nonmatching_paragraphs(query, matching_docs, single_matching_docs, par_count): + found = {} try: - results = retriever.retrieve_topn(query, n_returns) - logger.info( - f"Retrieved {str(len(results))} negative samples for query: {query} / doc: {doc_id}" - ) - for result in results: - par = data[data["paragraph_id"] == result["id"]].iloc[0]["text"] - par = " ".join(par.split(" ")[:400]) - if check_no_match(doc_id, result["id"]): - for s in single_matching_docs: - if s and check_no_match(s, result["id"]): - checked_results.append( - { - "query": query, - "doc": result["id"], - "paragraph": par, - "label": 0, - } - ) + non_matches = get_any_es_result(query) + results = get_paragraph_results(non_matches) + previous_text = '' + logger.info(f"Positive num: {par_count}") + for r in results: + if len(found) >= (par_count * 4): # stop getting negatives after 1:4 ratio + logger.info("Exceeded balance, stop retrieving paragraphs") + break + else: + doc_id = r['par_id'].split('.pdf_')[0] + if doc_id in matching_docs: + logger.info("Paragraph comes from a matching doc, skipping") + else: + if doc_id in single_matching_docs: + label = scores["weak_match"] else: - checked_results.append( - { - "query": query, - "doc": result["id"], - "paragraph": par, - "label": 0.5, - } - ) + label = scores["neutral"] + if r['par_text'] != previous_text: # skip duplicate paragraphs + uid = query + "_|_" + r['par_id'] + resultdict = { + "query": query, + "doc": r['par_id'], + "paragraph": r['par_text'], + "label": label, + } + found[uid] = resultdict + previous_text = r['par_text'] except Exception as e: - logger.warning("Could not get negative paragraphs") - logger.warning(e, exc_info=True) + logger.warning(f"Could not get non-matching results from ES for {query}") + + return found - return checked_results +def get_any_matches(any_matches, matching_docs, query): + """Collect docs that were clicked on at all for this query (so we can adjust their score)""" + try: + single_matching_docs = [clean_id(i) for i in any_matches[query] if clean_id(i) not in matching_docs] + logger.info(f"Found {str(len(single_matching_docs))} other docs opened for this query.") + return single_matching_docs + except: + return [] + +def collect_paragraphs_es(correct, incorrect, queries, collection, any_matches): + """Query ES for search/doc matches and negative samples and add them to query results with a label""" + + all_found = {} + all_not_found = {} + fullcount = 0 + total = len(correct.keys()) + for i in correct.keys(): + found = {} + notfound = {} + logger.info(f"{str(fullcount)} / {str(total)}") + fullcount += 1 + query = queries[i] + matching_docs = [] + par_count = 0 + for k in correct[i]: # for each possible match, collect positive samples + doc = collection[k] # get the docid + uid = query + "_|_" + doc + matching_docs.append(doc) + logger.info(f" *** Querying ES: {query} / {doc} (POS)***") + p_found, p_not_found = format_matching_paragraphs(query, doc, uid, score=scores['strong_match']) + found.update(p_found) + notfound.update(p_not_found) + par_count += len(p_found) + + # check for negative samples + if i in list(incorrect.keys()): + for n in incorrect[i]: + doc = collection[n] # get the docid + uid = query + "_|_" + doc + matching_docs.append(doc) + logger.info(f" *** Querying ES: {query} / {doc} (NEG)***") + n_found, n_not_found = format_matching_paragraphs(query, doc, uid, score=scores['negative']) + found.update(n_found) + notfound.update(n_not_found) + + if par_count > 0: + single_matching_docs = get_any_matches(any_matches, matching_docs, query) + neutral_found = format_nonmatching_paragraphs(query, matching_docs, single_matching_docs, par_count) + if len(neutral_found) > 0: + found.update(neutral_found) + all_found.update(found) + all_not_found.update(notfound) + else: + logger.info(f"\n**** No non-matching results retrieved for {query}") + else: + logger.info(f"\n**** No matching results retrieved for {query}") + + return all_found, all_not_found def add_gold_standard( @@ -210,10 +225,35 @@ def add_gold_standard( Returns: intel [Dict[str,str]: intelligent search evaluation data with manual entries added """ - gold = pd.read_csv(gold_standard_path, names=["query", "document"]) - gold["query_clean"] = gold["query"].apply(lambda x: normalize_query(x)) - gold["docs_split"] = gold["document"].apply(lambda x: x.split(";")) - all_docs = list(set([a for b in gold["docs_split"].tolist() for a in b])) + gold_original = pd.read_csv(gold_standard_path, names=['query', 'document']) + logger.info(f"Reading in {gold_original.shape[0]} queries from the Gold Standard data") + + def add_extra_queries(intel: Dict[str,str]) -> Dict[str,str]: + '''Multiply query/doc pairs to add by using title/filename/id as queries''' + extra_queries = [] + docs = [] + for doc_id in intel['collection'].values(): + try: + json = open_json(doc_id + '.json', CORPUS_PATH) + extra_queries.append(json['display_title_s']) + docs.append(doc_id) + logger.info(f"Added extra queries for {doc_id}") + except: + logger.warning(f"Could not add extra queries for {doc_id}") + + df = pd.DataFrame() + df['query'] = extra_queries + df['document'] = docs + return df + + extra_queries_df = add_extra_queries(intel) + gold = pd.concat([gold_original, extra_queries_df]) + gold.reset_index(inplace = True) + logger.info(f"Added {extra_queries_df.shape[0]} extra queries to the Gold Standard") + + gold['query_clean'] = gold['query'].apply(lambda x: normalize_query(x)) + gold['docs_split'] = gold['document'].apply(lambda x: x.split(';')) + all_docs = list(set([a for b in gold['docs_split'].tolist() for a in b])) def add_key(mydict: Dict[str, str]) -> str: """Adds new key to queries/collections dictionaries""" @@ -274,12 +314,10 @@ def train_test_split(data: Dict[str, str], tts_ratio: float) -> Tuple[Dict[str, neg_passing = {} pos_passing = {} for q in queries: - subset = {i: data[i] for i in data.keys() if data[i]["query"] == q} - pos_sample = [i for i in subset.keys() if subset[i]["label"] == 1] - neg_sample = [i for i in subset.keys() if subset[i]["label"] == -1] - if ( - len(neg_sample) > 0 - ): # since we have so few negative samples, add to neg list if it has a negative ex + subset = {i:data[i] for i in data.keys() if data[i]['query']==q} + pos_sample = [i for i in subset.keys() if subset[i]['label']==0.95] + neg_sample = [i for i in subset.keys() if subset[i]['label']==-0.5] + if len(neg_sample)>0: #since we have so few negative samples, add to neg list if it has a negative ex neg_passing[q] = subset elif ( len(pos_sample) > 0 @@ -311,130 +349,15 @@ def train_test_split(data: Dict[str, str], tts_ratio: float) -> Tuple[Dict[str, metadata = { "date_created": str(date.today()), - "n_positive_samples": f"{str(len(pos_train_keys))} train queries / {str(len(pos_test_keys))} test queries", - "n_negative_samples": f"{str(len(neg_train_keys))} train queries / {str(len(neg_test_keys))} test queries", + "n_queries": f"{str(len(pos_train_keys))} train queries / {str(len(pos_test_keys))} test queries", "total_train_samples_size": len(train), "total_test_samples_size": len(test), - "train_queries": pos_train_keys + neg_train_keys, - "test_queries": pos_test_keys + neg_test_keys, - "split_ratio": tts_ratio, + "split_ratio": tts_ratio } return train, test, metadata - -def collect_matches( - data: pd.DataFrame, - nlp, - n_returns, - relations: Dict[str, str], - queries: Dict[str, str], - collection: Dict[str, str], - label: int, -) -> Tuple[Dict[str, str]]: - """Gets matching paragraphs for each query/docid pair - Args: - data [pd.DataFrame]: data df with processed text at paragraph_id level for sent_index - nlp: spacy nlp model for sim model reranking - relations [Dict[str, str]]: dictionary of query:doc matches from intelligent search data - queries [Dict[str, str]]: dictionary of query ids : query text from intelligent search data - collection [Dict[str, str]]: dictionary of match ids : match text (doc ids) from intelligent search data - label [int]: label to assign paragraphs (1=correct, 0=neutral, -1=confirmed nonmatch) - Returns: - [Tuple[Dict[str, str]]]: one dictionary of found search pairs, one dictionary of notfound search pairs - """ - found = {} - not_found = {} - count = 0 - for i in relations.keys(): - count += 1 - logger.info(count) - query = queries[i] - for k in relations[i]: - doc = collection[k] - # backup UID, overwritten if there are results - uid = str(i) + "_" + str(k) - try: - matching = get_best_paragraphs(query, doc, nlp, n_returns) - for match in matching: - uid = str(i) + "_" + str(match["id"]) - text = " ".join( - match["text"].split(" ")[:400] - ) # truncate to 400 tokens - found[uid] = { - "query": query, - "doc": doc, - "paragraph": text, - "label": label, - } - except Exception as e: - logger.warning("Could not get positive matches") - logger.warning(e) - not_found[uid] = {"query": query, "doc": doc, "label": label} - return found, not_found - - -def collect_negative_samples( - data: pd.DataFrame, - retriever, - n_returns: int, - relations: Dict[str, str], - queries: Dict[str, str], - collection: Dict[str, str], - any_matches: Dict[str, str], -) -> Tuple[Dict[str, str]]: - """Gets negative samples each query/docid pair - Args: - data [pd.DataFrame]: data df with processed text at paragraph_id level for sent_index - retriever: SentenceSearcher class - n_returns [int]: number of non-matching paragraphs to retrieve for each query - relations [Dict[str, str]]: dictionary of query:doc matches from intelligent search data - queries [Dict[str, str]]: dictionary of query ids : query text from intelligent search data - collection [Dict[str, str]]: dictionary of match ids : match text (doc ids) from intelligent search data - label [int]: label to assign paragraphs (1=correct, 0=neutral, -1=confirmed nonmatch) - Returns: - [Tuple[Dict[str, str]]]: one dictionary of found search pairs, one dictionary of notfound search pairs - """ - found = {} - not_found = {} - for i in relations.keys(): - query = queries[i] - for k in relations[i]: - doc = collection[k] - uid = ( - str(i) + "_" + str(k) + "_neg" - ) # backup UID, overwritten if there are results - try: - not_matching = get_negative_paragraphs( - data=data, - query=query, - doc_id=k, - retriever=retriever, - n_returns=n_returns, - any_matches=any_matches, - ) - for match in not_matching: - uid = str(i) + "_" + str(match["doc"]) - text = " ".join( - match["paragraph"].split(" ")[:400] - ) # truncate to 400 tokens - found[uid] = { - "query": query, - "doc": doc, - "paragraph": text, - "label": 0, - } - except Exception as e: - logger.warning(e) - not_found[uid] = {"query": query, "doc": doc, "label": 0} - - return found, not_found - - -def get_all_single_matches(): - validation_dir = get_most_recent_dir( - os.path.join(DATA_PATH, "validation", "domain", "sent_transformer") - ) +def get_all_single_matches(validation_dir): directory = os.path.join(validation_dir, "any") any_matches = {} try: @@ -451,27 +374,62 @@ def get_all_single_matches(): return any_matches +def make_training_data_csv(data, label): + + df = pd.DataFrame(data).T + df['match'] = df['label'].apply(lambda x: 1 if x >= 0.95 else 0) + matches = df[df['match']==1] + non_matches = df[df['match']==0] + + + def get_docs(mylist): + try: + return [i.split('.pdf')[0] for i in mylist] + except: + return [] + + def count_unique(mylist): + + return len(set(get_docs(mylist))) + + agg_match = pd.DataFrame(matches.groupby('query')['doc'].apply(list)) + agg_match.rename(columns = {'doc': 'matching_paragraphs'}, inplace = True) + agg_match['num_matching_paragraphs'] = agg_match['matching_paragraphs'].apply(lambda x: len(x)) + agg_match['num_matching_docs'] = agg_match['matching_paragraphs'].apply(lambda x: count_unique(x)) + + agg_nonmatch = pd.DataFrame(non_matches.groupby('query')['doc'].apply(list)) + agg_nonmatch.rename(columns = {'doc': 'nonmatching_paragraphs'}, inplace = True) + agg_nonmatch['num_nonmatching_paragraphs'] = agg_nonmatch['nonmatching_paragraphs'].apply(lambda x: len(x)) + agg_nonmatch['num_nonmatching_docs'] = agg_nonmatch['nonmatching_paragraphs'].apply(lambda x: count_unique(x)) + + combined = agg_match.merge(agg_nonmatch, on='query', how = 'outer') + combined['label'] = label + + def check_overlap(list1, list2): + return len(set(get_docs(list1)).intersection(get_docs(list2))) + + combined['overlap'] = [check_overlap(x, y) for x, y in zip(combined['matching_paragraphs'], combined['nonmatching_paragraphs'])] + combined['par_balance'] = combined['num_matching_paragraphs'] / combined['num_nonmatching_paragraphs'] + combined['doc_balance'] = combined['num_matching_docs'] / combined['num_nonmatching_docs'] + + combined.fillna(0, inplace = True) + + return combined + def make_training_data( index_path: Union[str, os.PathLike], - n_returns: int, - level: str, - update_eval_data: bool, - retriever=None, - sim_model_name: str = SIM_MODEL, - transformers_dir: Union[str, os.PathLike] = LOCAL_TRANSFORMERS_DIR, - gold_standard_path: Union[str, os.PathLike] = gold_standard_path, - tts_ratio: float = tts_ratio, - training_dir: Union[str, os.PathLike] = training_dir, -) -> Tuple[Dict[str, str]]: + level: str, + update_eval_data: bool, + testing_only: bool=False, + gold_standard_path: Union[str,os.PathLike]=gold_standard_path, + tts_ratio: float=tts_ratio, + training_dir: Union[str,os.PathLike]=training_dir) -> Tuple[Dict[str,str]]: """Makes training data based on new user search history data Args: index_path [str|os.PathLike]: path to the sent index for retrieving the training data (should be most recent index) - n_returns [int]: number of non-matching paragraphs to retrieve for each query level [str]: level of eval tier to use for training data (options: ['all', 'silver', 'gold']) update_eval_data [bool]: whether or not to update the eval data before making training data - sim_model_name [str]: name of sim model for loading SimilarityRanker - transformers_dir [Union[str,os.PathLike]]: directory of transformer models gold_standard_path [Union[str,os.PathLike]]: path to load in the manually curated gold_standard.csv tts_ratio [float]: train/test split ratio, float from 0-1 training_dir [Union[str,os.PathLike]]: directory for saving training data @@ -486,7 +444,7 @@ def make_training_data( or update_eval_data ): logger.info("**** Updating the evaluation data") - make_tiered_eval_data(index_path) + make_tiered_eval_data(index_path, testing_only) validation_dir = get_most_recent_dir( os.path.join(DATA_PATH, "validation", "domain", "sent_transformer") @@ -502,124 +460,53 @@ def make_training_data( logger.warning(e) intel = {} + # make save_dir + timestamp = str(validation_dir).split('/')[-1] + save_dir = os.path.join(training_dir, timestamp) + os.makedirs(save_dir) + logger.info(f"Created training data save directory {str(save_dir)}") + + ## gather all possible matches + any_matches = get_all_single_matches(validation_dir) + # add gold standard samples logger.info("**** Adding gold standard examples") intel = add_gold_standard(intel, gold_standard_path) - # set up save dir - save_dir = make_timestamp_directory(training_dir) - - try: - nlp = spacy.load("en_core_web_lg") - except: - logger.warning("Could not load spacy model") - - if not retriever: - logger.info("Did not init SentenceSearcher, loading now") - retriever = SentenceSearcher( - sim_model_name=sim_model_name, - index_path=index_path, - transformer_path=transformers_dir, - ) - # read in sent_index data - logger.info("**** Loading in sent index data from retriever") - try: - data = retriever.data - data["doc_id"] = data["paragraph_id"].apply( - lambda x: x.split(".pdf")[0]) - except Exception as e: - logger.info("Could not load in data from retriever") - logger.warning(e) - - any_matches = get_all_single_matches() - # get matching paragraphs - try: - correct_found, correct_notfound = collect_matches( - data=data, - queries=intel["queries"], - collection=intel["collection"], - relations=intel["correct"], - label=1, - nlp=nlp, - n_returns=n_returns, - ) - logger.info( - f"---Number of correct query/result pairs that were not found: {str(len(correct_notfound))}" - ) - except Exception as e: - logger.warning(e) - logger.warning("\nCould not retrieve positive matches\n") - try: - incorrect_found, incorrect_notfound = collect_matches( - data=data, - queries=intel["queries"], - collection=intel["collection"], - relations=intel["incorrect"], - label=-1, - nlp=nlp, - n_returns=n_returns, - ) - logger.info( - f"---Number of incorrect query/result pairs that were not found: {str(len(incorrect_notfound))}" - ) - except Exception as e: - logger.warning(e) - logger.warning("\nCould not retrieve negative matches\n") - - # get negative samples try: - all_relations = {**intel["correct"], **intel["incorrect"]} - neutral_found, neutral_notfound = collect_negative_samples( - data=data, - retriever=retriever, - n_returns=n_returns, - queries=intel["queries"], - collection=intel["collection"], - relations=all_relations, - any_matches=any_matches, - ) - logger.info( - f"---Number of negative sample pairs that were not found: {str(len(neutral_notfound))}" - ) + found, notfound = collect_paragraphs_es( + correct=intel['correct'], + incorrect=intel['incorrect'], + queries=intel['queries'], + collection=intel['collection'], + any_matches=any_matches) + logger.info(f"---Number of correct query/result pairs that were not found: {str(len(notfound))}") except Exception as e: logger.warning(e) - logger.warning("\nCould not retrieve negative samples\n") + logger.warning("\nCould not retrieve positive matches from ES\n") - # save a json of the query-doc pairs that did not retrieve an ES paragraph for training data - notfound = {**correct_notfound, **incorrect_notfound, **neutral_notfound} - logger.info( - f"---Number of total query/result pairs that were not found: {str(len(notfound))}" - ) - notfound_path = os.path.join(save_dir, "not_found_search_pairs.json") - with open(notfound_path, "w") as outfile: - json.dump(notfound, outfile) - - all_examples = {**neutral_found, **incorrect_found, **correct_found} - logger.info(f"Total size of query-doc pairs: {str(len(all_examples))}") - - # train/test split - train, test, metadata = train_test_split(all_examples, tts_ratio) + ## train/test split + train, test, metadata = train_test_split(found, tts_ratio) + metadata["sent_index_used"] = index_path + metadata["validation_data_used"] = validation_dir + metadata["not_found_search_pairs"] = str(len(notfound)) data = {"train": train, "test": test} logger.info(f"**** Generated training data: \n {metadata}") - # save data and metadata files - data_path = os.path.join(save_dir, "training_data.json") - metadata_path = os.path.join(save_dir, "training_metadata.json") - - with open(data_path, "w") as outfile: - json.dump(data, outfile) + ## Make summary csv of training data + train_df = make_training_data_csv(train, "train") + test_df = make_training_data_csv(test, "test") + fulldf = pd.concat([train_df, test_df]) + csv_path = os.path.join(save_dir, "retrieved_paragraphs.csv") + fulldf.to_csv(csv_path) - with open(metadata_path, "w") as outfile: - json.dump(metadata, outfile) + ## save data and metadata files + save_json("training_data.json", save_dir, data) + save_json("training_metadata.json", save_dir, metadata) + save_json("not_found_search_pairs.json", save_dir, notfound) + logger.info(f"Finished saving training data files to {save_dir}") -if __name__ == "__main__": - - make_training_data( - index_path="gamechangerml/models/sent_index_20220103", - n_returns=50, - level="silver", - update_eval_data=True, - ) + return diff --git a/gamechangerml/scripts/run_evaluation.py b/gamechangerml/scripts/run_evaluation.py index 41d8e6aa..189318a5 100644 --- a/gamechangerml/scripts/run_evaluation.py +++ b/gamechangerml/scripts/run_evaluation.py @@ -17,21 +17,33 @@ def eval_qa(model_name, sample_limit, eval_type="original"): logger.info("No eval_type selected. Options: ['original', 'gamechanger'].") def eval_sent(model_name, validation_data, eval_type="domain"): - metadata = open_json('metadata.json', os.path.join(MODEL_PATH, model_name)) - encoder = metadata['encoder_model'] - logger.info(f"Evaluating {model_name} created with {encoder}") + if "sent_index" in model_name: + logger.info("Evaluating a sentence index") + metadata = open_json('metadata.json', os.path.join(MODEL_PATH, model_name)) + encoder = metadata['encoder_model'] + index = model_name + logger.info(f"Evaluating {model_name} created with {encoder}") + else: + logger.info("Evaluating an encoder model") + encoder = model_name + index = None + logger.info(f"Evaluating encoder: {encoder}") if eval_type == "domain": + base_data_dir = os.path.join(DATA_PATH, "validation", "domain", "sent_transformer") if validation_data != "latest": - if os.path.exists(os.path.join(DATA_PATH, "validation", "domain", "sent_transformer", validation_data)): - data_path = os.path.join(DATA_PATH, "validation", "domain", "sent_transformer", validation_data) + if os.path.exists(os.path.join(base_data_dir, validation_data)): + data_path = os.path.join(base_data_dir, validation_data) else: logger.warning("Could not load validation data, path doesn't exist") data_path = None else: - data_path = None + try: + data_path = get_most_recent_dir(base_data_dir) + except: + data_path = None results = {} for level in ['gold', 'silver']: - domainEval = IndomainRetrieverEvaluator(index=model_name, data_path=data_path, data_level=level, encoder_model_name=encoder, sim_model_name=SimilarityConfig.BASE_MODEL, **EmbedderConfig.MODEL_ARGS) + domainEval = IndomainRetrieverEvaluator(index=index, data_path=data_path, data_level=level, encoder_model_name=encoder, sim_model_name=SimilarityConfig.BASE_MODEL, **EmbedderConfig.MODEL_ARGS) results[level] = domainEval.results elif eval_type == "original": originalEval = MSMarcoRetrieverEvaluator(**EmbedderConfig.MODEL_ARGS, encoder_model_name=EmbedderConfig.BASE_MODEL, sim_model_name=SimilarityConfig.BASE_MODEL) diff --git a/gamechangerml/scripts/update_eval_data.py b/gamechangerml/scripts/update_eval_data.py index a72d1809..895e88a8 100644 --- a/gamechangerml/scripts/update_eval_data.py +++ b/gamechangerml/scripts/update_eval_data.py @@ -16,7 +16,7 @@ SENT_INDEX = model_path_dict['sentence'] -def make_tiered_eval_data(index_path): +def make_tiered_eval_data(index_path, testing_only): if not index_path: index_path = SENT_INDEX @@ -36,6 +36,7 @@ def save_data( end_date: str, exclude_searches: List[str], filter_queries: bool, + testing_only: bool, save_dir: Union[str,os.PathLike]=save_dir) -> Tuple[Dict[str,str], Dict[str,str], Dict[str,str]]: """Makes eval data for each tier level using args from config.py and saves to save_dir Args: @@ -59,7 +60,8 @@ def save_data( min_correct_matches=min_matches, max_results=max_res, filter_queries=filter_queries, - index_path=index_path + index_path=index_path, + testing_only=testing_only ) save_intel = { @@ -67,7 +69,10 @@ def save_data( "collection": intel.collection, "meta_relations": intel.all_relations, "correct": intel.correct, - "incorrect": intel.incorrect} + "incorrect": intel.incorrect, + "correct_vals": intel.correct_vals, + "incorrect_vals": intel.incorrect_vals + } metadata = { "date_created": str(date.today()), @@ -100,18 +105,21 @@ def save_data( all_data = save_data( level='any', filter_queries = False, + testing_only = testing_only, **ValidationConfig.TRAINING_ARGS ) silver_data = save_data( level='silver', filter_queries = False, + testing_only=testing_only, **ValidationConfig.TRAINING_ARGS ) gold_data = save_data( level='gold', filter_queries = False, # should use same (updated) exclude list of queries as silver_data + testing_only=testing_only, **ValidationConfig.TRAINING_ARGS ) @@ -120,6 +128,6 @@ def save_data( if __name__ == '__main__': try: - make_tiered_eval_data(index_path=None) + make_tiered_eval_data(index_path=None, testing_only=False) except Exception as e: logger.warning(e, exc_info=True) \ No newline at end of file diff --git a/gamechangerml/src/model_testing/evaluation.py b/gamechangerml/src/model_testing/evaluation.py index b238119e..08742a5e 100644 --- a/gamechangerml/src/model_testing/evaluation.py +++ b/gamechangerml/src/model_testing/evaluation.py @@ -4,6 +4,8 @@ import csv import math from datetime import datetime +from sentence_transformers import util +from gamechangerml import REPO_PATH, CORPUS_PATH from gamechangerml.src.search.sent_transformer.model import ( SentenceEncoder, SentenceSearcher, @@ -13,10 +15,6 @@ from gamechangerml.src.search.query_expansion.qe import QE from gamechangerml.src.search.query_expansion.utils import remove_original_kw from gamechangerml.configs.config import ( - QAConfig, - EmbedderConfig, - SimilarityConfig, - QexpConfig, ValidationConfig, ) from gamechangerml.src.utilities.text_utils import normalize_answer @@ -26,7 +24,6 @@ NLIData, MSMarcoData, QADomainData, - RetrieverGSData, UpdatedGCRetrieverData, QEXPDomainData, ) @@ -40,10 +37,12 @@ init_timer() model_path_dict = get_model_paths() -LOCAL_TRANSFORMERS_DIR = model_path_dict["transformers"] +try: + LOCAL_TRANSFORMERS_DIR = model_path_dict["transformers"] +except: + LOCAL_TRANSFORMERS_DIR = 'gamechangerml/models/transformers' SENT_INDEX_PATH = model_path_dict["sentence"] - class TransformerEvaluator: def __init__(self, transformer_path=LOCAL_TRANSFORMERS_DIR, use_gpu=False): @@ -307,9 +306,9 @@ def __init__( self.encoder_model_name = encoder_model_name self.model_path = os.path.join(encoder_model_name, transformer_path) - def make_index(self, encoder, corpus_path): + def make_index(self, encoder, corpus_path, index_path, files_to_use=None): - return encoder.index_documents(corpus_path) + return encoder.index_documents(corpus_path, index_path, files_to_use) def predict(self, data, index, retriever, eval_path, k): @@ -317,6 +316,7 @@ def predict(self, data, index, retriever, eval_path, k): "index", "queries", "top_expected_ids", + f"results@{k}", "hits", "true_positives", "false_positives", @@ -324,12 +324,19 @@ def predict(self, data, index, retriever, eval_path, k): "true_negatives", "reciprocal_rank", "average_precision", - "precision@{}".format(k), - "recall@{}".format(k), + f"precision@{k}", + f"recall@{k}" ] - fname = index.split("/")[-1] + ## make name for the csv of results + if "/" in index: + fname = index.split("/")[-1] + else: + fname = index csv_filename = os.path.join( eval_path, timestamp_filename(fname, ".csv")) + logger.info(f"Making a csv of test results, saved at: {csv_filename}") + + # make the csv with open(csv_filename, "w") as csvfile: csvwriter = csv.writer(csvfile) csvwriter.writerow(columns) @@ -344,7 +351,7 @@ def predict(self, data, index, retriever, eval_path, k): doc_texts = [x["text"] for x in doc_results] doc_ids = [x["id"] for x in doc_results] doc_scores = [x["score"] for x in doc_results] - if index != "msmarco_index": + if fname != "msmarco_index": doc_ids = [".".join(i.split(".")[:-1]) for i in doc_ids] logger.info( f"retrieved: {str(doc_texts)}, {str(doc_ids)}, {str(doc_scores)}" @@ -377,6 +384,8 @@ def predict(self, data, index, retriever, eval_path, k): hit["score"] = doc_scores[rank] hits.append(hit) true_pos += 1 + else: + false_pos += 1 if ( len(doc_ids) < k ): # if there are not k predictions, there are pred negatives @@ -391,14 +400,16 @@ def predict(self, data, index, retriever, eval_path, k): fn += false_neg tn += true_neg tp += true_pos + fp += false_pos logger.info( - f"Metrics: fn: {str(fn)}, tn: {str(tn)}, tp: {str(tp)}") + f"Metrics: fn: {str(fn)}, fp: {str(fp)}, tn: {str(tn)}, tp: {str(tp)}") # save metrics to csv row = [ [ str(query_count), str(query), str(expected_docs), + str(doc_results), str(hits), str(true_pos), str(false_pos), @@ -415,7 +426,7 @@ def predict(self, data, index, retriever, eval_path, k): def eval( self, data, index, retriever, data_name, eval_path, model_name, k=retriever_k - ): + ): df, tp, tn, fp, fn, total_expected = self.predict( data, index, retriever, eval_path, k @@ -515,7 +526,6 @@ def __init__( model_name=encoder_model_name, ) - class IndomainRetrieverEvaluator(RetrieverEvaluator): def __init__( self, @@ -525,52 +535,105 @@ def __init__( return_id, verbose, data_level, + index, create_index=True, data_path=None, encoder=None, retriever=None, transformer_path=LOCAL_TRANSFORMERS_DIR, - index=SENT_INDEX_PATH, - use_gpu=False, - ): + overwrite_test_corpus=True, + use_gpu=False + ): super().__init__(transformer_path, encoder_model_name, use_gpu) self.model_path = os.path.join(transformer_path, encoder_model_name) - if not index: - logger.info("No index provided for evaluating.") - if create_index: - self.index_path = os.path.join( - os.path.dirname(transformer_path), "sent_index_TEST" - ) - logger.info( - "Making new embeddings index at {}".format( - str(self.index_path)) + self.data_path = data_path + self.data_level = data_level + logger.info(f"Using {str(self.data_path)} for validation data") + if not index: # if there is no index to evaluate, we need to make one + logger.info("No index provided for evaluating. Checking if test index exists.") + self.index_path = os.path.join( + transformer_path, encoder_model_name, "sent_index_TEST" ) - if not os.path.exists(self.index_path): - os.makedirs(self.index_path) - if encoder: - self.encoder = encoder - else: - self.encoder = SentenceEncoder( - encoder_model_name=encoder_model_name, - min_token_len=min_token_len, - return_id=return_id, - verbose=verbose, - use_gpu=use_gpu, + # make evaluations path + self.eval_path = check_directory( + os.path.join(self.model_path, "evals_gc", data_level) + ) + if os.path.isdir(self.index_path) and len(os.listdir(self.index_path)) > 0: + logger.info("Found a test index for this model, using that.") + else: + logger.info("Did not find a test index - creating one.") + if create_index: # make test index in the encoder model directory + # create directory for the test index + if not os.path.exists(self.index_path): + os.makedirs(self.index_path) + logger.info( + "Making new embeddings index at {}".format( + str(self.index_path)) ) - self.make_index( - encoder=self.encoder, - corpus_path=ValidationConfig.DATA_ARGS["test_corpus_dir"], - index_path=self.index_path, - ) - else: + + # set up the encoder to make the index + if encoder: # if encoder model is passed, use that + logger.info(f"Using pre-init encoder to make the index") + self.encoder = encoder + else: # otherwise init an encoder to make the index + logger.info(f"Loading {encoder_model_name} to make the index") + self.encoder = SentenceEncoder( + encoder_model_name=encoder_model_name, + min_token_len=min_token_len, + return_id=return_id, + verbose=verbose, + use_gpu=use_gpu, + transformer_path=LOCAL_TRANSFORMERS_DIR + ) + + # create the test corpus + include_ids = self.collect_docs_for_index() + if len(include_ids) > 0: + logger.info(f"Collected {str(len(include_ids))} doc IDs to include in test index") + logger.info(f"{str(include_ids[:5])}") + else: + logger.warning("Function to retrieve doc IDs didn't work") + quit + + # make a (test) index for evaluating the model + logger.info("Making the test index") + self.make_index( + encoder=self.encoder, + corpus_path=CORPUS_PATH, + index_path=self.index_path, + files_to_use=include_ids + ) + + ## save index metadata + metadata = { + "date": datetime.now().strftime("%Y-%m-%d"), + "model_type": "sentence index", + "base_model_path": self.model_path, + "current_model_path": self.index_path, + "validation_data_dir": self.data_path, + "include_ids": include_ids, + } + save_json("metadata.json", self.index_path, metadata) + logger.info("Saved metadata to the index dir") + + index = self.index_path + else: # if a full index is passed, use that for evaluating self.index_path = os.path.join( os.path.dirname(transformer_path), index) - if self.index_path: + # make evaluations path + self.eval_path = check_directory( + os.path.join(self.index_path, "evals_gc", data_level) + ) + + if self.index_path: # at this point, there should be an index path + # collect all the doc ids in the index self.doc_ids = open_txt(os.path.join( self.index_path, "doc_ids.txt")) + + # if retriever exists, use that, otherwise make one if retriever: self.retriever = retriever else: @@ -579,12 +642,15 @@ def __init__( index_path=self.index_path, transformer_path=transformer_path, ) - self.eval_path = check_directory( - os.path.join(self.index_path, "evals_gc", data_level) - ) + + # make the validation data + logger.info("Collecting query/result pairs for testing") self.data = UpdatedGCRetrieverData( - available_ids=self.doc_ids, level=data_level, data_path=data_path + available_ids=self.doc_ids, level=self.data_level, data_path=self.data_path ) + + logger.info("Generating results") + # generate the evaluation results self.results = self.eval( data=self.data, index=index, @@ -594,6 +660,24 @@ def __init__( model_name=encoder_model_name, ) + def collect_docs_for_index(self): + '''Check if the model has an associated training data file with IDs to include in test index.''' + + if os.path.isfile(os.path.join(self.model_path, "metadata.json")): + logger.info("This is a finetuned model: collecting training data IDs for index") + metadata = open_json("metadata.json", self.model_path) + train_data_path = metadata['training_data_dir'] + training_data = pd.read_csv(train_data_path) + include_ids = [i.split('.pdf_')[0] for i in list(set(training_data['doc']))] + else: + logger.info("This is a base model: collecting validation data IDs for index") + base_val_path = os.path.join(self.data_path, self.data_level) + validation_data = open_json("intelligent_search_data.json", base_val_path) + validation_data = json.loads(validation_data) + include_ids = [i.strip().lstrip() for i in validation_data['collection'].values()] + + include_ids = [i + '.json' if i[-5:] != 'json' else i for i in include_ids] + return include_ids class SimilarityEvaluator(TransformerEvaluator): def __init__( diff --git a/gamechangerml/src/model_testing/query_es.py b/gamechangerml/src/model_testing/query_es.py new file mode 100644 index 00000000..a1d86f0f --- /dev/null +++ b/gamechangerml/src/model_testing/query_es.py @@ -0,0 +1,220 @@ +def make_query_one_doc(query, docid): + """Make a query for ES that will search one doc for the best matching paragraphs.""" + + true = True + false = False + + search_query = { + "_source": { + "includes": ["pagerank_r", "kw_doc_score_r", "orgs_rs", "topics_rs"] + }, + "stored_fields": [ + "filename", + "title", + "page_count", + "doc_type", + "doc_num", + "ref_list", + "id", + "summary_30", + "keyw_5", + "p_text", + "type", + "p_page", + "display_title_s", + "display_org_s", + "display_doc_type_s", + "is_revoked_b", + "access_timestamp_dt", + "publication_date_dt", + "crawler_used_s", + ], + "from": 0, + "size": 50, + "track_total_hits": true, + "query": { + "bool": { + "must": [ + {"match": {"id": docid}}, + { + "nested": { + "path": "paragraphs", + "inner_hits": { + "_source": false, + "stored_fields": [ + "paragraphs.page_num_i", + "paragraphs.filename", + "paragraphs.par_raw_text_t", + ], + "from": 0, + "size": 5, + "highlight": { + "fields": { + "paragraphs.filename.search": { + "number_of_fragments": 0 + }, + "paragraphs.par_raw_text_t": { + "fragment_size": 200, + "number_of_fragments": 1, + }, + }, + "fragmenter": "span", + }, + }, + "query": { + "bool": { + "should": [ + { + "wildcard": { + "paragraphs.filename.search": { + "value": query, + "boost": 15, + } + } + }, + { + "query_string": { + "query": query, + "default_field": "paragraphs.par_raw_text_t", + "default_operator": "AND", + "fuzzy_max_expansions": 100, + "fuzziness": "AUTO", + } + }, + ] + } + }, + } + }, + ], + "should": [ + { + "multi_match": { + "query": query, + "fields": [ + "keyw_5^2", + "id^2", + "summary_30", + "paragraphs.par_raw_text_t", + ], + "operator": "or", + } + }, + {"rank_feature": {"field": "pagerank_r", "boost": 0.5}}, + {"rank_feature": {"field": "kw_doc_score_r", "boost": 0.1}}, + ], + } + }, + } + + return search_query + +def make_query(query): + """Make a query for ES that will search any docs for matching paragraphs.""" + + true = True + false = False + + search_query = { + "_source": { + "includes": ["pagerank_r", "kw_doc_score_r", "orgs_rs", "topics_rs"] + }, + "stored_fields": [ + "filename", + "title", + "page_count", + "doc_type", + "doc_num", + "ref_list", + "id", + "summary_30", + "keyw_5", + "p_text", + "type", + "p_page", + "display_title_s", + "display_org_s", + "display_doc_type_s", + "is_revoked_b", + "access_timestamp_dt", + "publication_date_dt", + "crawler_used_s", + ], + "from": 0, + "size": 50, + "track_total_hits": true, + "query": { + "bool": { + "must": [ + { + "nested": { + "path": "paragraphs", + "inner_hits": { + "_source": false, + "stored_fields": [ + "paragraphs.page_num_i", + "paragraphs.filename", + "paragraphs.par_raw_text_t", + ], + "from": 0, + "size": 5, + "highlight": { + "fields": { + "paragraphs.filename.search": { + "number_of_fragments": 0 + }, + "paragraphs.par_raw_text_t": { + "fragment_size": 200, + "number_of_fragments": 1, + }, + }, + "fragmenter": "span", + }, + }, + "query": { + "bool": { + "should": [ + { + "wildcard": { + "paragraphs.filename.search": { + "value": query, + "boost": 15, + } + } + }, + { + "query_string": { + "query": query, + "default_field": "paragraphs.par_raw_text_t", + "default_operator": "AND", + "fuzzy_max_expansions": 100, + "fuzziness": "AUTO", + } + }, + ] + } + }, + } + }, + ], + "should": [ + { + "multi_match": { + "query": query, + "fields": [ + "keyw_5^2", + "id^2", + "summary_30", + "paragraphs.par_raw_text_t", + ], + "operator": "or", + } + }, + {"rank_feature": {"field": "pagerank_r", "boost": 0.5}}, + {"rank_feature": {"field": "kw_doc_score_r", "boost": 0.1}}, + ], + } + }, + } + + return search_query diff --git a/gamechangerml/src/model_testing/train_tests.py b/gamechangerml/src/model_testing/train_tests.py new file mode 100644 index 00000000..ab992e0f --- /dev/null +++ b/gamechangerml/src/model_testing/train_tests.py @@ -0,0 +1,283 @@ +from tkinter import NONE +from path import Path +import requests +import logging +import os +import json +import time +import shutil +import pandas as pd +import argparse + +logger = logging.getLogger() +training_dir= "gamechangerml/data/test" +http = requests.Session() + +GC_ML_HOST = os.environ.get("GC_ML_HOST", default="localhost") +API_URL = f"{GC_ML_HOST}:5000" if "http" in GC_ML_HOST else f"http://{GC_ML_HOST}:5000" + +def open_json(filename, path): + '''Opens a json file''' + with open(os.path.join(path, filename)) as f: + return json.load(f) + +def get_most_recent_dir(parent_dir): + + subdirs = [os.path.join(parent_dir, d) for d in os.listdir(parent_dir) if os.path.isdir(os.path.join(parent_dir, d))] + if len(subdirs) > 0: + return max(subdirs, key=os.path.getctime) + else: + logger.error("There are no subdirectories to retrieve most recent data from") + return None + +def delete_files(path): + '''Deletes all files in a directory''' + print(f"Cleaning up: removing test files from {str(path)}") + for file in os.listdir(path): + fpath = os.path.join(path, file) + print(fpath) + try: + shutil.rmtree(fpath) + except OSError: + os.remove(fpath) + try: + os.rmdir(path) + except OSError as e: + logger.error("Error: %s : %s" % (path, e.strerror)) + +def wait(filename, path, type, attempts=180): + + i = 0 + condition = False + while i < attempts: + if type == 'open_json': + try: + condition = open_json(filename, path) + except: + pass + elif type == 'check_file': + condition = os.path.isfile(os.path.join(path, filename)) + elif type == 'check_dir': + condition = os.path.isdir(path) and len(os.listdir(path)) >0 + if condition: + print("Condition met, breaking the wait loop") + break + else: + print(f"Countdown: {str((attempts-i)*5)} seconds left...") + i += 1 + time.sleep(5) + + return condition + +def wait_matching_dir(base_dir, timestamp, attempts = 180): + + i = 0 + passed = False + while i < attempts: + print(f"Countdown: {str((attempts-i)*5)} seconds left...") + most_recent = get_most_recent_dir(base_dir) + if str(most_recent).split('/')[-1] == str(timestamp): + print("Directory available, breaking the wait loop") + passed = True + break + else: + i += 1 + time.sleep(5) + + return passed + +class TestTrain(): + + def __init__(self, model): + + self.model = model + + def call_finetune(self): + + print("*** Requesting finetune from MLAPI...") + + model_dict = { + "build_type": "sent_finetune", + "model": self.model, + "batch_size": 8, + "epochs": 1, + "warmup_steps": 100, + "remake_train_data": True, + "testing_only": True + } + resp = http.post(API_URL + "/trainModel", json=model_dict) + + print(f"Connected to MLAPI: {str(resp.ok)}") + + + def made_validation_data(self, val_path): + + try: + ## check created validation data + test_any = json.loads(open_json("intelligent_search_data.json", "gamechangerml/data/test_data/test_validation/any")) + test_silver = json.loads(open_json("intelligent_search_data.json", "gamechangerml/data/test_data/test_validation/silver")) + test_gold = json.loads(open_json("intelligent_search_data.json", "gamechangerml/data/test_data/test_validation/gold")) + + gold = json.loads(open_json("intelligent_search_data.json", os.path.join(val_path, "gold"))) + silver = json.loads(open_json("intelligent_search_data.json", os.path.join(val_path, "silver"))) + any_ = json.loads(open_json("intelligent_search_data.json", os.path.join(val_path, "any"))) + + results = { + "Gold correct data match": gold['correct_vals'] == test_gold['correct_vals'], + "Gold incorrect data match": gold['incorrect_vals'] == test_gold['incorrect_vals'], + "Silver correct data match": silver['correct_vals'] == test_silver['correct_vals'], + "Silver incorrect data match": silver['incorrect_vals'] == test_silver['incorrect_vals'], + "Any correct data match": any_['correct_vals'] == test_any['correct_vals'], + "Any incorrect data match": any_['incorrect_vals'] == test_any['incorrect_vals'] + } + except: + results = {} + + print(results) + + def made_training_data(self, metadata, training_path): + + try: + df = pd.read_csv(os.path.join(training_path, "retrieved_paragraphs.csv")) + + results = { + "At least one matching doc per query": df['num_matching_docs'].min() >= 1, + "At least one nonmatching doc per query": df['num_nonmatching_docs'].min() >= 1, + "No overlapping docs between match/nonmatch paragraphs": df['overlap'].sum() == 0, + "Balance of classes >=0.2 ": df['par_balance'].min() >= 0.2, + "Train/test query counts match": metadata['n_queries'] == '21 train queries / 5 test queries' + } + + except: + results = {} + + print(results) + + + def finetuned_model(self, metadata): + + try: + results = { + "Num training samples > 100": metadata["n_training_samples"] > 100 + } + except: + results = {} + + print(results) + + def evaluated_model(self, gold_evals_path): + + try: + eval_file = [i for i in os.listdir(gold_evals_path) if i[:14]=="retriever_eval"][0] + print(eval_file) + gold_evals = open_json(eval_file, gold_evals_path) + + results = { + "Made gold data evals": bool(gold_evals), + "Query count matches": gold_evals["query_count"] == 33, + "MRR greater than 0": gold_evals["MRR"] > 0, + "mAP greater than 0": gold_evals["mAP"] > 0, + "recall greater than 0": gold_evals["recall"] > 0 + } + except: + results = {} + + print(results) + + def cleanup_files(self, eval_path, training_path, model_path): + + try: + delete_files(training_path) + delete_files(eval_path) + delete_files(model_path) + except Exception as e: + logger.warning("Could not delete test files") + logger.warning(e) + +if __name__ == '__main__': + + parser = argparse.ArgumentParser(description="Test finetuning") + + parser.add_argument("--model", "-m", dest="model", help="base transformer model for finetuning") + parser.add_argument("--cleanup", "-c", dest="cleanup", required=False, help="whether to delete files after test") + args = parser.parse_args() + model = args.model + cleanup = args.cleanup if args.cleanup else True + + try: + test = TestTrain(model) + test.call_finetune() + + time.sleep(15) # wait for a new directory + + print("\n*** Checking validation data created...") + val_path = get_most_recent_dir("gamechangerml/data/validation/domain/sent_transformer") + timestamp = str(val_path).split('/')[-1] + gold_dir = os.path.join(val_path, "gold") + print(f"Looking for gold validation data at: {gold_dir}") + passed = wait("intelligent_search_data.json", gold_dir, type='check_file') + + if passed: + test.made_validation_data(val_path) + else: + print("Did not find validation data") + quit + + print("\n*** Checking training data created...") + train_dir = ("gamechangerml/data/training/sent_transformer") + passed = wait_matching_dir(train_dir, timestamp) + + if passed: + training_path = get_most_recent_dir(train_dir) + else: + print("Could not get updated training data dir") + quit + + print(f"Looking for training metadata at: {training_path}") + metadata = wait("training_metadata.json", training_path, type='open_json') + + if metadata: + time.sleep(2) + test.made_training_data(metadata, training_path) + else: + print("Did not find training data") + quit + + + print("\n*** Checking finetuned model created...") + model_name = test.model + '_TEST_' + timestamp + model_dir = "gamechangerml/models/transformers" + passed = wait_matching_dir(model_dir, model_name) + if passed: + model_path = get_most_recent_dir(model_dir) + else: + print("Could not get updated model dir") + quit + + print(f"Looking for finetuned model files at: {model_path}") + model_metadata = wait("metadata.json", model_path, type = 'open_json') + + if model_metadata: + test.finetuned_model(model_metadata) + else: + print("Did not finetune the model") + quit + + + print("\n*** Checking model evaluations created...") + gold_evals_dir = os.path.join(model_path, "evals_gc", "gold") + print(f"Looking for gold standard evaluations at: {gold_evals_dir}") + passed = wait("", gold_evals_dir, type='check_dir', attempts=500) + + if passed: + test.evaluated_model(gold_evals_dir) + else: + print("Did not evaluate the model") + quit + + print("*** Deleting test files...") + if passed and cleanup: + test.cleanup_files(val_path, training_path, model_path) + + except Exception as e: + logger.warning(e, exc_info=True) \ No newline at end of file diff --git a/gamechangerml/src/model_testing/validation_data.py b/gamechangerml/src/model_testing/validation_data.py index e2edacf1..f3026061 100644 --- a/gamechangerml/src/model_testing/validation_data.py +++ b/gamechangerml/src/model_testing/validation_data.py @@ -1,5 +1,6 @@ import pandas as pd import numpy as np +from collections import OrderedDict from gamechangerml.src.utilities.text_utils import normalize_answer, normalize_query, filter_title_queries from gamechangerml.src.utilities.test_utils import * from gamechangerml.configs.config import ValidationConfig @@ -194,7 +195,7 @@ def dictify_data(self, available_ids): relations = dict(zip(q_idx, doc_list)) logger.info( - "Generated {} test queries of gold standard data".format( + "Generated {} test queries of gold standard data from search history".format( len(query_list)) ) @@ -219,7 +220,7 @@ def __init__( else: new_data = get_most_recent_dir( os.path.join( - ValidationConfig.DATA_ARGS["validation_dir"], "sent_transformer" + ValidationConfig.DATA_ARGS["validation_dir"], "domain", "sent_transformer" ) ) self.data_path = os.path.join(new_data, level) @@ -329,9 +330,9 @@ def get_sample_csv(self, sample_limit): class MatamoFeedback: - def __init__(self, start_date, end_date, exclude_searches): + def __init__(self, start_date, end_date, exclude_searches, testing_only=False): - self.matamo = concat_matamo() + self.matamo = concat_matamo(testing_only) self.start_date = start_date self.end_date = end_date self.exclude_searches = exclude_searches @@ -404,9 +405,9 @@ def process_row(row, col_name): class SearchHistory: - def __init__(self, start_date, end_date, exclude_searches): + def __init__(self, start_date, end_date, exclude_searches, testing_only=False): - self.history = concat_search_hist() + self.history = concat_search_hist(testing_only) self.start_date = start_date self.end_date = end_date self.exclude_searches = exclude_searches @@ -428,7 +429,9 @@ def clean_quot(string): return string.replace(""", "'").replace("'", "'").lower() def clean_doc(string): - return string.split(".pdf")[0] + doc = string.split(".pdf")[0] + doc = ' '.join([i for i in doc.split(' ') if i != '']) + return doc def is_question(string): """If we find a good way to use search history for QA validation (not used currently)""" @@ -461,16 +464,17 @@ def is_question(string): class SearchValidationData: - def __init__(self, start_date, end_date, exclude_searches): + def __init__(self, start_date, end_date, exclude_searches, testing_only): self.start_date = start_date self.end_date = end_date self.exclude_searches = exclude_searches + self.testing_only = testing_only self.matamo_data = MatamoFeedback( - self.start_date, self.end_date, self.exclude_searches + self.start_date, self.end_date, self.exclude_searches, self.testing_only ) self.history_data = SearchHistory( - self.start_date, self.end_date, self.exclude_searches + self.start_date, self.end_date, self.exclude_searches, self.testing_only ) @@ -538,10 +542,11 @@ def __init__( min_correct_matches, max_results, filter_queries, - index_path + index_path, + testing_only ): - super().__init__(start_date, end_date, exclude_searches) + super().__init__(start_date, end_date, exclude_searches, testing_only) self.exclude_searches = exclude_searches self.data = pd.concat( [self.matamo_data.intel, self.history_data.intel_matched] @@ -550,7 +555,7 @@ def __init__( self.max_results = max_results self.filter_queries = filter_queries self.index_path = index_path - self.queries, self.collection, self.all_relations, self.correct, self.incorrect = self.make_intel() + self.queries, self.collection, self.all_relations, self.correct, self.incorrect, self.correct_vals, self.incorrect_vals = self.make_intel() def make_intel(self): @@ -596,12 +601,40 @@ def make_intel(self): max_results=self.max_results, ) + def map_values(queries, collection, relations): + vals_dict = {} + for key in relations.keys(): + query = queries[key] + doc_keys = relations[key] + docs = [collection[i] for i in doc_keys] + vals_dict[query] = docs + + return vals_dict + + correct_vals = map_values(intel_search_queries, intel_search_results, correct) + incorrect_vals = map_values(intel_search_queries, intel_search_results, incorrect) + + def sort_dictionary(dictionary): + + mydict = OrderedDict(dictionary.items()) + mydict_new = {} + for key in mydict.keys(): + vals = mydict[key] + vals.sort() + mydict_new[key] = vals + return mydict_new + + correct_vals = sort_dictionary(correct_vals) + incorrect_vals = sort_dictionary(incorrect_vals) + return ( intel_search_queries, intel_search_results, new_intel_metadata, correct, incorrect, + correct_vals, + incorrect_vals ) diff --git a/gamechangerml/src/search/sent_transformer/finetune.py b/gamechangerml/src/search/sent_transformer/finetune.py index e40e4a1e..8c3123c0 100644 --- a/gamechangerml/src/search/sent_transformer/finetune.py +++ b/gamechangerml/src/search/sent_transformer/finetune.py @@ -3,13 +3,14 @@ from datetime import datetime from gamechangerml.api.utils.logger import logger from gamechangerml.src.utilities import utils as utils -from gamechangerml.src.utilities.test_utils import open_json, timestamp_filename, cos_sim +from gamechangerml.src.utilities.test_utils import open_json, save_json, timestamp_filename +from gamechangerml.scripts.run_evaluation import eval_sent from time import sleep import tqdm import threading import logging import gc -from sentence_transformers import SentenceTransformer, InputExample, util, losses +from sentence_transformers import SentenceTransformer, InputExample, losses from torch.utils.data import DataLoader import pandas as pd from datetime import date @@ -21,7 +22,6 @@ import torch.nn.functional as F from torch import nn torch.cuda.empty_cache() -from gamechangerml.src.model_testing.metrics import reciprocal_rank_score, get_MRR S3_DATA_PATH = "bronze/gamechanger/ml-data" @@ -48,9 +48,8 @@ def fix_model_config(model_load_path): except: logger.info("Could not update model config file") - def format_inputs(train, test, data_dir): - """Create input data for dataloader and df for tracking cosine sim""" + """Create input data for dataloader and df with train/test split data""" train_samples = [] all_data = [] @@ -61,28 +60,30 @@ def format_inputs(train, test, data_dir): score = float(train[i]["label"]) inputex = InputExample(str(count), texts, score) train_samples.append(inputex) - all_data.append([train[i]["query"], texts, score, "train"]) + all_data.append([train[i]["query"], train[i]["doc"], score, "train"]) count += 1 - #processmanager.update_status(processmanager.loading_data, count, total) for x in test.keys(): texts = [test[x]["query"], test[x]["paragraph"]] score = float(test[x]["label"]) - all_data.append([test[x]["query"], texts, score, "test"]) + all_data.append([test[x]["query"], test[x]["doc"], score, "test"]) count += 1 processmanager.update_status(processmanager.loading_data, count, total) - df = pd.DataFrame(all_data, columns=["key", "pair", "score", "label"]) - df.to_csv(os.path.join(data_dir, timestamp_filename("finetuning_data", ".csv"))) - - return train_samples + df = pd.DataFrame(all_data, columns=["key", "doc", "score", "label"]) + df.drop_duplicates(subset = ['doc', 'score', 'label'], inplace = True) + logger.info(f"Generated training data CSV of {str(df.shape[0])} rows") + df_path = os.path.join(data_dir, timestamp_filename("finetuning_data", ".csv")) + df.to_csv(df_path) + return train_samples, df_path class STFinetuner(): def __init__(self, model_load_path, model_save_path, shuffle, batch_size, epochs, warmup_steps): fix_model_config(model_load_path) + self.model_load_path = model_load_path self.model = SentenceTransformer(model_load_path) self.model_save_path = model_save_path self.shuffle = shuffle @@ -98,30 +99,30 @@ def retrain(self, data_dir, testing_only, version): train = data["train"] test = data["test"] - del data - gc.collect() - if testing_only: logger.info( "Creating smaller dataset just for testing finetuning.") - train_keys = list(train.keys())[:10] - test_keys = list(test.keys())[:10] - train = {k: train[k] for k in train_keys} - test = {k: test[k] for k in test_keys} + train_queries = list(set([train[i]['query'] for i in train.keys()]))[:30] + test_queries = list(set([test[i]['query'] for i in test.keys()]))[:10] + train = {k: train[k] for k in train.keys() if train[k]['query'] in train_queries} + test = {k: test[k] for k in test.keys() if test[k]['query'] in test_queries} + + del data + gc.collect() processmanager.update_status(processmanager.training, 0, 1,thread_id=threading.current_thread().ident) sleep(0.1) # make formatted training data - train_samples = format_inputs(train, test, data_dir) - + train_samples, df_path = format_inputs(train, test, data_dir) + len_samples = len(train_samples) # finetune on samples logger.info("Starting dataloader...") # pin_memory=self.pin_memory) train_dataloader = DataLoader( train_samples, shuffle=self.shuffle, batch_size=self.batch_size) train_loss = losses.CosineSimilarityLoss(model=self.model) - del train_samples - gc.collect() + #del train_samples + #gc.collect() logger.info("Finetuning the encoder model...") self.model.fit(train_objectives=[ (train_dataloader, train_loss)], epochs=self.epochs, warmup_steps=self.warmup_steps) @@ -132,6 +133,25 @@ def retrain(self, data_dir, testing_only, version): logger.info("Finetuned model saved to {}".format( str(self.model_save_path))) + # save metadata with the finetuned model + metadata = { + "date": datetime.now().strftime("%Y-%m-%d"), + "model_type": "finetuned encoder", + "base_model_path": self.model_load_path, + "current_model_path": self.model_save_path, + "training_data_dir": df_path, + "n_training_samples": len_samples, + "version": version, + "testing_only": testing_only, + "shuffle": self.shuffle, + "batch_size": self.batch_size, + "epochs": self.epochs, + "warmup_steps": self.warmup_steps + } + + save_json("metadata.json", self.model_save_path, metadata) + logger.info(f"Finetuned model metadata saved to {self.model_save_path}/metadata.json") + # when not testing only, save to S3 if not testing_only: logger.info("Saving data to S3...") @@ -155,8 +175,8 @@ def retrain(self, data_dir, testing_only, version): utils.upload(s3_path, dst_path, "transformers", model_id) logger.info(f"*** Saved model to S3: {s3_path}") - return {} - except Exception as e: logger.warning("Could not complete finetuning") logger.error(e) + + return diff --git a/gamechangerml/src/search/sent_transformer/model.py b/gamechangerml/src/search/sent_transformer/model.py index 8bfad473..409fade0 100644 --- a/gamechangerml/src/search/sent_transformer/model.py +++ b/gamechangerml/src/search/sent_transformer/model.py @@ -39,7 +39,7 @@ def __init__( transformer_path, model=None, use_gpu=False, - bert_tokenize=False, + bert_tokenize=False ): if model: @@ -149,7 +149,7 @@ def _index(self, corpus, index_path, overwrite=False, save_embedding=False): self.embedder.embeddings.index(embeddings) logger.info(f"Built the embeddings index") - def index_documents(self, corpus_path, index_path): + def index_documents(self, corpus_path, index_path, files_to_use=None): """ Create the index and accompanying dataframe to perform text and paragraph id search @@ -168,6 +168,7 @@ def index_documents(self, corpus_path, index_path): min_token_len=self.min_token_len, verbose=self.verbose, bert_based_tokenizer=self.bert_tokenizer, + files_to_use=files_to_use ) corpus = [(para_id, " ".join(tokens), None) for tokens, para_id in corp] diff --git a/gamechangerml/src/text_handling/corpus.py b/gamechangerml/src/text_handling/corpus.py index ba3d35da..37745358 100644 --- a/gamechangerml/src/text_handling/corpus.py +++ b/gamechangerml/src/text_handling/corpus.py @@ -17,13 +17,21 @@ def __init__( min_token_len=3, verbose=False, bert_based_tokenizer=None, + files_to_use=None ): self.directory = directory - self.file_list = [ - os.path.join(directory, file) - for file in os.listdir(directory) - if file[-5:] == ".json" - ] + if files_to_use: ## if we only want to do this on a subset + self.file_list = list(set([os.path.join(directory, i) for i in files_to_use]).intersection([ + os.path.join(directory, file) + for file in os.listdir(directory) + if file[-5:] == ".json" + ])) + else: + self.file_list = [ + os.path.join(directory, file) + for file in os.listdir(directory) + if file[-5:] == ".json" + ] self.file_list self.return_id = return_id self.min_token_len = min_token_len diff --git a/gamechangerml/src/utilities/es_utils.py b/gamechangerml/src/utilities/es_utils.py index 6fa0ef8a..5b026bbe 100644 --- a/gamechangerml/src/utilities/es_utils.py +++ b/gamechangerml/src/utilities/es_utils.py @@ -1,12 +1,7 @@ from elasticsearch import Elasticsearch -import json import requests -import re -import pandas as pd import os import logging -import time -from gamechangerml import MODEL_PATH, DATA_PATH import typing as t import base64 from urllib.parse import urljoin @@ -109,173 +104,3 @@ def get(self, url: str, **request_opts) -> requests.Response: def delete(self, url: str, **request_opts) -> requests.Response: return self.request(method="DELETE", url=url, **request_opts) - - -def connect_es(es_url): - """Connect to ES""" - - tries = 0 - while tries < 5: - try: - es = Elasticsearch([es_url]) - time.sleep(1) - print("ES connected\n") - break - except ConnectionError: - print("ES not connected, trying again\n") - tries += 1 - - return es - - -def get_es_responses_doc(es, query, docid): - """Query ES for a search string and a docid (from search results)""" - - true = True - false = False - - search = { - "_source": { - "includes": ["pagerank_r", "kw_doc_score_r", "orgs_rs", "topics_rs"] - }, - "stored_fields": [ - "filename", - "title", - "page_count", - "doc_type", - "doc_num", - "ref_list", - "id", - "summary_30", - "keyw_5", - "p_text", - "type", - "p_page", - "display_title_s", - "display_org_s", - "display_doc_type_s", - "is_revoked_b", - "access_timestamp_dt", - "publication_date_dt", - "crawler_used_s", - ], - "from": 0, - "size": 20, - "track_total_hits": true, - "query": { - "bool": { - "must": [ - {"match": {"id": docid}}, - { - "nested": { - "path": "paragraphs", - "inner_hits": { - "_source": false, - "stored_fields": [ - "paragraphs.page_num_i", - "paragraphs.filename", - "paragraphs.par_raw_text_t", - ], - "from": 0, - "size": 5, - "highlight": { - "fields": { - "paragraphs.filename.search": { - "number_of_fragments": 0 - }, - "paragraphs.par_raw_text_t": { - "fragment_size": 200, - "number_of_fragments": 1, - }, - }, - "fragmenter": "span", - }, - }, - "query": { - "bool": { - "should": [ - { - "wildcard": { - "paragraphs.filename.search": { - "value": query, - "boost": 15, - } - } - }, - { - "query_string": { - "query": query, - "default_field": "paragraphs.par_raw_text_t", - "default_operator": "AND", - "fuzzy_max_expansions": 100, - "fuzziness": "AUTO", - } - }, - ] - } - }, - } - }, - ], - "should": [ - { - "multi_match": { - "query": query, - "fields": [ - "keyw_5^2", - "id^2", - "summary_30", - "paragraphs.par_raw_text_t", - ], - "operator": "or", - } - }, - {"rank_feature": {"field": "pagerank_r", "boost": 0.5}}, - {"rank_feature": {"field": "kw_doc_score_r", "boost": 0.1}}, - ], - } - }, - } - - return es.search(index="gamechanger", body=search) - - -def get_paragraph_results(es, query, doc): - """Get list of paragraph texts for each search result""" - - docid = doc + ".pdf_0" - resp = get_es_responses_doc(es, query, docid) - - texts = [] - if resp["hits"]["total"]["value"] > 0: - hits = resp["hits"]["hits"][0]["inner_hits"]["paragraphs"]["hits"]["hits"] - for par in hits: - texts.append(par["fields"]["paragraphs.par_raw_text_t"]) - - return texts - - -def collect_results(relations, queries, collection, es, label): - """Query ES for search/doc matches and add them to query results with a label""" - - found = {} - not_found = {} - for i in relations.keys(): - query = queries[i] - for k in relations[i]: - doc = collection[k] - uid = str(i) + "_" + str(k) - try: - para = get_paragraph_results(es, query, doc)[0][0] - # truncate to 400 tokens - para = " ".join(para.split(" ")[:400]) - found[uid] = { - "query": query, - "doc": doc, - "paragraph": para, - "label": label, - } - except: - not_found[uid] = {"query": query, "doc": doc, "label": label} - - return found, not_found diff --git a/gamechangerml/src/utilities/test_utils.py b/gamechangerml/src/utilities/test_utils.py index 432107bf..a1f01d4d 100644 --- a/gamechangerml/src/utilities/test_utils.py +++ b/gamechangerml/src/utilities/test_utils.py @@ -8,12 +8,18 @@ from datetime import date, datetime import signal import torch +import random +import shutil from gamechangerml.api.utils.logger import logger from gamechangerml.configs.config import ValidationConfig MATAMO_DIR = ValidationConfig.DATA_ARGS['matamo_dir'] SEARCH_HIST = ValidationConfig.DATA_ARGS['search_hist_dir'] +MATAMO_TEST_FILE = "gamechangerml/data/test_data/MatamoFeedback_TEST.csv" +SEARCH_TEST_FILE = "gamechangerml/data/test_data/SearchPDFMapping_TEST.csv" + + # https://stackoverflow.com/questions/25027122/break-the-function-after-certain-time/25027182 class TimeoutException(Exception): # Custom exception class pass @@ -335,11 +341,17 @@ def concat_csvs(directory): pass return df -def concat_matamo(): - return concat_csvs(MATAMO_DIR) +def concat_matamo(testing_only=False): + if testing_only: + return pd.read_csv(MATAMO_TEST_FILE) + else: + return concat_csvs(MATAMO_DIR) -def concat_search_hist(): - return concat_csvs(SEARCH_HIST) +def concat_search_hist(testing_only=False): + if testing_only: + return pd.read_csv(SEARCH_TEST_FILE) + else: + return concat_csvs(SEARCH_HIST) def get_most_recent_dir(parent_dir): @@ -349,3 +361,45 @@ def get_most_recent_dir(parent_dir): else: logger.error("There are no subdirectories to retrieve most recent data from") return None + +def make_test_corpus( + corpus_dir, # main corpus dir + save_dir, # where to save the test corpus + percent_random, # float from 0-1 percentage of index to make from random docs + max_size=1000, # max size of the index (to save on time building) + include_ids=None, # if any IDs need to be in the test, pass as list + max_file_size=100000 # max size of random files to add to the test corpus + ): + '''Makes a small test corpus for checking validation''' + all_files = [f.split('.json')[0] + '.json' for f in os.listdir(corpus_dir) if os.path.isfile(os.path.join(corpus_dir, f))] + if percent_random > 1: + percent_random = percent_random / 100 + if include_ids: + logger.info(f"{str(len(include_ids))} ids required in test corpus") + include_ids = [f.split('.json')[0] + '.json' for f in include_ids] # make sure json at end of filenames + subset = list(set(all_files).intersection(include_ids)) # only get ids in the main corpus + if len(subset) < len(include_ids): + logger.info(f"Did not find all required ids in the main corpus dir.") + logger.info(f"Found {str(len(subset))} / {str(len(include_ids))} ids") + other = [i for i in all_files if i not in include_ids] + if percent_random > 0: + num_add = round(len(subset)/percent_random - len(subset)) + else: + num_add = 0 + else: + subset = [] + other = all_files + num_add = max_size + + ## add random docs + for i in range(num_add): + filesize = 1000000 + while filesize > max_file_size: # as we iterate, skip large files + random_index = random.randint(0,len(other)-1) + file = other[random_index] # pick a random file + filesize = check_file_size(file, corpus_dir) # if filesize is smaller than max, break loop + subset.append(file) + subset = list(set(subset)) # remove duplicates + + logger.info(f"Collected {str(len(subset))} jsons") + return subset diff --git a/gamechangerml/src/utilities/text_utils.py b/gamechangerml/src/utilities/text_utils.py index 5b0c39e5..260014d5 100644 --- a/gamechangerml/src/utilities/text_utils.py +++ b/gamechangerml/src/utilities/text_utils.py @@ -185,10 +185,13 @@ def normalize_query(s: str) -> str: Lower text and remove extra whitespace. """ def white_space_fix(text): - return ' '.join(text.split()) + return ' '.join(text.strip().lstrip().split()) def lower(text): return text.lower() - return white_space_fix(lower(s)) + def remove_quotes(text): + exclude = ["'", '"'] + return ''.join(ch for ch in text if ch not in exclude) + return white_space_fix(remove_quotes(lower(s))) def clean_query(query: str) -> str: '''Removes all non alphanumeric characters and 'and' / 'or' from query string''' diff --git a/gamechangerml/train/pipeline.py b/gamechangerml/train/pipeline.py index 1be955c1..bc7025ae 100644 --- a/gamechangerml/train/pipeline.py +++ b/gamechangerml/train/pipeline.py @@ -151,6 +151,7 @@ def run_pipeline(self, steps={}): def create_metadata( self, meta_steps, + testing_only:bool, corpus_dir: t.Union[str, os.PathLike] = CORPUS_DIR, index_path: t.Union[str, os.PathLike] = os.path.join( MODEL_PATH, "sent_index_20210715" @@ -159,7 +160,7 @@ def create_metadata( prod_data_file=PROD_DATA_FILE, n_returns: int=50, level: str='silver', - update_eval_data: bool=False, + update_eval_data: bool=True, retriever=None, upload: bool = True, version: str = "v1" @@ -190,7 +191,9 @@ def create_metadata( make_corpus_meta(corpus_dir, days, prod_data_file, upload) if "update_sent_data" in meta_steps: try: - make_training_data(index_path, n_returns, level, update_eval_data, retriever) + make_training_data( + index_path=index_path, level=level, update_eval_data=update_eval_data, testing_only=testing_only + ) except Exception as e: logger.warning(e, exc_info=True) if upload: @@ -212,8 +215,9 @@ def finetune_sent( epochs: int = 3, warmup_steps: int = 100, testing_only: bool = False, - remake_train_data: bool = False, + remake_train_data: bool = True, retriever = None, + model = None, version: str = "v1" ) -> t.Dict[str, str]: """finetune_sent: finetunes the sentence transformer - saves new model, @@ -228,13 +232,15 @@ def finetune_sent( """ try: - model_load_path = os.path.join( - LOCAL_TRANSFORMERS_DIR, EmbedderConfig.BASE_MODEL - ) - model_id = datetime.now().strftime("%Y%m%d") - model_save_path = model_load_path + "_" + model_id - logger.info( - f"Setting {str(model_save_path)} as save path for new model") + if not model: + model_load_path = os.path.join( + LOCAL_TRANSFORMERS_DIR, EmbedderConfig.BASE_MODEL + ) + else: + model_load_path = os.path.join( + LOCAL_TRANSFORMERS_DIR, model + ) + no_data=False base_dir = os.path.join(DATA_PATH, "training", "sent_transformer") @@ -256,13 +262,23 @@ def finetune_sent( if no_data: make_training_data( index_path=SENT_INDEX, - n_returns=50, level='silver', - update_eval_data=True, - retriever=retriever + update_eval_data=True, + testing_only=testing_only ) data_path = get_most_recent_dir(base_dir) + timestamp = str(data_path).split('/')[-1] + + ## set model save path + if testing_only: + model_save_path = model_load_path + '_TEST_' + timestamp + else: + model_id = datetime.now().strftime("%Y%m%d") + model_save_path = model_load_path + "_" + model_id + logger.info( + f"Setting {str(model_save_path)} as save path for new model") + logger.info(f"Loading in domain data to finetune from {data_path}") finetuner = STFinetuner( model_load_path=model_load_path, @@ -275,10 +291,31 @@ def finetune_sent( logger.info("Loaded finetuner class...") logger.info(f"Testing only is set to: {testing_only}") - return finetuner.retrain(data_path, testing_only, version) + # finetune + finetuner.retrain(data_path, testing_only, version) + + # eval finetuned model + logger.info("Done making finetuned model, runnin evals") + model_name = model_save_path.split('/')[-1] + train_meta = open_json("training_metadata.json", data_path) + validation_data = train_meta['validation_data_used'].split('/')[-1] + evals = eval_sent(model_name, validation_data, eval_type="domain") + + try: + for metric in evals: + if metric != "model_name": + mlflow.log_metric( + key=metric, value=evals[metric]) + except Exception as e: + logger.warning(e) + + return evals + except Exception as e: logger.warning("Could not finetune sentence model - pipeline") - logger.error(e, exc_info=True) + logger.error(e) + + return {} def evaluate( self, @@ -306,9 +343,14 @@ def evaluate( if "bert-base-cased-squad2" in model_name: results[eval_type] = eval_qa( model_name, sample_limit, eval_type) - elif "msmarco-distilbert-base-v2" in model_name: - results["original"] = eval_sent( - model_name, validation_data, eval_type="original" + elif "msmarco-distilbert" in model_name: + for e_type in ["domain", "original"]: + results[e_type] = eval_sent( + model_name, validation_data, e_type + ) + elif "multi-qa-MiniLM" in model_name: + results["domain"] = eval_sent( + model_name, validation_data, eval_type="domain" ) elif "sent_index" in model_name: results["domain"] = eval_sent(