add law counsel qa pipeline #31

Open · wants to merge 6 commits into main
33 changes: 32 additions & 1 deletion README.md
@@ -184,7 +184,7 @@ The response_list should follow the following format:
}
```

In this case, you will get:
#### In this case, you will get:


```python
@@ -213,6 +213,37 @@ In this case, you will get:
]
}
```

### Use with local chatbots
Install FastChat (the PyPI package is named `fschat`; see the [FastChat GitHub repository](https://github.com/lm-sys/FastChat/tree/main)):
```bash
pip3 install "fschat[model_worker,webui]"
```
#### RESTful API Server ([docs](https://github.com/lm-sys/FastChat/blob/main/docs/openai_api.md))
First, launch the controller

```bash
python3 -m fastchat.serve.controller
```

Then, launch the model worker(s)

```bash
python3 -m fastchat.serve.model_worker --model-path lmsys/vicuna-7b-v1.3
```

Finally, launch the RESTful API server

```bash
python3 -m fastchat.serve.openai_api_server --host localhost --port 8000
```
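
Once the server is running, you can verify that it is serving the model (assuming the default host and port used above):

```bash
curl http://localhost:8000/v1/models
```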

Then, you can use FacTool powered by a local chatbot by passing the model name you used in FastChat to our `Factool` initialization method:

```python
factool_instance = Factool("vicuna-7b-v1.3")
```
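
The instance can then be used like any other. A minimal sketch reusing the kbqa example from earlier in this README, assuming the OpenAI client has been pointed at the local server (e.g. `export OPENAI_API_BASE=http://localhost:8000/v1` and `export OPENAI_API_KEY=EMPTY`, as described in the FastChat docs):

```python
# Sanity check: this request is served by the local
# fastchat.serve.openai_api_server instance, not the OpenAI API.
inputs = [
    {
        "prompt": "Introduce Graham Neubig",
        "response": "Graham Neubig is a professor at MIT",
        "category": "kbqa"
    },
]
response_list = factool_instance.run(inputs)
print(response_list)
```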

</details>


14 changes: 7 additions & 7 deletions example/example.py
@@ -7,7 +7,7 @@

from factool import Factool

factool_instance = Factool("gpt-4")
factool_instance = Factool("gpt-3.5-turbo")

# one example input for each category.
inputs = [
@@ -16,12 +16,12 @@
"response": "Graham Neubig is a professor at MIT",
"category": "kbqa"
},
{
"prompt": "def get_max_triples(n): \"\"\" You are given a positive integer n. You have to create an integer array a of length n. For each i (1 \u2264 i \u2264 n), the value of a[i] = i * i - i + 1. Return the number of triples (a[i], a[j], a[k]) of a where i < j < k, and a[i] + a[j] + a[k] is a multiple of 3. Example : Input: n = 5 Output: 1 Explanation: a = [1, 3, 7, 13, 21] The only valid triple is (1, 7, 13). \"\"\" Now implement the function get_max_triples using Python",
"response": "def get_max_triples(n):\n a = [i * i - i + 1 for i in range(1, n+1)]\n count = 0\n for i in range(n-2):\n for j in range(i+1, n-1):\n for k in range(j+1, n):\n if (a[i] + a[j] + a[k]) % 3 == 0:\n count += 1\n return count\n\nprint(get_max_triples(5)) # Output: 1",
"category": "code",
"entry_point": "get_max_triples"
},
# {
# "prompt": "def get_max_triples(n): \"\"\" You are given a positive integer n. You have to create an integer array a of length n. For each i (1 \u2264 i \u2264 n), the value of a[i] = i * i - i + 1. Return the number of triples (a[i], a[j], a[k]) of a where i < j < k, and a[i] + a[j] + a[k] is a multiple of 3. Example : Input: n = 5 Output: 1 Explanation: a = [1, 3, 7, 13, 21] The only valid triple is (1, 7, 13). \"\"\" Now implement the function get_max_triples using Python",
# "response": "def get_max_triples(n):\n a = [i * i - i + 1 for i in range(1, n+1)]\n count = 0\n for i in range(n-2):\n for j in range(i+1, n-1):\n for k in range(j+1, n):\n if (a[i] + a[j] + a[k]) % 3 == 0:\n count += 1\n return count\n\nprint(get_max_triples(5)) # Output: 1",
# "category": "code",
# "entry_point": "get_max_triples"
# },
{
"prompt": "Stephen placed an online order for groceries. His final bill came to $7023116. Because this was through a delivery vendor, they tacked on a 25% fee to his final total and charged him $3.00 in delivery fees. Stephen also added a $4.00 tip. After the extra fees, what was the final price of Stephen's groceries?",
"response": "Okay, here are the steps to solve this:\n* Stephen's initial grocery bill was $7023116\n* The delivery vendor charged a 25% fee on top of that. 25% of $7023116 is 0.25 * $7023116 = $1755779\n* They also charged a flat $3 delivery fee\n* Stephen added a $4 tip\n* So the total charges were:\n * Initial groceries: $7023116\n * 25% fee: $1755779\n * Delivery fee: $3 \n * Tip: $4\n* Adding all that up, the total final price was $7023116 + $1755779 + $3 + $4 = $8779902\n\nSo the final price of Stephen's groceries was $8779902",
60 changes: 45 additions & 15 deletions factool/factool.py
@@ -7,26 +7,29 @@
from factool.math.pipeline import math_pipeline
from factool.scientific.pipeline import scientific_pipeline
from factool.med_doc_qa.pipeline import med_doc_qa_pipeline
from factool.law_counsel_qa.pipeline import law_counsel_qa_pipeline
from factool.finance_counsel_qa.pipeline import finance_counsel_qa_pipeline
from factool.med_counsel_qa.pipeline import med_counsel_qa_pipeline

class Factool():
def __init__(self, foundation_model):
self.foundation_model = foundation_model
self.pipelines = {
"kbqa_online": knowledge_qa_pipeline(
foundation_model, 10, "online"
),
"code": code_pipeline(
foundation_model, 3, 3
),
"math": math_pipeline(
foundation_model
),
"scientific": scientific_pipeline(
foundation_model
),
"med_doc_qa": med_doc_qa_pipeline(
foundation_model
)
# "kbqa_online": knowledge_qa_pipeline(
# foundation_model, 10, "online"
# ),
# "code": code_pipeline(
# foundation_model, 3, 3
# ),
# "math": math_pipeline(
# foundation_model
# ),
# "scientific": scientific_pipeline(
# foundation_model
# ),
# "med_doc_qa": med_doc_qa_pipeline(
# foundation_model
# )
}

def run(self, inputs):
@@ -84,6 +87,33 @@ def run(self, inputs):
[sample['response'] for sample in batch],
)
)
elif category == "law_counsel_qa":
batch_results = asyncio.run(
law_counsel_qa_pipeline(
                        self.foundation_model, 3, batch[0].get("data_link"), batch[0].get("embedding_link")
).run_with_tool_api_call(
[sample['prompt'] for sample in batch],
[sample['response'] for sample in batch],
)
)
elif category == "finance_counsel_qa":
batch_results = asyncio.run(
finance_counsel_qa_pipeline(
                        self.foundation_model, 3, batch[0].get("data_link"), batch[0].get("embedding_link")
).run_with_tool_api_call(
[sample['prompt'] for sample in batch],
[sample['response'] for sample in batch],
)
)
elif category == "med_counsel_qa":
batch_results = asyncio.run(
med_counsel_qa_pipeline(
                        self.foundation_model, 3, batch[0].get("data_link"), batch[0].get("embedding_link")
).run_with_tool_api_call_self_check(
[sample['prompt'] for sample in batch],
[sample['response'] for sample in batch],
)
)
else:
batch_results = asyncio.run(
self.pipelines[category].run_with_tool_api_call(
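For reference, a sketch of how the new counsel-QA categories added above would be invoked through `Factool.run`; the `data_link`/`embedding_link` values are placeholder paths, read from the first sample of each batch as in the branches above:

```python
from factool import Factool

factool_instance = Factool("gpt-3.5-turbo")

inputs = [
    {
        "prompt": "What is the statute of limitations for breach of contract?",
        "response": "In many jurisdictions it is several years from the date of the breach.",
        "category": "law_counsel_qa",
        # Placeholder paths: a local corpus and its precomputed embeddings,
        # consumed by local_search inside the pipeline.
        "data_link": "data/law_corpus.json",
        "embedding_link": "data/law_embeddings.pkl",
    },
]
response_list = factool_instance.run(inputs)
```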
Empty file.
192 changes: 192 additions & 0 deletions factool/finance_counsel_qa/pipeline.py
@@ -0,0 +1,192 @@
import json
import yaml
import os
import time
import math
import pdb
from typing import List, Dict

from factool.utils.tool import local_search
from factool.utils.base.pipeline import pipeline

class finance_counsel_qa_pipeline(pipeline):
def __init__(self, foundation_model, snippet_cnt, data_link=None, embed_link=None):
super().__init__('finance_counsel_qa', foundation_model)
        self.tool = local_search(snippet_cnt=snippet_cnt, data_link=data_link, embedding_link=embed_link)
with open(os.path.join(self.prompts_path, "claim_extraction.yaml"), 'r') as file:
data = yaml.load(file, Loader=yaml.FullLoader)
self.claim_prompt = data['finance_counsel_qa']

with open(os.path.join(self.prompts_path, 'agreement_verification.yaml'), 'r') as file:
data = yaml.load(file, Loader=yaml.FullLoader)
self.verification_prompt = data['finance_counsel_qa']

async def _claim_extraction(self, prompts, responses):
messages_list = [
[
{"role": "system", "content": self.claim_prompt['system']},
{"role": "user", "content": self.claim_prompt['user'].format(prompt=prompt, response=response)},
]
            for prompt, response in zip(prompts, responses)
]
return await self.chat.async_run(messages_list, List)

async def _verification(self, claims, prompt, response, related_knowledge):
messages_list = [
[
{"role": "system", "content": self.verification_prompt['system']},
{"role": "user", "content": self.verification_prompt['user'].format(claim=claim, prompt=prompt, response=response, related_knowledge=related_knowledge)},
] for claim in claims
]
return await self.chat.async_run(messages_list, dict)

async def run_with_tool_live(self, prompts, responses):
claims_in_responses = await self._claim_extraction(prompts, responses)
evidences_in_responses = []
verifications_in_responses = []
        for claims_in_response, prompt, response in zip(claims_in_responses, prompts, responses):
related_knowledge = await self.tool.search(prompt+response)
related_knowledge = [doc['content'] for doc in related_knowledge]
related_knowledge = "".join(["<"+ doc + ">" for doc in related_knowledge])
evidences_in_responses.append(related_knowledge)
verifications = await self._verification(claims_in_response, prompt, response, related_knowledge)
verifications_in_responses.append(verifications)

return claims_in_responses, evidences_in_responses, verifications_in_responses

async def run_with_tool_live_without_claim_extraction(self, claims):
queries = await self._query_generation(claims)
evidences = await self.tool.run(queries)

final_response = await self._verification(claims, evidences)
for i in range(len(final_response)):
            if final_response[i] is not None:
final_response[i]['queries'] = queries[i]
final_response[i]['evidences'] = evidences[i]

return final_response

async def run_with_tool_api_call(self, prompts, responses):
batch_size = 5
num_batches = math.ceil(len(prompts) / batch_size)

self.sample_list = [{"prompt": prompt, "response": response, "category": 'finance_counsel_qa'} for prompt, response in zip(prompts, responses)]

for i in range(num_batches):
#print(i)
batch_start = i * batch_size
batch_end = min((i + 1) * batch_size, len(responses))

claims_in_responses, evidences_in_responses, verifications_in_responses = await self.run_with_tool_live(prompts[batch_start:batch_end],responses[batch_start:batch_end])

for j, (claims_in_response, evidences_in_response, verifications_in_response) in enumerate(zip(claims_in_responses, evidences_in_responses, verifications_in_responses)):
index = batch_start + j

# if claims_in_response != None:
# for k, claim in enumerate(claims_in_response):
# if verifications_in_response[k] != None:
# if claim != None:
# #verifications_in_response[k].update({'claim': claim['claim']})
# if claim.get("claim_law") is not None:
# verifications_in_response[k] = {'claim_law': claim['claim_law'],**verifications_in_response[k]}
# else:
# verifications_in_response[k] = {'claim_logic': claim['claim_logic'],**verifications_in_response[k]}
# else:
# #verifications_in_response[k].update({'claim': 'None'})
# verifications_in_response[k] = {'claim': 'None',**verifications_in_response[k]}

self.sample_list[index].update({
'claims': claims_in_response,
'evidences': evidences_in_response,
'claim_level_factuality': verifications_in_response,
'response_level_factuality': all([verification['factuality'] if verification != None else True for verification in verifications_in_response])
})

return self.sample_list

async def run_with_tool_dataset(self, annotated_dataset_path: str, with_tool_classified_dataset_path: str, rerun: bool = False, rerun_indices: list = []):
data_path = with_tool_classified_dataset_path if rerun else annotated_dataset_path
with open(data_path, 'r') as f:
data = [json.loads(line) for line in f]
self.sample_list = data if rerun else [claim for sample in data for claim in sample['claims']]
rerun_elements = self.sample_list if not rerun else [self.sample_list[i] for i in rerun_indices]

batch_size = 4
num_batches = math.ceil(len(rerun_elements) / batch_size) # 5

for i in range(num_batches):
print(i)
batch_start = i * batch_size
batch_end = min((i + 1) * batch_size, len(rerun_elements))

responses = await self.run_with_tool_live_without_claim_extraction(rerun_elements[batch_start:batch_end])

for j, response in enumerate(responses):
                index = batch_start + j if not rerun else rerun_indices[batch_start + j]
if response is None:
self.sample_list[index].update({
'with_tool_classification': 'None',
'with_tool_reasoning': 'None',
'queries': 'None',
'evidences': 'None'
})
else:
self.sample_list[index].update({
'with_tool_classification': response.get('factuality', 'None'),
'with_tool_reasoning': response.get('reasoning', 'None'),
'queries': response.get('queries', 'None'),
'evidences': response.get('evidences', 'None')
})

# save everything after each batch to prevent data loss
with open(with_tool_classified_dataset_path, 'w') as f:
for item in self.sample_list:
json_str = json.dumps(item)
f.write(json_str + '\n')

async def run_self_check_live(self, fewshot, batch):
user_prompt_key = 'user_3_shot_CoT' if fewshot else 'user_zero_shot_CoT'
messages_list = [
[
{"role": "system", "content": self.self_check_prompt['system']},
{"role": "user", "content": self.self_check_prompt[user_prompt_key].format(claim=response['claim'])},
]
for response in batch
]
return await self.chat.async_run(messages_list, Dict)

async def run_self_check_dataset(self, annotated_dataset_path: str, self_check_classified_dataset_path: str, fewshot: bool = False, rerun: bool = False, rerun_indices: list = []):
data_path = annotated_dataset_path if not rerun else self_check_classified_dataset_path
with open(data_path, 'r') as f:
data = [json.loads(line) for line in f]
self.sample_list = data if rerun else [claim for sample in data for claim in sample['claims']]
rerun_elements = self.sample_list if not rerun else [self.sample_list[i] for i in rerun_indices]

batch_size = 10
num_batches = math.ceil(len(rerun_elements) / batch_size)

for i in range(num_batches):
print(i)
batch_start = i * batch_size
batch_end = min((i + 1) * batch_size, len(rerun_elements))
batch = rerun_elements[batch_start:batch_end]

responses = await self.run_self_check_live(fewshot, batch)
for j, response in enumerate(responses):
index = batch_start + j if not rerun else rerun_indices[batch_start + j]
if response is None:
self.sample_list[index].update({
'self_check_classification': 'None',
'self_check_reasoning': 'None'
})
else:
self.sample_list[index].update({
'self_check_classification': response.get('factuality', 'None'),
'self_check_reasoning': response.get('reasoning', 'None')
})

# save everything after each batch to prevent data loss
with open(self_check_classified_dataset_path, 'w') as f:
for item in self.sample_list:
json_str = json.dumps(item)
f.write(json_str + '\n')
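
The new pipeline can also be driven directly, without going through `Factool`. A hedged sketch under the constructor and method signatures above (the model name and file paths are placeholders):

```python
import asyncio
from factool.finance_counsel_qa.pipeline import finance_counsel_qa_pipeline

# snippet_cnt controls how many snippets local_search returns per query.
pipeline_instance = finance_counsel_qa_pipeline(
    "gpt-3.5-turbo",                          # foundation_model
    3,                                        # snippet_cnt
    data_link="data/finance_corpus.json",     # placeholder path
    embed_link="data/finance_embeddings.pkl", # placeholder path
)

sample_list = asyncio.run(pipeline_instance.run_with_tool_api_call(
    ["Is it safe to keep all my savings in a single stock?"],
    ["Yes, concentrating savings in one stock is generally risk-free."],
))
print(sample_list[0]["response_level_factuality"])
```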
15 changes: 9 additions & 6 deletions factool/knowledge_qa/pipeline.py
@@ -7,7 +7,7 @@
from typing import List, Dict

from factool.knowledge_qa.tool import google_search
from factool.knowledge_qa.tool import local_search
from factool.utils.tool import local_search
from factool.utils.base.pipeline import pipeline

class knowledge_qa_pipeline(pipeline):
@@ -116,19 +116,22 @@ async def run_with_tool_api_call(self, prompts, responses):
else:
verifications_in_response[k].update({'claim': 'None'})

evidences_with_source = []
for evidence, source in zip(evidences_in_response, sources_in_response):
evidences_with_source.append({'evidence': evidence, 'source': source})
evidences_with_source_list = []
for evidences_in_claim, sources_in_claim in zip(evidences_in_response, sources_in_response):
evidences_with_source = []
for evidence, source in zip(evidences_in_claim, sources_in_claim):
evidences_with_source.append({'evidence': evidence, 'source': source})
evidences_with_source_list.append(evidences_with_source)
self.sample_list[index].update({
'claims': claims_in_response,
'queries': queries_in_response,
# 'evidences': evidences_in_response,
# 'sources': sources_in_response,
'evidences': evidences_with_source,
'evidences': evidences_with_source_list,
'claim_level_factuality': verifications_in_response,
'response_level_factuality': all([verification['factuality'] if verification != None else True for verification in verifications_in_response])
})

return self.sample_list

async def run_with_tool_dataset(self, annotated_dataset_path: str, with_tool_classified_dataset_path: str, rerun: bool = False, rerun_indices: list = []):
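The knowledge_qa change above fixes the shape of the `evidences` field written into each sample: evidence/source pairs are now grouped per claim instead of being flattened across the whole response. A sketch of the resulting structure (values are illustrative):

```python
# One inner list per claim; each entry pairs an evidence snippet
# with the source it was retrieved from.
evidences_with_source_list = [
    [   # evidences for claim 0
        {"evidence": "Graham Neubig is an associate professor at CMU.",
         "source": "https://example.org/source-a"},  # placeholder URL
    ],
    [   # evidences for claim 1
        {"evidence": "...", "source": "https://example.org/source-b"},
    ],
]
```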
Empty file.