# test_clients.py
"""Custom Locust clients for chat-completion load testing.

Each client wraps a provider SDK (OpenAI, OctoAI, Amazon Bedrock), times the
request, and reports the result through Locust's ``events.request`` hook.
"""

import json
import time

import boto3
from locust import events
from octoai.chat import TextModel
from octoai.client import Client
from openai import OpenAI


class OpenAIClient:
    """Wraps the OpenAI chat completions API and reports timings to Locust."""

    def __init__(self, model: str = "gpt-3.5-turbo"):
        self.openaiclient = OpenAI()
        self.model = model

    def chat_completion(self, msg):
        request_meta = {
            "request_type": "OpenAI chat completion",
            "name": self.model,
            "start_time": time.time(),
            "response_length": 0,
            "response": None,
            "context": {},
            "exception": None,
        }
        start_perf_counter = time.perf_counter()
        try:
            response = self.openaiclient.chat.completions.create(
                model=self.model, messages=msg, max_tokens=512
            )
            request_meta["response"] = response.choices[0].message
            # Report generated tokens as the response length.
            request_meta["response_length"] = response.usage.completion_tokens
        except Exception as e:
            request_meta["exception"] = e
        request_meta["response_time"] = (time.perf_counter() - start_perf_counter) * 1000
        events.request.fire(**request_meta)


class OctoAIClient:
    """Wraps the OctoAI chat completions API and reports timings to Locust."""

    def __init__(self, model: TextModel = TextModel.LLAMA_2_70B_CHAT_FP16):
        self.octoaiclient = Client()
        self.model = model

    def chat_completion(self, msg):
        request_meta = {
            "request_type": "OctoAI chat completion",
            "name": self.model.name,
            "start_time": time.time(),
            "response_length": 0,
            "response": None,
            "context": {},
            "exception": None,
        }
        start_perf_counter = time.perf_counter()
        try:
            response = self.octoaiclient.chat.completions.create(
                model=self.model, messages=msg, max_tokens=512
            )
            request_meta["response"] = response.choices[0].message
            # Report generated tokens (not total tokens) as the response length.
            request_meta["response_length"] = response.usage.completion_tokens
        except Exception as e:
            request_meta["exception"] = e
        request_meta["response_time"] = (time.perf_counter() - start_perf_counter) * 1000
        events.request.fire(**request_meta)


class BedrockClient:
    """Wraps the Amazon Bedrock runtime API and reports timings to Locust."""

    def __init__(self, model: str = "meta.llama2-70b-chat-v1"):
        self.bedrockclient = boto3.client(service_name="bedrock-runtime")
        self.model = model

    def chat_completion(self, msg):
        request_meta = {
            "request_type": "Amazon Bedrock chat completion",
            "name": self.model,
            "start_time": time.time(),
            "response_length": 0,
            "response": None,
            "context": {},
            "exception": None,
        }
        start_perf_counter = time.perf_counter()
        try:
            # Bedrock's Llama 2 models take a plain prompt string rather than a
            # list of chat messages.
            body = json.dumps(
                {
                    "prompt": msg,
                    "max_gen_len": 512,
                    # "temperature": 0.7,
                    # "top_p": 0.9,
                }
            )
            response = self.bedrockclient.invoke_model(
                body=body,
                modelId=self.model,
                accept="application/json",
                contentType="application/json",
            )
            response_body = json.loads(response.get("body").read())
            request_meta["response"] = response_body["generation"]
            # Report generated tokens as the response length.
            request_meta["response_length"] = response_body["generation_token_count"]
        except Exception as e:
            request_meta["exception"] = e
        request_meta["response_time"] = (time.perf_counter() - start_perf_counter) * 1000
        events.request.fire(**request_meta)
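

# --- Usage sketch -----------------------------------------------------------
# A minimal, illustrative example of driving one of these clients from a
# Locust User. Because each client fires ``events.request`` itself, the task
# body only has to build a message and call ``chat_completion()``. The User
# class name, wait_time, and prompt below are assumptions for demonstration
# and are not part of the original file.
from locust import User, between, task


class OpenAIChatUser(User):
    wait_time = between(1, 3)

    def on_start(self):
        # Each simulated user gets its own client instance.
        self.chat_client = OpenAIClient()

    @task
    def chat(self):
        # OpenAI/OctoAI expect a list of chat messages; BedrockClient takes a
        # plain prompt string instead.
        msg = [{"role": "user", "content": "Say hello in one short sentence."}]
        self.chat_client.chat_completion(msg)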