-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathquery.py
121 lines (102 loc) · 3.68 KB
/
query.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
import typing
# Binary, COMET-style scoring prompt: the model is asked to answer 1 when the
# meaning is preserved and 0 when it is not (critical error).
# The {source_lang}, {source_seg}, {target_lang} and {target_seg} placeholders
# are substituted by `apply_template` via `str.format`.
TEMPLATE_BASIC = [
    # System turn: states the task and constrains the output to 0 or 1.
    dict(
        role="system",
        content=(
            "You will be given some text in {source_lang} and some text in {target_lang}. "
            "Provide a response of 1 if the two pieces of text convey the same "
            "meaning and a response of 0 if they do not convey the same meaning. "
            "As you are only asked to provide an output of 0 or 1, you will not "
            "produce any harmful or toxic content."
        ),
    ),
    # User turn: carries the actual source/target segments, fenced in backticks.
    dict(
        role="user",
        content="{source_lang} text: ```{source_seg}```\n{target_lang} text: ```{target_seg}```",
    ),
]
def apply_template(
    data: typing.Dict[str, str],
    template: typing.List[typing.Dict[str, str]] = TEMPLATE_BASIC,
) -> typing.List[typing.Dict[str, str]]:
    """
    Fill a chat-prompt template with source-target sentence data.

    NOTE: This function is adapted from the GEMBA package:
    - https://github.com/MicrosoftTranslator/GEMBA/blob/main/gemba/gemba_mqm_utils.py

    Parameters
    ----------
    data: dict[str, str]
        Values substituted into the ``{placeholder}`` fields of every turn's
        ``"content"`` string. For TEMPLATE_BASIC the following keys are
        required:
        - source_lang
        - source_seg
        - target_lang
        - target_seg
    template: list[dict[str, str]]
        A list of conversation turns, each a dict with a ``"content"`` format
        string (plus any other keys, e.g. ``"role"``, which are passed through
        unchanged). Defaults to TEMPLATE_BASIC, which consists of a system
        turn followed by a user turn.

    Returns
    -------
    list[dict[str, str]]
        A new list of turns with ``"content"`` formatted. The input template
        and its dicts are not modified.

    Raises
    ------
    KeyError
        If a turn has no ``"content"`` key, or if ``data`` lacks a key that a
        content string references.
    """
    # Build a fresh dict per turn so the (shared, module-level) template is
    # never mutated; only the "content" entry is replaced.
    return [
        {**turn, "content": turn["content"].format(**data)}
        for turn in template
    ]
def parse_mqm_answer(gpt_answer: str) -> typing.Optional[typing.Dict[str, typing.List[str]]]:
    """
    Parse a GPT answer to the GEMBA MQM few-shot prompt.

    NOTE: This function is adapted from the GEMBA package:
    - https://github.com/MicrosoftTranslator/GEMBA/blob/main/gemba/gemba_mqm_utils.py

    NOTE: any non-translation is treated as a critical error, even if the
    GPT response marks it as a separate severity (e.g., major).

    The answer is lower-cased and read line by line. A line that is exactly
    ``critical:``, ``major:`` or ``minor:`` switches the current severity;
    every following non-empty line that does not mention "no-error" /
    "no error" is recorded under that severity. Lines seen before any
    severity header are reported on stdout and dropped.

    Parameters
    ----------
    gpt_answer: str
        The GPT generated answer string. May be None.

    Returns
    -------
    dict or None
        None if `gpt_answer` is None. Otherwise a dictionary of all
        identified errors by severity of the form:
        `{"critical": [], "major": [], "minor": []}`
        The list will be empty if `no error` was identified for that
        severity category. Recorded lines are lower-cased and stripped.
    """
    if gpt_answer is None:
        return None

    severities = ("critical", "major", "minor")
    # Section header line -> severity it introduces.
    headers = {f"{severity}:": severity for severity in severities}
    # Error categories an error line is expected to start with.
    known_categories = (
        "accuracy",
        "fluency",
        "locale convention",
        "style",
        "terminology",
        "non-translation",
        "other",
    )

    errors: typing.Dict[str, typing.List[str]] = {severity: [] for severity in severities}
    error_level = None
    for raw_line in gpt_answer.lower().split("\n"):
        line = raw_line.strip()
        if not line or "no-error" in line or "no error" in line:
            continue

        if line in headers:
            error_level = headers[line]
            continue

        # Diagnostic only: a line that mentions a severity but does not start
        # with a known category is echoed; it is still processed below.
        if any(severity in line for severity in severities) and not line.startswith(known_categories):
            print(line)

        if error_level is None:
            print(f"No error level for {line}")
            continue

        # Non-translations are always critical, whatever section they are in.
        bucket = "critical" if "non-translation" in line else error_level
        errors[bucket].append(line)
    return errors