Skip to content

Commit 26ede33

Browse files
authored
Merge pull request #81 from DAAily/SSD-751-Sparky-Exception-deserializing-response!---On-the-Google-Cloud-Language-API
SSD-751 Sparky: "Exception deserializing response!" on the Google Cloud Language API
2 parents 5200b7c + c1bc442 commit 26ede33

File tree

2 files changed

+26
-3
lines changed

2 files changed

+26
-3
lines changed

daaily/score/client.py

Lines changed: 26 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import io
12
import logging
23
import re
34

@@ -57,7 +58,26 @@ def __init__(
5758
)
5859
self._spell = spellchecker.SpellChecker()
5960

60-
def _grammar_check(self, text) -> tuple[float, list[str]]:
61+
def re_encode(self, text: str) -> str:
    """
    Normalize text before it is sent to the Google NL API.

    The text is round-tripped through UTF-8 and decoded as "utf-8-sig", which
    drops a leading byte order mark (BOM, U+FEFF). Reading it through a text
    stream also translates "\\r\\n" and "\\r" line endings to "\\n". Finally,
    surrounding whitespace is stripped.

    Texts need this clean-up because they come from different sources, such as
    raw MongoDB data, and might carry encoding artifacts.

    Issue: https://github.com/googleapis/google-cloud-python/issues/11381

    Parameters:
        text (str): The text to normalize.

    Returns:
        str: The text without leading BOM characters, with "\\n" line endings
            and without surrounding whitespace.

    Raises:
        UnicodeEncodeError: If the text cannot be encoded as UTF-8 (for
            example, when it contains lone surrogate code points).
    """
    encoded = text.encode("utf-8")
    # The context manager closes the wrapper and the underlying buffer.
    with io.TextIOWrapper(io.BytesIO(encoded), encoding="utf-8-sig") as reader:
        cleaned = reader.read().strip()
    # "utf-8-sig" only removes a BOM that is the very first character, and
    # str.strip() does not treat U+FEFF as whitespace. Drop any BOM that is
    # still leading (e.g. one preceded by whitespace, or a doubled BOM).
    while cleaned.startswith("\ufeff"):
        cleaned = cleaned[1:].lstrip()
    return cleaned
79+
80+
def _grammar_check(self, text: str) -> tuple[float, list[str]]:
6181
"""
6282
Perform a grammar check on the provided text using Google's NL API.
6383
@@ -76,11 +96,15 @@ def _grammar_check(self, text) -> tuple[float, list[str]]:
7696
- float: The grammar score (ranging from 0 to 1).
7797
- list[str]: A list of identified grammar issue lemmas.
7898
"""
99+
text = self.re_encode(text)
79100
document = language_v1.Document( # type: ignore
80101
content=text,
81102
type_=language_v1.Document.Type.PLAIN_TEXT, # type: ignore
82103
)
83-
response = self._lang_client.analyze_syntax(document=document)
104+
response = self._lang_client.analyze_syntax(
105+
document=document,
106+
encoding_type=language_v1.EncodingType.UTF8, # type: ignore
107+
)
84108
grammar_issues = []
85109
for token in response.tokens:
86110
if token.part_of_speech.tag in (

samples/score/get_product_scores.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,6 @@
2020
# Iterate over the results
2121
for i, p in enumerate(products):
2222
print(f"{i} Product: {p['product_id']}")
23-
p["name_en"] = p.get("name_en", "").replace("\n", "")
2423
score_results = score.score(p)
2524
for sr in score_results.score_results:
2625
print(f"Field Name: {sr.field_name}")

0 commit comments

Comments
 (0)