1
+ import io
1
2
import logging
2
3
import re
3
4
@@ -57,7 +58,26 @@ def __init__(
57
58
)
58
59
self ._spell = spellchecker .SpellChecker ()
59
60
60
- def _grammar_check (self , text ) -> tuple [float , list [str ]]:
61
def re_encode(self, text: str) -> str:
    """
    Round-trip the text through UTF-8 to clean it up before analysis.

    Texts need re-encoding because they come from different sources and
    might carry different encodings, such as raw MongoDB data.

    The text is encoded as UTF-8 and decoded again as ``utf-8-sig``, which
    has three visible effects on the result:

    * a leading byte-order mark (U+FEFF) is removed;
    * ``\\r\\n`` and ``\\r`` line endings become ``\\n`` (the text wrapper
      reads in universal-newlines mode by default);
    * leading and trailing whitespace is stripped.

    Issue: https://github.com/googleapis/google-cloud-python/issues/11381

    Parameters:
        text (str): The text to re-encode.

    Returns:
        str: The re-encoded text.

    Raises:
        UnicodeEncodeError: If ``text`` contains characters that cannot be
            encoded as UTF-8 (for example lone surrogates).
    """
    raw_stream = io.BytesIO(text.encode("utf-8"))
    # Closing the wrapper also closes the underlying BytesIO, so both
    # in-memory streams are released as soon as the text has been read.
    with io.TextIOWrapper(raw_stream, encoding="utf-8-sig") as reader:
        return reader.read().strip()
79
+
80
+ def _grammar_check (self , text : str ) -> tuple [float , list [str ]]:
61
81
"""
62
82
Perform a grammar check on the provided text using Google's NL API.
63
83
@@ -76,11 +96,15 @@ def _grammar_check(self, text) -> tuple[float, list[str]]:
76
96
- float: The grammar score (ranging from 0 to 1).
77
97
- list[str]: A list of identified grammar issue lemmas.
78
98
"""
99
+ text = self .re_encode (text )
79
100
document = language_v1 .Document ( # type: ignore
80
101
content = text ,
81
102
type_ = language_v1 .Document .Type .PLAIN_TEXT , # type: ignore
82
103
)
83
- response = self ._lang_client .analyze_syntax (document = document )
104
+ response = self ._lang_client .analyze_syntax (
105
+ document = document ,
106
+ encoding_type = language_v1 .EncodingType .UTF8 , # type: ignore
107
+ )
84
108
grammar_issues = []
85
109
for token in response .tokens :
86
110
if token .part_of_speech .tag in (
0 commit comments