-
Notifications
You must be signed in to change notification settings - Fork 128
Expand file tree
/
Copy pathpp_api.py
More file actions
131 lines (106 loc) · 4.84 KB
/
pp_api.py
File metadata and controls
131 lines (106 loc) · 4.84 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for
# license information.
###############################################################################
import os

# Native extension module; aliased as _C per the convention for C-backed
# implementation modules.
from . import _extensions_pydll as _C

# "delete_object" is only exported when the package was built with the
# pre-processing C API; probe for it to fail fast with a helpful message.
if not hasattr(_C, "delete_object"):
    raise ImportError(
        "onnxruntime_extensions is not built with pre-processing C API\n"
        "To enable it, please build the package with --ortx-user-option=pp_api")

# Re-export the native entry points at module level.
create_processor = _C.create_processor
load_images = _C.load_images
image_pre_process = _C.image_pre_process
tensor_result_get_at = _C.tensor_result_get_at

create_tokenizer = _C.create_tokenizer
create_tokenizer_with_options = _C.create_tokenizer_with_options
update_tokenizer_options = _C.update_tokenizer_options
batch_tokenize = _C.batch_tokenize
batch_detokenize = _C.batch_detokenize
_apply_chat_template = _C.apply_chat_template

delete_object = _C.delete_object
class Tokenizer:
    """Python wrapper around the native tokenizer handle from the C API.

    ``tokenizer_dir`` may be a local directory containing ``tokenizer.json``
    and ``tokenizer_config.json``, or a Hugging Face model id, in which case
    the required files are downloaded via ``transformers`` (if installed).
    """

    @staticmethod
    def _normalize_options(options):
        # The C API expects string values; booleans become "true"/"false".
        return {
            k: str(v).lower() if isinstance(v, bool) else str(v)
            for k, v in options.items()
        }

    @staticmethod
    def _resolve_hf_dir(tokenizer_dir):
        # Resolve a Hugging Face model id to the local cache directory that
        # holds the downloaded tokenizer files.
        try:
            from transformers.utils import cached_file
        except ImportError:
            raise ValueError(
                f"Directory '{tokenizer_dir}' not found and transformers is not available"
            )

        # Required files
        resolved_full_file = cached_file(tokenizer_dir, "tokenizer.json")
        resolved_config_file = cached_file(tokenizer_dir, "tokenizer_config.json")

        # Optional files: not every model ships a chat template, so a
        # download failure here is not an error.
        for optional_file in ("chat_template.jinja", "chat_template.json"):
            try:
                cached_file(tokenizer_dir, optional_file)
            except EnvironmentError:
                pass

        if not os.path.exists(resolved_full_file):
            raise FileNotFoundError(
                f"Downloaded HF file '{resolved_full_file}' cannot be found"
            )
        if os.path.dirname(resolved_full_file) != os.path.dirname(resolved_config_file):
            raise FileNotFoundError(
                f"Downloaded HF files '{resolved_full_file}' "
                f"and '{resolved_config_file}' are not in the same directory"
            )
        return os.path.dirname(resolved_full_file)

    def __init__(self, tokenizer_dir, options=None):
        """Create a native tokenizer.

        Args:
            tokenizer_dir: local tokenizer directory or Hugging Face model id.
            options: optional dict of tokenizer options; values are
                stringified (booleans become "true"/"false") before being
                handed to the C API.

        Raises:
            ValueError: tokenizer_dir is not a local directory and
                transformers is not importable.
            FileNotFoundError: downloaded HF files are missing or end up in
                different cache directories.
        """
        self.tokenizer = None
        self.options = {} if options is None else self._normalize_options(options)

        if not os.path.isdir(tokenizer_dir):
            tokenizer_dir = self._resolve_hf_dir(tokenizer_dir)

        # Fix: options used to be silently dropped when the tokenizer was
        # resolved from the Hugging Face hub; they are applied on both
        # paths now.
        if options is None:
            self.tokenizer = create_tokenizer(tokenizer_dir)
        else:
            self.tokenizer = create_tokenizer_with_options(tokenizer_dir, self.options)

    def tokenize(self, text):
        """Tokenize a string, or a list/tuple of strings (batched)."""
        if isinstance(text, (list, tuple)):
            return batch_tokenize(self.tokenizer, text)
        return batch_tokenize(self.tokenizer, [text])[0]

    def update_options(self, options):
        """Update tokenizer options at runtime (merged into existing ones)."""
        self.options.update(self._normalize_options(options))
        update_tokenizer_options(self.tokenizer, self.options)

    def detokenize(self, tokens):
        """Convert one sequence of token ids back to text."""
        return batch_detokenize(self.tokenizer, [tokens])

    def apply_chat_template(self, chat, template="", tools="",
                            add_generation_prompt=True, tokenize=False):
        """Render a chat via the native chat-template engine.

        Returns the token ids when ``tokenize`` is true, otherwise the
        rendered text (result slots 1 and 0 respectively).
        """
        result = _apply_chat_template(
            self.tokenizer, template, chat, tools, add_generation_prompt, tokenize)
        return tensor_result_get_at(result, 1 if tokenize else 0)

    def __del__(self):
        # delete_object may already be torn down during interpreter
        # shutdown; guard both names before freeing the native handle.
        if delete_object and self.tokenizer:
            delete_object(self.tokenizer)
        self.tokenizer = None
class ImageProcessor:
    """Python wrapper around the native image pre-processor handle."""

    def __init__(self, processor_json):
        # NOTE(review): presumably a path to (or the text of) a processor
        # JSON config — confirm against the C API.
        self.processor = create_processor(processor_json)

    def pre_process(self, images):
        """Run the native pre-processing pipeline on one or more images.

        ``images`` may be a single path, a list of paths, or an
        already-loaded native image object.
        """
        inputs = [images] if isinstance(images, str) else images
        if isinstance(inputs, list):
            inputs = load_images(inputs)
        return image_pre_process(self.processor, inputs)

    @staticmethod
    def to_numpy(result, idx):
        """Fetch tensor ``idx`` from a native pre-processing result."""
        return tensor_result_get_at(result, idx)

    def __del__(self):
        # delete_object may already be torn down during interpreter
        # shutdown; guard both names before freeing the native handle.
        handle, self.processor = self.processor, None
        if delete_object and handle:
            delete_object(handle)