{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": [],
      "gpuType": "T4"
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    },
    "accelerator": "GPU"
  },
  "cells": [
    {
      "cell_type": "markdown",
      "source": [
        "# Build your ML application with Gradio\n",
        "\n",
        "Gradio is an open-source Python package that allows you to quickly build a demo or web application for your machine learning model, API, or any arbitrary Python function.\n",
        "\n",
        "See the documentation here: https://www.gradio.app/guides/quickstart"
      ],
      "metadata": {
        "id": "1b0gyWaU8Tsa"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "%%capture\n",
        "!pip install gradio\n",
        "!pip install transformers"
      ],
      "metadata": {
        "id": "5NIoUtTFAOj9"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Import the libraries used in this notebook\n",
        "import numpy as np\n",
        "import gradio as gr"
      ],
      "metadata": {
        "id": "Hp9QVonFzKSm"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "Specify the input types and the output types for your interface."
      ],
      "metadata": {
        "id": "zpuy7NLrx6gY"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "def greet(name, intensity):\n",
        "    return \"Hello, \" + name + \"!\" * int(intensity)\n",
        "\n",
        "demo = gr.Interface(\n",
        "    fn=greet,\n",
        "    inputs=[\"text\", \"slider\"],\n",
        "    outputs=[\"text\"],\n",
        ")\n",
        "\n",
        "demo.launch()"
      ],
      "metadata": {
        "id": "ILJkarqLxkDx"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "If you use the actual classes `gr.Textbox` and `gr.Slider` instead of the string shortcuts, you have access to much more customizability through component attributes."
      ],
      "metadata": {
        "id": "1bKypbUIyGmU"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "def greet(name, intensity):\n",
        "    return \"Hello, \" + name + \"!\" * int(intensity)\n",
        "\n",
        "demo = gr.Interface(\n",
        "    fn=greet,\n",
        "    inputs=[\"text\", gr.Slider(value=2, minimum=1, maximum=10, step=1)],\n",
        "    outputs=[gr.Textbox(label=\"greeting\", lines=3)],  # set the number of textbox lines\n",
        ")\n",
        "\n",
        "demo.launch()"
      ],
      "metadata": {
        "id": "SofEy7jwyDF2"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "def filter_sepia(input_img):\n",
        "    # Classic sepia transform: each output channel is a weighted sum of the RGB channels\n",
        "    sepia_filter = np.array([\n",
        "        [0.393, 0.769, 0.189],\n",
        "        [0.349, 0.686, 0.168],\n",
        "        [0.272, 0.534, 0.131]\n",
        "    ])\n",
        "    sepia_img = input_img.dot(sepia_filter.T)\n",
        "    # Rescale to [0, 1] so the result displays correctly\n",
        "    sepia_img /= sepia_img.max()\n",
        "    return sepia_img\n",
        "\n",
        "demo = gr.Interface(filter_sepia, gr.Image(), \"image\")\n",
        "demo.launch()"
      ],
      "metadata": {
        "id": "cIZ3P92Gyx92"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "See more examples at https://www.gradio.app/guides/the-interface-class"
      ],
      "metadata": {
        "id": "DCLzjY-szB_6"
      }
    },
    {
      "cell_type": "markdown",
      "source": [
        "## Make your first image-to-text app with Gradio"
      ],
      "metadata": {
        "id": "ER8CEY0Q8ev4"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "# Libraries for loading and running the LLaVA image-to-text model\n",
        "from transformers import AutoProcessor, LlavaForConditionalGeneration\n",
        "from PIL import Image\n",
        "import torch\n",
        "import numpy as np\n",
        "import requests"
      ],
      "metadata": {
        "id": "4S347URd1YwF"
      },
      "execution_count": null,
      "outputs": []
    },
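    {
      "cell_type": "markdown",
      "source": [
        "Before loading the 7B model in float16, it can help to confirm that the Colab runtime actually has a GPU attached (the notebook metadata requests a T4). This is a small optional check added here as a sketch, not part of the original notebook; it only uses `torch.cuda`."
      ],
      "metadata": {}
    },
    {
      "cell_type": "code",
      "source": [
        "# Optional sanity check (added sketch): make sure a CUDA GPU is visible,\n",
        "# since the model below is loaded with torch_dtype=torch.float16 and moved to \"cuda\".\n",
        "if torch.cuda.is_available():\n",
        "    print(\"GPU available:\", torch.cuda.get_device_name(0))\n",
        "else:\n",
        "    print(\"No GPU detected - switch the Colab runtime to a GPU before loading the model\")"
      ],
      "metadata": {},
      "execution_count": null,
      "outputs": []
    },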
    {
      "cell_type": "code",
      "source": [
        "# Follow the documentation at https://huggingface.co/docs/transformers/en/model_doc/llava\n",
        "model_name = \"llava-hf/llava-1.5-7b-hf\"\n",
        "processor = AutoProcessor.from_pretrained(model_name)\n",
        "model = LlavaForConditionalGeneration.from_pretrained(\n",
        "    model_name,\n",
        "    torch_dtype=torch.float16,\n",
        "    device_map=\"auto\"\n",
        ")"
      ],
      "metadata": {
        "id": "JlATDwdby24c"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Download an example image\n",
        "url = \"https://www.ilankelman.org/stopsigns/australia.jpg\"\n",
        "image_stop = Image.open(requests.get(url, stream=True).raw)\n",
        "\n",
        "# Build the chat-style prompt expected by LLaVA\n",
        "conversation = [\n",
        "    {\n",
        "        \"role\": \"user\",\n",
        "        \"content\": [\n",
        "            {\"type\": \"image\"},\n",
        "            {\"type\": \"text\", \"text\": \"What is shown in this image?\"},\n",
        "        ],\n",
        "    },\n",
        "]\n",
        "prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)\n",
        "\n",
        "# Process the image and prompt\n",
        "inputs = processor(\n",
        "    images=[image_stop],\n",
        "    text=[prompt],\n",
        "    return_tensors=\"pt\"\n",
        ").to(device=\"cuda\", dtype=torch.float16)\n",
        "\n",
        "# Generate a response and decode it back to text\n",
        "generate_ids = model.generate(\n",
        "    **inputs,\n",
        "    do_sample=True,\n",
        "    max_new_tokens=100\n",
        ")\n",
        "processor.batch_decode(generate_ids, skip_special_tokens=True)"
      ],
      "metadata": {
        "id": "lasxF1d51tUm"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "Now, let's put everything into one function and then test it."
      ],
      "metadata": {
        "id": "-Ya1BWfP8nF-"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "def generate_description(image, prompt=\"What is shown in this image?\", max_new_tokens=200):\n",
        "    # Wrap the user prompt in LLaVA's chat format\n",
        "    conversation = [\n",
        "        {\n",
        "            \"role\": \"user\",\n",
        "            \"content\": [\n",
        "                {\"type\": \"image\"},\n",
        "                {\"type\": \"text\", \"text\": prompt},\n",
        "            ],\n",
        "        },\n",
        "    ]\n",
        "    prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)\n",
        "    # Preprocess the image and prompt, generate, and decode the answer\n",
        "    inputs = processor(\n",
        "        images=[image],\n",
        "        text=[prompt],\n",
        "        return_tensors=\"pt\"\n",
        "    ).to(device=\"cuda\", dtype=torch.float16)\n",
        "    generate_ids = model.generate(\n",
        "        **inputs,\n",
        "        do_sample=True,\n",
        "        max_new_tokens=max_new_tokens\n",
        "    )\n",
        "    generated_description = processor.batch_decode(generate_ids, skip_special_tokens=True)\n",
        "    return generated_description[0]"
      ],
      "metadata": {
        "id": "75qgk39W5k_4"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Test the function we just built on an image uploaded to the Colab session\n",
        "image = Image.open(\"/content/466029110_1113670126421850_13431688209473903_n.jpg\")\n",
        "generate_description(\n",
        "    image,\n",
        "    \"What is shown in this image?\"\n",
        ")"
      ],
      "metadata": {
        "id": "6QzFDNPl6hoQ"
      },
      "execution_count": null,
      "outputs": []
    },
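    {
      "cell_type": "markdown",
      "source": [
        "As an optional sanity check that does not depend on a file uploaded to `/content`, we could also run the function on `image_stop`, the photo downloaded from a URL earlier in the notebook. This cell is an added sketch and assumes the earlier cells have already been run."
      ],
      "metadata": {}
    },
    {
      "cell_type": "code",
      "source": [
        "# Added sketch: reuse the image downloaded earlier instead of a local upload\n",
        "generate_description(\n",
        "    image_stop,\n",
        "    \"What is shown in this image?\"\n",
        ")"
      ],
      "metadata": {},
      "execution_count": null,
      "outputs": []
    },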
    {
      "cell_type": "markdown",
      "source": [
        "Then serve it with Gradio. The inputs will be an image and a textbox (the prompt), and the output will be text (the description of the image)."
      ],
      "metadata": {
        "id": "B3soE-kP8sxh"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "import gradio as gr\n",
        "\n",
        "demo = gr.Interface(\n",
        "    fn=lambda img, prompt: generate_description(img, prompt),\n",
        "    inputs=[gr.Image(type=\"pil\"),  # pass the image to the function as a PIL image\n",
        "            gr.Textbox(label=\"prompt\", value=\"What is shown in this image?\", lines=3)],\n",
        "    outputs=[gr.Textbox(label=\"Description\", lines=3)],\n",
        "    title=\"Image Description using LLaVA\",\n",
        "    description=\"Upload an image to get a detailed description using LLaVA-1.5-7b\",\n",
        ")\n",
        "demo.launch()"
      ],
      "metadata": {
        "id": "B_WndZzw1-ee"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Launching demos repeatedly can leave several ports open, so don't forget to close them all with `gr.close_all()`\n",
        "gr.close_all()"
      ],
      "metadata": {
        "id": "hSzmRfxD6HuD"
      },
      "execution_count": null,
      "outputs": []
    }
  ]
}