|
15 | 15 | }, |
16 | 16 | { |
17 | 17 | "cell_type": "code", |
18 | | - "execution_count": 2, |
| 18 | + "execution_count": 12, |
19 | 19 | "metadata": {}, |
20 | 20 | "outputs": [], |
21 | 21 | "source": [ |
|
25 | 25 | "from compressed_tensors.quantization import (\n", |
26 | 26 | " QuantizationConfig,\n", |
27 | 27 | " QuantizationStatus,\n", |
28 | | - " apply_quantization_config,\n", |
29 | | - " compress_quantized_weights\n", |
| 28 | + " apply_quantization_config\n", |
30 | 29 | ")\n", |
31 | 30 | "from compressed_tensors.compressors import ModelCompressor\n", |
32 | 31 | "from transformers import AutoModelForCausalLM, AutoTokenizer, DefaultDataCollator\n", |
|
37 | 36 | }, |
38 | 37 | { |
39 | 38 | "cell_type": "code", |
40 | | - "execution_count": 3, |
| 39 | + "execution_count": 13, |
41 | 40 | "metadata": {}, |
42 | 41 | "outputs": [ |
43 | | - { |
44 | | - "data": { |
45 | | - "application/vnd.jupyter.widget-view+json": { |
46 | | - "model_id": "c883cdc8ecd04866bd01d61796b81c26", |
47 | | - "version_major": 2, |
48 | | - "version_minor": 0 |
49 | | - }, |
50 | | - "text/plain": [ |
51 | | - "config.json: 0%| | 0.00/560 [00:00<?, ?B/s]" |
52 | | - ] |
53 | | - }, |
54 | | - "metadata": {}, |
55 | | - "output_type": "display_data" |
56 | | - }, |
57 | | - { |
58 | | - "data": { |
59 | | - "application/vnd.jupyter.widget-view+json": { |
60 | | - "model_id": "32b18b14b6774ce7b61d2854a1ed5f49", |
61 | | - "version_major": 2, |
62 | | - "version_minor": 0 |
63 | | - }, |
64 | | - "text/plain": [ |
65 | | - "model.safetensors: 0%| | 0.00/4.40G [00:00<?, ?B/s]" |
66 | | - ] |
67 | | - }, |
68 | | - "metadata": {}, |
69 | | - "output_type": "display_data" |
70 | | - }, |
71 | | - { |
72 | | - "data": { |
73 | | - "application/vnd.jupyter.widget-view+json": { |
74 | | - "model_id": "370c6d18521a4b65833a411728be1ed7", |
75 | | - "version_major": 2, |
76 | | - "version_minor": 0 |
77 | | - }, |
78 | | - "text/plain": [ |
79 | | - "generation_config.json: 0%| | 0.00/129 [00:00<?, ?B/s]" |
80 | | - ] |
81 | | - }, |
82 | | - "metadata": {}, |
83 | | - "output_type": "display_data" |
84 | | - }, |
85 | 42 | { |
86 | 43 | "data": { |
87 | 44 | "text/plain": [ |
|
113 | 70 | ")" |
114 | 71 | ] |
115 | 72 | }, |
116 | | - "execution_count": 3, |
| 73 | + "execution_count": 13, |
117 | 74 | "metadata": {}, |
118 | 75 | "output_type": "execute_result" |
119 | 76 | } |
|
122 | 79 | "# load a dense, unquantized tiny llama model\n", |
123 | 80 | "device = \"cuda:0\"\n", |
124 | 81 | "model_name = \"TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T\"\n", |
125 | | - "model = AutoModelForCausalLM.from_pretrained(model_name, device_map=device, torch_dtype=\"auto\")\n", |
| 82 | + "model = AutoModelForCausalLM.from_pretrained(model_name, device_map=device, torch_dtype=torch.bfloat16)\n", |
126 | 83 | "model" |
127 | 84 | ] |
128 | 85 | }, |
|
139 | 96 | }, |
140 | 97 | { |
141 | 98 | "cell_type": "code", |
142 | | - "execution_count": 23, |
| 99 | + "execution_count": 14, |
143 | 100 | "metadata": {}, |
144 | 101 | "outputs": [], |
145 | 102 | "source": [ |
|
164 | 121 | }, |
165 | 122 | { |
166 | 123 | "cell_type": "code", |
167 | | - "execution_count": null, |
| 124 | + "execution_count": 15, |
168 | 125 | "metadata": {}, |
169 | 126 | "outputs": [], |
170 | 127 | "source": [ |
|
177 | 134 | }, |
178 | 135 | { |
179 | 136 | "cell_type": "code", |
180 | | - "execution_count": null, |
| 137 | + "execution_count": 16, |
181 | 138 | "metadata": {}, |
182 | 139 | "outputs": [], |
183 | 140 | "source": [ |
|
198 | 155 | }, |
199 | 156 | { |
200 | 157 | "cell_type": "code", |
201 | | - "execution_count": 28, |
| 158 | + "execution_count": 17, |
202 | 159 | "metadata": {}, |
203 | 160 | "outputs": [ |
204 | 161 | { |
205 | 162 | "name": "stderr", |
206 | 163 | "output_type": "stream", |
207 | 164 | "text": [ |
208 | | - "Running calibration: 512it [00:33, 15.42it/s]\n" |
| 165 | + "Running calibration: 512it [00:58, 8.82it/s]\n" |
209 | 166 | ] |
210 | 167 | } |
211 | 168 | ], |
|
233 | 190 | "\n", |
234 | 191 | "Notice that at this point, the weight itself is still a floating point and has not been quantized. \n", |
235 | 192 | "\n", |
236 | | - "To convert the weights to an integer type, we need to apply the `compress_quantized_weights` function. After compressing the weights, a forward pass of the model can no longer be run in PyTorch" |
| 193 | + "To convert the weights to an integer type, we need to apply the `compress_model` function. After compressing the weights, a forward pass of the model can no longer be run in PyTorch.\n", |
| 194 | + "\n", |
| 195 | + "After compressing the quantized model with the `pack-quantized` format, weights are represented as logical int4 values packed into int32 containers ( `weight_packed` ), with the original shape recorded in `weight_shape`.\n", |
| 196 | + "\n", |
| 197 | + "This packed representation is what gets saved to disk when using ModelCompressor.compress_model(model)." |
237 | 198 | ] |
238 | 199 | }, |
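To make the packing concrete, here is a minimal sketch of how eight signed int4 values can share one int32 container. This is illustrative only: the LSB-first nibble ordering is an assumption, not the layout `compressed-tensors` actually uses.

```python
import torch

def pack_int4(q: torch.Tensor) -> torch.Tensor:
    """Pack signed int4 values (range [-8, 7]) into int32 words, 8 per word."""
    rows, cols = q.shape
    nibbles = (q.to(torch.int32) & 0xF).reshape(rows, cols // 8, 8)
    packed = torch.zeros(rows, cols // 8, dtype=torch.int32)
    for i in range(8):  # LSB-first nibble order (assumed)
        packed |= nibbles[..., i] << (4 * i)
    return packed

q = torch.randint(-8, 8, (2048, 2048))
print(pack_int4(q).shape)  # torch.Size([2048, 256]) -- the weight_packed shape shown below
```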
239 | 200 | { |
240 | 201 | "cell_type": "code", |
241 | | - "execution_count": 29, |
| 202 | + "execution_count": 18, |
242 | 203 | "metadata": {}, |
243 | 204 | "outputs": [ |
244 | 205 | { |
245 | 206 | "name": "stdout", |
246 | 207 | "output_type": "stream", |
247 | 208 | "text": [ |
248 | | - "Scale: tensor([17296.], device='cuda:4', dtype=torch.float16), Zero Point: tensor([0], device='cuda:4', dtype=torch.int8)\n", |
249 | | - "Weight min: -1.587890625 max: 1.0283203125 dtype: torch.float16\n" |
| 209 | + "Scale: tensor([-3.0465e+26], device='cuda:0', dtype=torch.bfloat16), Zero Point: tensor([0], device='cuda:0', dtype=torch.int8)\n", |
| 210 | + "Weight min: -1.5859375 max: 1.03125 dtype: torch.bfloat16\n" |
250 | 211 | ] |
251 | 212 | } |
252 | 213 | ], |
|
262 | 223 | }, |
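For reference, the scale and zero point produced by calibration drive a quantize-dequantize round trip during forward passes, which is why the stored weight is still `bfloat16` here. Below is a minimal sketch of symmetric int4 fake quantization, assuming the round-to-nearest, clamp-to-[-8, 7] behavior typical of symmetric schemes; it is not the library's exact kernel.

```python
import torch

def fake_quantize(w, scale, zero_point, q_min=-8, q_max=7):
    # quantize onto the int4 grid, then immediately dequantize
    q = torch.clamp(torch.round(w / scale + zero_point), q_min, q_max)
    return (q - zero_point) * scale

w = torch.randn(2048, 2048, dtype=torch.bfloat16)
scale = w.abs().max() / 7                    # symmetric: zero point stays 0
w_fq = fake_quantize(w, scale, torch.tensor(0))
print(w_fq.dtype)                            # torch.bfloat16 -- still floating point
```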
263 | 224 | { |
264 | 225 | "cell_type": "code", |
265 | | - "execution_count": 30, |
| 226 | + "execution_count": 19, |
266 | 227 | "metadata": {}, |
267 | 228 | "outputs": [ |
| 229 | + { |
| 230 | + "name": "stderr", |
| 231 | + "output_type": "stream", |
| 232 | + "text": [ |
| 233 | + "Compressing model: 154it [00:02, 59.75it/s]" |
| 234 | + ] |
| 235 | + }, |
268 | 236 | { |
269 | 237 | "name": "stdout", |
270 | 238 | "output_type": "stream", |
271 | 239 | "text": [ |
272 | | - "Scale: tensor([17296.], device='cuda:4', dtype=torch.float16), Zero Point: tensor([0], device='cuda:4', dtype=torch.int8)\n", |
273 | | - "Weight min: 0 max: 0 dtype: torch.int8\n" |
| 240 | + "Compressed weight scale: tensor([-3.0465e+26], device='cuda:0', dtype=torch.bfloat16), zero point: tensor([0], device='cuda:0', dtype=torch.int8)\n", |
| 241 | + "Compressed weight dtype: torch.int32\n", |
| 242 | + "Compressed weight shape: torch.Size([2048, 256])\n", |
| 243 | + "Uncompressed weight shape: tensor([2048, 2048], device='cuda:0')\n" |
| 244 | + ] |
| 245 | + }, |
| 246 | + { |
| 247 | + "name": "stderr", |
| 248 | + "output_type": "stream", |
| 249 | + "text": [ |
| 250 | + "\n" |
274 | 251 | ] |
275 | 252 | } |
276 | 253 | ], |
277 | 254 | "source": [ |
278 | 255 | "# convert quantized weights to integers\n", |
279 | | - "model.apply(compress_quantized_weights)\n", |
| 256 | + "compressor = ModelCompressor(quantization_config=config)\n", |
| 257 | + "compressor.compress_model(model)\n", |
280 | 258 | "\n", |
281 | 259 | "state_dict = model.state_dict()\n", |
282 | 260 | "example_layer = \"model.layers.0.self_attn.q_proj.weight\"\n", |
283 | 261 | "scale = state_dict[example_layer + \"_scale\"]\n", |
284 | 262 | "zero_point = state_dict[example_layer + \"_zero_point\"]\n", |
285 | | - "weight = state_dict[example_layer]\n", |
286 | | - "print(f\"Scale: {scale}, Zero Point: {zero_point}\")\n", |
287 | | - "print(f\"Weight min: {torch.min(weight)} max: {torch.max(weight)} dtype: {weight.dtype}\")" |
288 | | - ] |
289 | | - }, |
290 | | - { |
291 | | - "cell_type": "markdown", |
292 | | - "metadata": {}, |
293 | | - "source": [ |
294 | | - "After compressing the quantized model, the weight matrix has a range of int4 but is stored in an int8. \n", |
295 | | - "\n", |
296 | | - "We can further compress the model on disk using the `pack-quantized` format we specified in the config. This compression format will pack the int4 weights into int32" |
| 263 | + "weight = state_dict[example_layer + \"_packed\"]\n", |
| 264 | + "shape = state_dict[example_layer + \"_shape\"]\n", |
| 265 | + "print(f\"Compressed weight scale: {scale}, zero point: {zero_point}\")\n", |
| 266 | + "print(f\"Compressed weight dtype: {weight.dtype}\")\n", |
| 267 | + "print(f\"Compressed weight shape: {weight.shape}\")\n", |
| 268 | + "print(f\"Uncompressed weight shape: {shape}\")" |
297 | 269 | ] |
298 | 270 | }, |
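Reading the printout: 2048 columns / 8 nibbles per int32 = 256 packed columns. Here is a minimal sketch of unpacking such a layout back to signed int4 values, again assuming LSB-first nibble order; the library's own decompression path is the authoritative one.

```python
import torch

def unpack_int4(packed: torch.Tensor, shape: torch.Tensor) -> torch.Tensor:
    """Inverse of the packing sketch above: int32 [rows, cols//8] -> int8 [rows, cols]."""
    shifts = torch.arange(0, 32, 4, dtype=torch.int32, device=packed.device)
    nibbles = (packed.unsqueeze(-1) >> shifts) & 0xF
    signed = torch.where(nibbles >= 8, nibbles - 16, nibbles)  # sign-extend 4 bits
    return signed.reshape(int(shape[0]), int(shape[1])).to(torch.int8)

# Hypothetical usage with the variables from the cell above (valid only if the
# assumed nibble order matches the library's):
# w_int4 = unpack_int4(weight, shape)
# w_dq = w_int4.to(scale.dtype) * scale
```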
299 | 271 | { |
300 | 272 | "cell_type": "code", |
301 | | - "execution_count": 31, |
| 273 | + "execution_count": 20, |
302 | 274 | "metadata": {}, |
303 | 275 | "outputs": [ |
304 | 276 | { |
305 | 277 | "name": "stdout", |
306 | 278 | "output_type": "stream", |
307 | 279 | "text": [ |
308 | | - "Compression format: pack-quantized\n" |
309 | | - ] |
310 | | - }, |
311 | | - { |
312 | | - "name": "stderr", |
313 | | - "output_type": "stream", |
314 | | - "text": [ |
315 | | - "Quantized Compression: 100%|██████████| 509/509 [00:03<00:00, 153.70it/s]\n" |
316 | | - ] |
317 | | - }, |
318 | | - { |
319 | | - "name": "stdout", |
320 | | - "output_type": "stream", |
321 | | - "text": [ |
322 | | - "Size of the model's weights on disk using safetensors: 712.23 MB\n" |
| 280 | + "Compression format: pack-quantized\n", |
| 281 | + "Size of the model's weights on disk using safetensors: 712.25 MB\n" |
323 | 282 | ] |
324 | 283 | } |
325 | 284 | ], |
|
330 | 289 | "compression_format = config.format\n", |
331 | 290 | "print(f\"Compression format: {compression_format}\")\n", |
332 | 291 | "\n", |
333 | | - "compressor = ModelCompressor(quantization_config=config)\n", |
334 | | - "compressed_state_dict = compressor.compress(model)\n", |
335 | | - "model.save_pretrained(output_dir, state_dict=compressed_state_dict)\n", |
| 292 | + "\n", |
| 293 | + "model.save_pretrained(output_dir, state_dict=model.state_dict())\n", |
336 | 294 | "compressor.update_config(output_dir)\n", |
337 | 295 | "\n", |
338 | 296 | "compressed_size_on_disk_mb = os.path.getsize(os.path.join(output_dir, \"model.safetensors\")) / 1024 / 1024\n", |
|
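The ~712 MB on-disk size is consistent with 4-bit storage for most of the model. A rough back-of-envelope follows, where the split between quantized linear weights and unquantized embedding/LM-head parameters is an assumption about this config rather than something shown in the notebook.

```python
# TinyLlama-1.1B back-of-envelope (all counts approximate assumptions):
quantized_params = 0.97e9     # linear weights packed at 4 bits (0.5 bytes each)
unquantized_params = 0.13e9   # embed_tokens + lm_head kept in bfloat16 (2 bytes each)
size_mb = (quantized_params * 0.5 + unquantized_params * 2) / 1024**2
print(f"~{size_mb:.0f} MB")   # ~710 MB, close to the reported 712.25 MB
```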
356 | 314 | "name": "python", |
357 | 315 | "nbconvert_exporter": "python", |
358 | 316 | "pygments_lexer": "ipython3", |
359 | | - "version": "3.10.12" |
| 317 | + "version": "3.12.12" |
360 | 318 | } |
361 | 319 | }, |
362 | 320 | "nbformat": 4, |
|