reduce memory usage during repack (#168)

ReinForce-II · web-flow · commit df20c15920bf · 2025-03-07T22:22:51.000+08:00
diff --git a/qllm/auto_model_quantization.py b/qllm/auto_model_quantization.py
@@ -139,6 +139,7 @@ def repack_to_new_mode(self, model, new_pack_mode):
             new_module.bias = qlayer.bias if qlayer.bias is not None else None
             set_op_by_name(model, module_name, new_module)
             new_module.pack(qlayer, scales.T, zeros.T, qlayer.g_idx)
+            del qlayer.weight
             qlayer.to('cpu')
             new_module.to('cpu')
         del qlayers