support max_memory to specify mem usage for each GPU (#460)
laoda513 authored May 2, 2024
1 parent 33af761 commit 12581fb
Showing 2 changed files with 9 additions and 0 deletions.
2 changes: 2 additions & 0 deletions awq/models/auto.py
@@ -83,6 +83,7 @@ def from_quantized(
     batch_size=1,
     safetensors=True,
     device_map="balanced",
+    max_memory=None,
     offload_folder=None,
     download_kwargs=None,
     **config_kwargs,
@@ -108,6 +109,7 @@ def from_quantized(
     use_exllama_v2=use_exllama_v2,
     safetensors=safetensors,
     device_map=device_map,
+    max_memory=max_memory,
     offload_folder=offload_folder,
     download_kwargs=download_kwargs,
     **config_kwargs,
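
With this pass-through in place, a per-GPU memory cap can be supplied directly from the auto class. Below is a minimal usage sketch; the checkpoint path and the memory budgets are illustrative placeholders, not part of this commit.

# Hypothetical usage sketch: cap GPU 0 at 4 GB and GPU 1 at 10 GB while loading
# an AWQ checkpoint through the auto class.
from awq import AutoAWQForCausalLM

quant_path = "TheBloke/Mistral-7B-Instruct-v0.2-AWQ"  # illustrative checkpoint

model = AutoAWQForCausalLM.from_quantized(
    quant_path,
    device_map="balanced",             # default value from the signature above
    max_memory={0: "4GB", 1: "10GB"},  # new argument added by this commit
    safetensors=True,
)
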
7 changes: 7 additions & 0 deletions awq/models/base.py
@@ -393,6 +393,12 @@ def from_quantized(
             "A device map that will be passed onto the model loading method from transformers."
         ),
     ] = "balanced",
+    max_memory: Annotated[
+        Dict[Union[int, str], Union[int, str]],
+        Doc(
+            'A dictionary mapping device identifiers to maximum memory, which will be passed onto the model loading method from transformers. For example: {0: "4GB", 1: "10GB"}.'
+        ),
+    ] = None,
     offload_folder: Annotated[
         str,
         Doc("The folder to offload the model to."),
@@ -449,6 +455,7 @@
     model,
     checkpoint=model_weights_path,
     device_map=device_map,
+    max_memory=max_memory,
     no_split_module_classes=[self.layer_type],
     offload_folder=offload_folder,
     dtype=torch_dtype,
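
In base.py the new argument is forwarded verbatim to accelerate's load_checkpoint_and_dispatch, which takes it into account when the device map is inferred. The standalone sketch below uses accelerate's related infer_auto_device_map helper (not the exact call above) to show how a max_memory budget shapes placement; the toy model and memory caps are made up, and it assumes two visible GPUs.

# Illustration of how accelerate turns per-device caps into a device map.
# Toy model and budgets are placeholders; adjust the GPU keys to your hardware.
from torch import nn
from accelerate import infer_auto_device_map, init_empty_weights

with init_empty_weights():
    toy = nn.Sequential(*[nn.Linear(1024, 1024) for _ in range(8)])

device_map = infer_auto_device_map(
    toy,
    max_memory={0: "10MB", 1: "20MB", "cpu": "1GB"},  # per-device budgets
)
print(device_map)  # submodule -> device assignment computed under the caps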
