@@ -21,6 +21,7 @@ def __init__(self, config) -> None:
         self.quant_config.output_dir = Path(self.quant_config.output_dir) / self.quant_config.model_name
         for k, v in self.quant_config.layer_config.to_dict().items():
             setattr(self.quant_config, k, v)
+        self.quant_cache_dir = Path(f"{self.quant_config.output_dir}/quant_cache")

     def set_tokenizer(self, tokenizer):
         self.tokenizer = tokenizer
@@ -89,7 +90,7 @@ def collect_hessian_pre(self, model, model_prefix, dev):
         from .qllm_hessian import process_collect_hessian
         sample_args = self.quant_config.hessian_config
         sample_args.base_model = self.quant_config.model_name
-        sample_args.save_path = f"./hessian_path/{sample_args.base_model}_{sample_args.devset_size}_{sample_args.ctx_size}"
+        sample_args.save_path = f"{self.quant_config.output_dir}/hessian_path/{sample_args.base_model}_{sample_args.devset_size}_{sample_args.ctx_size}"

         self.quant_config.hessian_path = sample_args.save_path
         self.quant_config.inv_hessian_path = sample_args.save_path + "_inv"
@@ -123,21 +124,20 @@ def parallel_quantize(self, quantize_layer, attention_layers, num_gpus, dev):

         pbar = tqdm.tqdm(total=len(attention_layers), desc=f"running VPTQ on {num_gpus} GPUs")
         output_queue = theading_queue.Queue()
-        quant_tmp = Path("quant_tmp")
         for i in range(num_gpus):
             output_queue.put(i)  # poison pill
         def fetch_next_task(future):
             comm_utils.clear_memory()
             pbar.update(1)
             pbar.set_postfix_str(f'gpu memory: {torch.cuda.memory_allocated(future.gpu_idx)/1024**3:.2f} GB')
             output_queue.put(future.gpu_idx)
-            torch.save(future.result(), quant_tmp / f"layer_{future.layer_idx}.pt")
+            torch.save(future.result(), self.quant_cache_dir / f"layer_{future.layer_idx}.pt")

         for layer_idx, layer in enumerate(attention_layers):
-            if (quant_tmp / f"layer_{layer_idx}.pt").exists():
+            if (self.quant_cache_dir / f"layer_{layer_idx}.pt").exists():
                 import warnings
                 warnings.simplefilter(action='ignore', category=FutureWarning)
-                attention_layers[layer_idx] = torch.load(quant_tmp / f"layer_{layer_idx}.pt", weights_only=False)
+                attention_layers[layer_idx] = torch.load(self.quant_cache_dir / f"layer_{layer_idx}.pt", weights_only=False)
                 pbar.update(1)
                 continue
             free_gpu_id = output_queue.get()
@@ -178,15 +178,14 @@ def do_quantize(self, model, dataloader, model_prefix, dev):
         vptq_quantizer = InternalVPTQQuantizer()
         quantize_layer = vptq_quantizer.quantize_layer
         quantizers = {}
-        quant_tmp = Path("quant_tmp")
-        quant_tmp.mkdir(exist_ok=True)
+        self.quant_cache_dir.mkdir(exist_ok=True)

         if num_gpus > 1:
             self.parallel_quantize(quantize_layer, attention_layers, num_gpus, dev)
         else:
             for layer_idx in tqdm.trange((len(attention_layers)), desc="running VPTQ"):
-                if (quant_tmp / f"layer_{layer_idx}.pt").exists():
-                    attention_layers[layer_idx] = torch.load(quant_tmp / f"layer_{layer_idx}.pt", weights_only=False)
+                if (self.quant_cache_dir / f"layer_{layer_idx}.pt").exists():
+                    attention_layers[layer_idx] = torch.load(self.quant_cache_dir / f"layer_{layer_idx}.pt", weights_only=False)
                     continue
                 attention_layers[layer_idx] = quantize_layer(
                     (attention_layers[layer_idx], layer_idx), self.quant_config, self.quant_config,
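For reference, a minimal sketch of the per-layer caching pattern this diff relies on. The `quant_cache` directory name, the `layer_{idx}.pt` filenames, and the `weights_only=False` load mirror the code above; the standalone `quantize_with_cache` helper, its `quantize_layer` callable, and the `output_dir` argument are assumptions made here purely for illustration:

from pathlib import Path

import torch


def quantize_with_cache(attention_layers, quantize_layer, output_dir):
    """Quantize layers, skipping any layer already checkpointed on disk."""
    quant_cache_dir = Path(output_dir) / "quant_cache"  # analogous to self.quant_cache_dir above
    quant_cache_dir.mkdir(parents=True, exist_ok=True)

    for layer_idx, layer in enumerate(attention_layers):
        cached = quant_cache_dir / f"layer_{layer_idx}.pt"
        if cached.exists():
            # Resume: reuse the previously quantized layer instead of redoing the work.
            attention_layers[layer_idx] = torch.load(cached, weights_only=False)
            continue
        attention_layers[layer_idx] = quantize_layer(layer)
        torch.save(attention_layers[layer_idx], cached)  # checkpoint so an interrupted run can resume
    return attention_layers

Saving each layer immediately after it is quantized means an interrupted run can pick up where it left off, and keeping the cache under the model-specific output directory (rather than a hard-coded `quant_tmp` in the working directory) avoids collisions between runs on different models.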