KeyError: "attribute 'weight' already exists" #229

Open
ByUnal opened this issue Mar 4, 2024 · 1 comment

ByUnal commented Mar 4, 2024

I'm trying to train RWKV/rwkv-4-world-430m with LoRA using the Transformers Trainer. I've chunked my data (chunk size = 128) and started training. Training runs properly, but it throws an error at the end of the epoch, just before evaluation.
The error is: KeyError: "attribute 'weight' already exists"
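
For reference, the datasets and peft_model used by the Trainer below are prepared roughly like this. This is only a sketch: the chunking helper, the tokenized_train / tokenized_valid names, and the LoraConfig values (rank, alpha, target modules) are placeholders rather than my exact settings, and model is the already loaded RWKV/rwkv-4-world-430m checkpoint.

# Sketch of the data chunking and LoRA wrapping (placeholder names and values).
from peft import LoraConfig, get_peft_model

CHUNK_SIZE = 128

def group_texts(examples):
    # Concatenate the tokenized texts and split them into fixed-size chunks.
    concatenated = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = (len(concatenated["input_ids"]) // CHUNK_SIZE) * CHUNK_SIZE
    return {
        k: [t[i : i + CHUNK_SIZE] for i in range(0, total_length, CHUNK_SIZE)]
        for k, t in concatenated.items()
    }

lm_train_datasets = tokenized_train.map(group_texts, batched=True)
lm_valid_datasets = tokenized_valid.map(group_texts, batched=True)

lora_config = LoraConfig(
    r=8,                                            # placeholder rank
    lora_alpha=16,                                  # placeholder scaling
    lora_dropout=0.05,                              # placeholder dropout
    target_modules=["key", "value", "receptance"],  # placeholder RWKV projection names
    task_type="CAUSAL_LM",
)
peft_model = get_peft_model(model, lora_config)
peft_model.print_trainable_parameters()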

Here is my training code and full error:

Code:

from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datetime import datetime


TRAIN_BATCH_SIZE = 32
EVAL_BATCH_SIZE = 32
LEARNING_RATE = 5e-5
LR_WARMUP_STEPS = 100
WEIGHT_DECAY = 1e-4
EPOCH = 2

steps_per_epoch = int(len(train_dataset) / TRAIN_BATCH_SIZE)

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)


training_args = TrainingArguments(
    output_dir='./rwkv-output',
    overwrite_output_dir=True,
    logging_dir='./RWKVlogs',
    logging_steps = steps_per_epoch, # When to start reporting loss
    num_train_epochs = EPOCH,
    do_train=True,
    do_eval=True,
    bf16=True,
    per_device_train_batch_size=TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=EVAL_BATCH_SIZE,
    warmup_steps=LR_WARMUP_STEPS,
    optim="paged_adamw_8bit",
    save_total_limit=1,
    weight_decay=WEIGHT_DECAY,
    learning_rate=LEARNING_RATE,
    evaluation_strategy='epoch',
    eval_steps=steps_per_epoch, 
    save_strategy='epoch',
    save_steps=steps_per_epoch,
    load_best_model_at_end=True,
    metric_for_best_model='loss',
    greater_is_better=False,
    seed=SEED_TRAIN,
    report_to = "none",
    gradient_accumulation_steps=1,
    # run_name=f"{run_name}-{datetime.now().strftime('%Y-%m-%d-%H-%M')}" 
)

trainer = Trainer(
    model=peft_model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=lm_train_datasets,
    eval_dataset=lm_valid_datasets,
    # train_dataset=train_dataset,
    # eval_dataset=valid_dataset,
    tokenizer=tokenizer
    )

trainer.train()

Full error:

KeyError                                  Traceback (most recent call last)
Cell In[52], line 57
     17 training_args = TrainingArguments(
     18     output_dir='./rwkv-output',
     19     overwrite_output_dir=True,
   (...)
     43     # run_name=f"{run_name}-{datetime.now().strftime('%Y-%m-%d-%H-%M')}" 
     44 )
     46 trainer = Trainer(
     47     model=peft_model,
     48     args=training_args,
   (...)
     54     tokenizer=tokenizer
     55     )
---> 57 trainer.train()

File ~/anaconda3/envs/cht/lib/python3.10/site-packages/transformers/trainer.py:1627, in Trainer.train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)
   1625         hf_hub_utils.enable_progress_bars()
   1626 else:
-> 1627     return inner_training_loop(
   1628         args=args,
   1629         resume_from_checkpoint=resume_from_checkpoint,
   1630         trial=trial,
   1631         ignore_keys_for_eval=ignore_keys_for_eval,
   1632     )

File ~/anaconda3/envs/cht/lib/python3.10/site-packages/transformers/trainer.py:2052, in Trainer._inner_training_loop(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)
   2049     self.control.should_training_stop = True
   2051 self.control = self.callback_handler.on_epoch_end(args, self.state, self.control)
-> 2052 self._maybe_log_save_evaluate(tr_loss, grad_norm, model, trial, epoch, ignore_keys_for_eval)
   2054 if DebugOption.TPU_METRICS_DEBUG in self.args.debug:
   2055     if is_torch_tpu_available():
   2056         # tpu-comment: Logging debug metrics for PyTorch/XLA (compile, execute times, ops, etc.)

File ~/anaconda3/envs/cht/lib/python3.10/site-packages/transformers/trainer.py:2415, in Trainer._maybe_log_save_evaluate(self, tr_loss, grad_norm, model, trial, epoch, ignore_keys_for_eval)
   2413 metrics = None
   2414 if self.control.should_evaluate:
-> 2415     metrics = self.evaluate(ignore_keys=ignore_keys_for_eval)
   2416     self._report_to_hp_search(trial, self.state.global_step, metrics)
   2418     # Run delayed LR scheduler now that metrics are populated

File ~/anaconda3/envs/cht/lib/python3.10/site-packages/transformers/trainer.py:3232, in Trainer.evaluate(self, eval_dataset, ignore_keys, metric_key_prefix)
   3229 start_time = time.time()
   3231 eval_loop = self.prediction_loop if self.args.use_legacy_prediction_loop else self.evaluation_loop
-> 3232 output = eval_loop(
   3233     eval_dataloader,
   3234     description="Evaluation",
   3235     # No point gathering the predictions if there are no metrics, otherwise we defer to
   3236     # self.args.prediction_loss_only
   3237     prediction_loss_only=True if self.compute_metrics is None else None,
   3238     ignore_keys=ignore_keys,
   3239     metric_key_prefix=metric_key_prefix,
   3240 )
   3242 total_batch_size = self.args.eval_batch_size * self.args.world_size
   3243 if f"{metric_key_prefix}_jit_compilation_time" in output.metrics:

File ~/anaconda3/envs/cht/lib/python3.10/site-packages/transformers/trainer.py:3421, in Trainer.evaluation_loop(self, dataloader, description, prediction_loss_only, ignore_keys, metric_key_prefix)
   3418         batch_size = observed_batch_size
   3420 # Prediction step
-> 3421 loss, logits, labels = self.prediction_step(model, inputs, prediction_loss_only, ignore_keys=ignore_keys)
   3422 main_input_name = getattr(self.model, "main_input_name", "input_ids")
   3423 inputs_decode = self._prepare_input(inputs[main_input_name]) if args.include_inputs_for_metrics else None

File ~/anaconda3/envs/cht/lib/python3.10/site-packages/transformers/trainer.py:3638, in Trainer.prediction_step(self, model, inputs, prediction_loss_only, ignore_keys)
   3636 if has_labels or loss_without_labels:
   3637     with self.compute_loss_context_manager():
-> 3638         loss, outputs = self.compute_loss(model, inputs, return_outputs=True)
   3639     loss = loss.mean().detach()
   3641     if isinstance(outputs, dict):

File ~/anaconda3/envs/cht/lib/python3.10/site-packages/transformers/trainer.py:2928, in Trainer.compute_loss(self, model, inputs, return_outputs)
   2926 else:
   2927     labels = None
-> 2928 outputs = model(**inputs)
   2929 # Save past state if it exists
   2930 # TODO: this needs to be fixed and made cleaner later.
   2931 if self.args.past_index >= 0:

File ~/anaconda3/envs/cht/lib/python3.10/site-packages/torch/nn/modules/module.py:1511, in Module._wrapped_call_impl(self, *args, **kwargs)
   1509     return self._compiled_call_impl(*args, **kwargs)  # type: ignore[misc]
   1510 else:
-> 1511     return self._call_impl(*args, **kwargs)

File ~/anaconda3/envs/cht/lib/python3.10/site-packages/torch/nn/modules/module.py:1520, in Module._call_impl(self, *args, **kwargs)
   1515 # If we don't have any hooks, we want to skip the rest of the logic in
   1516 # this function, and just call forward.
   1517 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
   1518         or _global_backward_pre_hooks or _global_backward_hooks
   1519         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1520     return forward_call(*args, **kwargs)
   1522 try:
   1523     result = None

File ~/anaconda3/envs/cht/lib/python3.10/site-packages/accelerate/utils/operations.py:822, in convert_outputs_to_fp32.<locals>.forward(*args, **kwargs)
    821 def forward(*args, **kwargs):
--> 822     return model_forward(*args, **kwargs)

File ~/anaconda3/envs/cht/lib/python3.10/site-packages/accelerate/utils/operations.py:810, in ConvertOutputsToFp32.__call__(self, *args, **kwargs)
    809 def __call__(self, *args, **kwargs):
--> 810     return convert_to_fp32(self.model_forward(*args, **kwargs))

File ~/anaconda3/envs/cht/lib/python3.10/site-packages/torch/amp/autocast_mode.py:16, in autocast_decorator.<locals>.decorate_autocast(*args, **kwargs)
     13 @functools.wraps(func)
     14 def decorate_autocast(*args, **kwargs):
     15     with autocast_instance:
---> 16         return func(*args, **kwargs)

File ~/anaconda3/envs/cht/lib/python3.10/site-packages/accelerate/utils/operations.py:822, in convert_outputs_to_fp32.<locals>.forward(*args, **kwargs)
    821 def forward(*args, **kwargs):
--> 822     return model_forward(*args, **kwargs)

File ~/anaconda3/envs/cht/lib/python3.10/site-packages/accelerate/utils/operations.py:810, in ConvertOutputsToFp32.__call__(self, *args, **kwargs)
    809 def __call__(self, *args, **kwargs):
--> 810     return convert_to_fp32(self.model_forward(*args, **kwargs))

File ~/anaconda3/envs/cht/lib/python3.10/site-packages/torch/amp/autocast_mode.py:16, in autocast_decorator.<locals>.decorate_autocast(*args, **kwargs)
     13 @functools.wraps(func)
     14 def decorate_autocast(*args, **kwargs):
     15     with autocast_instance:
---> 16         return func(*args, **kwargs)

File ~/anaconda3/envs/cht/lib/python3.10/site-packages/accelerate/utils/operations.py:822, in convert_outputs_to_fp32.<locals>.forward(*args, **kwargs)
    821 def forward(*args, **kwargs):
--> 822     return model_forward(*args, **kwargs)

File ~/anaconda3/envs/cht/lib/python3.10/site-packages/accelerate/utils/operations.py:810, in ConvertOutputsToFp32.__call__(self, *args, **kwargs)
    809 def __call__(self, *args, **kwargs):
--> 810     return convert_to_fp32(self.model_forward(*args, **kwargs))

File ~/anaconda3/envs/cht/lib/python3.10/site-packages/torch/amp/autocast_mode.py:16, in autocast_decorator.<locals>.decorate_autocast(*args, **kwargs)
     13 @functools.wraps(func)
     14 def decorate_autocast(*args, **kwargs):
     15     with autocast_instance:
---> 16         return func(*args, **kwargs)

File ~/anaconda3/envs/cht/lib/python3.10/site-packages/peft/peft_model.py:1083, in PeftModelForCausalLM.forward(self, input_ids, attention_mask, inputs_embeds, labels, output_attentions, output_hidden_states, return_dict, task_ids, **kwargs)
   1081     if peft_config.peft_type == PeftType.POLY:
   1082         kwargs["task_ids"] = task_ids
-> 1083     return self.base_model(
   1084         input_ids=input_ids,
   1085         attention_mask=attention_mask,
   1086         inputs_embeds=inputs_embeds,
   1087         labels=labels,
   1088         output_attentions=output_attentions,
   1089         output_hidden_states=output_hidden_states,
   1090         return_dict=return_dict,
   1091         **kwargs,
   1092     )
   1094 batch_size = _get_batch_size(input_ids, inputs_embeds)
   1095 if attention_mask is not None:
   1096     # concat prompt attention mask

File ~/anaconda3/envs/cht/lib/python3.10/site-packages/torch/nn/modules/module.py:1511, in Module._wrapped_call_impl(self, *args, **kwargs)
   1509     return self._compiled_call_impl(*args, **kwargs)  # type: ignore[misc]
   1510 else:
-> 1511     return self._call_impl(*args, **kwargs)

File ~/anaconda3/envs/cht/lib/python3.10/site-packages/torch/nn/modules/module.py:1520, in Module._call_impl(self, *args, **kwargs)
   1515 # If we don't have any hooks, we want to skip the rest of the logic in
   1516 # this function, and just call forward.
   1517 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
   1518         or _global_backward_pre_hooks or _global_backward_hooks
   1519         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1520     return forward_call(*args, **kwargs)
   1522 try:
   1523     result = None

File ~/anaconda3/envs/cht/lib/python3.10/site-packages/peft/tuners/tuners_utils.py:161, in BaseTuner.forward(self, *args, **kwargs)
    160 def forward(self, *args: Any, **kwargs: Any):
--> 161     return self.model.forward(*args, **kwargs)

File ~/anaconda3/envs/cht/lib/python3.10/site-packages/accelerate/hooks.py:166, in add_hook_to_module.<locals>.new_forward(module, *args, **kwargs)
    164         output = module._old_forward(*args, **kwargs)
    165 else:
--> 166     output = module._old_forward(*args, **kwargs)
    167 return module._hf_hook.post_forward(module, output)

File ~/anaconda3/envs/cht/lib/python3.10/site-packages/transformers/models/rwkv/modeling_rwkv.py:839, in RwkvForCausalLM.forward(self, input_ids, attention_mask, inputs_embeds, state, labels, use_cache, output_attentions, output_hidden_states, return_dict)
    831 r"""
    832 labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    833     Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
    834     `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
    835     are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
    836 """
    837 return_dict = return_dict if return_dict is not None else self.config.use_return_dict
--> 839 rwkv_outputs = self.rwkv(
    840     input_ids,
    841     inputs_embeds=inputs_embeds,
    842     state=state,
    843     use_cache=use_cache,
    844     output_attentions=output_attentions,
    845     output_hidden_states=output_hidden_states,
    846     return_dict=return_dict,
    847 )
    848 hidden_states = rwkv_outputs[0]
    850 logits = self.head(hidden_states)

File ~/anaconda3/envs/cht/lib/python3.10/site-packages/torch/nn/modules/module.py:1511, in Module._wrapped_call_impl(self, *args, **kwargs)
   1509     return self._compiled_call_impl(*args, **kwargs)  # type: ignore[misc]
   1510 else:
-> 1511     return self._call_impl(*args, **kwargs)

File ~/anaconda3/envs/cht/lib/python3.10/site-packages/torch/nn/modules/module.py:1520, in Module._call_impl(self, *args, **kwargs)
   1515 # If we don't have any hooks, we want to skip the rest of the logic in
   1516 # this function, and just call forward.
   1517 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
   1518         or _global_backward_pre_hooks or _global_backward_hooks
   1519         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1520     return forward_call(*args, **kwargs)
   1522 try:
   1523     result = None

File ~/anaconda3/envs/cht/lib/python3.10/site-packages/accelerate/hooks.py:166, in add_hook_to_module.<locals>.new_forward(module, *args, **kwargs)
    164         output = module._old_forward(*args, **kwargs)
    165 else:
--> 166     output = module._old_forward(*args, **kwargs)
    167 return module._hf_hook.post_forward(module, output)

File ~/anaconda3/envs/cht/lib/python3.10/site-packages/transformers/models/rwkv/modeling_rwkv.py:642, in RwkvModel.forward(self, input_ids, attention_mask, inputs_embeds, state, use_cache, output_attentions, output_hidden_states, return_dict)
    639 return_dict = return_dict if return_dict is not None else self.config.use_return_dict
    641 if self.training == self.layers_are_rescaled:
--> 642     self._rescale_layers()
    644 if input_ids is not None and inputs_embeds is not None:
    645     raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")

File ~/anaconda3/envs/cht/lib/python3.10/site-packages/transformers/models/rwkv/modeling_rwkv.py:728, in RwkvModel._rescale_layers(self)
    726 elif hasattr(block.attention.output.weight, "quant_state"):
    727     self._bnb_4bit_dequantize_and_rescale(block.attention.output, block_id)
--> 728     self._bnb_4bit_dequantize_and_rescale(block.feed_forward.value, block_id)
    729 else:
    730     block.attention.output.weight.div_(2 ** int(block_id // self.config.rescale_every))

File ~/anaconda3/envs/cht/lib/python3.10/site-packages/transformers/models/rwkv/modeling_rwkv.py:754, in RwkvModel._bnb_4bit_dequantize_and_rescale(self, target_layer, block_id)
    748 # re-quantize the model:
    749 # we need to put it first on CPU then back to the device
    750 # this will create an overhead :/
    751 # We set requires_grad=False as we cannot compute gradients on top of 4bit parameters anyway and to avoid
    752 # bugs with bnb
    753 quant_weight = bnb.nn.Params4bit(dequant_weights.to("cpu"), requires_grad=False).to(dequant_weights.device)
--> 754 setattr(target_layer, "weight", quant_weight)

File ~/anaconda3/envs/cht/lib/python3.10/site-packages/torch/nn/modules/module.py:1705, in Module.__setattr__(self, name, value)
   1702         raise AttributeError(
   1703             "cannot assign parameters before Module.__init__() call")
   1704     remove_from(self.__dict__, self._buffers, self._modules, self._non_persistent_buffers_set)
-> 1705     self.register_parameter(name, value)
   1706 elif params is not None and name in params:
   1707     if value is not None:

File ~/anaconda3/envs/cht/lib/python3.10/site-packages/torch/nn/modules/module.py:575, in Module.register_parameter(self, name, param)
    573     raise KeyError("parameter name can't be empty string \"\"")
    574 elif hasattr(self, name) and name not in self._parameters:
--> 575     raise KeyError(f"attribute '{name}' already exists")
    577 if param is None:
    578     self._parameters[name] = None

KeyError: "attribute 'weight' already exists"

Please help me with this. Could it be related to LoRA?

ByUnal commented Mar 5, 2024

I've found that the problem here is the quantization:

import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

model_name = "RWKV/rwkv-4-world-430m"

config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    # llm_int8_has_fp16_weight = True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

model = AutoModelForCausalLM.from_pretrained(model_name,
                                             trust_remote_code=True,
                                             quantization_config=config,
                                             device_map="auto"
                                            )

The model gives the error above when loaded this way, but it works fine without quantization.
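
Looking at the traceback, the failure comes from RwkvModel._rescale_layers, which runs at the train/eval transition and calls setattr(target_layer, "weight", quant_weight) on the 4-bit layers. My (unverified) reading is that on a LoRA-wrapped layer, weight is exposed through the wrapper class as a property rather than being registered in _parameters, and PyTorch then refuses to register a new parameter under that name. Here is a minimal, self-contained sketch of that PyTorch behavior, using a toy wrapper module rather than the real PEFT/bitsandbytes classes:

import torch
import torch.nn as nn

class ToyWrapper(nn.Module):
    # Toy stand-in for a wrapper whose `weight` is a class-level property
    # instead of a registered parameter (hypothetical, not the real PEFT code).
    def __init__(self, base: nn.Linear):
        super().__init__()
        self.base_layer = base

    @property
    def weight(self):
        return self.base_layer.weight

layer = ToyWrapper(nn.Linear(4, 4))

try:
    # Same code path as setattr(target_layer, "weight", quant_weight) in
    # modeling_rwkv.py: assigning an nn.Parameter goes through
    # Module.register_parameter, which raises because an attribute named
    # "weight" already exists on the module but is not in _parameters.
    setattr(layer, "weight", nn.Parameter(torch.zeros(4, 4)))
except KeyError as err:
    print(err)  # "attribute 'weight' already exists"

If that reading is right, the error would only show up when both 4-bit quantization and the LoRA wrapping are active, which matches what I'm seeing.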
