diff --git a/modeling_mixformer_sequential.py b/modeling_mixformer_sequential.py
index 22b75f1..7d4f722 100644
--- a/modeling_mixformer_sequential.py
+++ b/modeling_mixformer_sequential.py
@@ -753,7 +753,7 @@ class MixFormerSequentialForCausalLM(MixFormerSequentialPreTrainedModel):
         **kwargs,
     ) -> CausalLMOutputWithPast:
         if attention_mask is not None and self.training:
-            raise ValueError("`attention_mask` is not supported during training.")
+            print("`attention_mask` is not supported during training. Using it might lead to unexpected results.")
 
         if past_key_values is None and attention_mask is None:
            lm_logits = self.layers(input_ids)
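
For reference, a minimal sketch of what this change means for callers: before the patch, passing an attention_mask to forward() while the model is in training mode raised a ValueError; after it, the call proceeds and only prints the warning. The checkpoint name below is an assumption (this modeling file is loaded as remote code, so any checkpoint that ships this file behaves the same way).

    # Sketch only: "microsoft/phi-1_5" is an assumed checkpoint that serves this
    # modeling file via trust_remote_code; substitute whichever checkpoint applies.
    from transformers import AutoModelForCausalLM, AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-1_5", trust_remote_code=True)
    tokenizer.pad_token = tokenizer.eos_token  # assumed: no pad token set by default
    model = AutoModelForCausalLM.from_pretrained("microsoft/phi-1_5", trust_remote_code=True)
    model.train()  # training mode is what triggers the attention_mask check

    batch = tokenizer(["def add(a, b):", "print('hi')"], return_tensors="pt", padding=True)
    # Pre-patch: raises ValueError("`attention_mask` is not supported during training.")
    # Post-patch: prints the warning and runs the forward pass anyway.
    out = model(input_ids=batch["input_ids"], attention_mask=batch["attention_mask"])
    print(out.logits.shape)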