diff --git a/configuration_mixformer_sequential.py b/configuration_mixformer_sequential.py
index 7366275..e80206a 100644
--- a/configuration_mixformer_sequential.py
+++ b/configuration_mixformer_sequential.py
@@ -21,24 +21,24 @@ class MixFormerSequentialConfig(PretrainedConfig):
 
     def __init__(
         self,
-        vocab_size: Optional[int] = 50304,
-        n_positions: Optional[int] = 2048,
-        n_embd: Optional[int] = 1024,
-        n_layer: Optional[int] = 20,
+        vocab_size: int = 50304,
+        n_positions: int = 2048,
+        n_embd: int = 1024,
+        n_layer: int = 20,
         n_inner: Optional[int] = None,
-        n_head: Optional[int] = 16,
+        n_head: int = 16,
         n_head_kv: Optional[int] = None,
         rotary_dim: Optional[int] = 32,
         activation_function: Optional[str] = "gelu_new",
         flash_rotary: bool = False,
         fused_dense: bool = False,
-        attn_pdrop: Optional[float] = 0.0,
-        embd_pdrop: Optional[float] = 0.0,
-        resid_pdrop: Optional[float] = 0.0,
-        layer_norm_epsilon: Optional[float] = 1e-5,
-        initializer_range: Optional[float] = 0.02,
-        tie_word_embeddings: Optional[bool] = False,
-        pad_vocab_size_multiple: Optional[int] = 64,
+        attn_pdrop: float = 0.0,
+        embd_pdrop: float = 0.0,
+        resid_pdrop: float = 0.0,
+        layer_norm_epsilon: float = 1e-5,
+        initializer_range: float = 0.02,
+        tie_word_embeddings: bool = False,
+        pad_vocab_size_multiple: int = 64,
         **kwargs
     ) -> None:
         self.vocab_size = int(math.ceil(vocab_size / pad_vocab_size_multiple) * pad_vocab_size_multiple)
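
Note on the unchanged last context line of the hunk: vocab_size is rounded up to the nearest
multiple of pad_vocab_size_multiple before being stored. A minimal standalone sketch of that
rounding (not part of the PR; the helper name pad_vocab_size and the sample input 50257 are
illustrative only):

    import math

    def pad_vocab_size(vocab_size: int, multiple: int = 64) -> int:
        # Round vocab_size up to the nearest multiple, mirroring the expression
        # in MixFormerSequentialConfig.__init__.
        return int(math.ceil(vocab_size / multiple) * multiple)

    print(pad_vocab_size(50304))  # 50304 (already a multiple of 64, unchanged)
    print(pad_vocab_size(50257))  # 50304 (rounded up to the next multiple of 64)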