From 8e9ebfb9bfe4da87d5c705d097fe884dc8e62d6f Mon Sep 17 00:00:00 2001
From: Gustavo de Rosa
Date: Thu, 16 Nov 2023 18:10:30 +0000
Subject: [PATCH] Delete configuration_mixformer_sequential.py

---
 configuration_mixformer_sequential.py | 61 ---------------------------
 1 file changed, 61 deletions(-)
 delete mode 100644 configuration_mixformer_sequential.py

diff --git a/configuration_mixformer_sequential.py b/configuration_mixformer_sequential.py
deleted file mode 100644
index e80206a..0000000
--- a/configuration_mixformer_sequential.py
+++ /dev/null
@@ -1,61 +0,0 @@
-# Copyright (c) Microsoft Corporation.
-# Licensed under the MIT license.
-
-import math
-from typing import Optional
-
-from transformers import PretrainedConfig
-
-
-class MixFormerSequentialConfig(PretrainedConfig):
-    """MixFormer (sequential for DeepSpeed) configuration."""
-
-    model_type = "mixformer-sequential"
-
-    attribute_map = {
-        "max_position_embeddings": "n_positions",
-        "hidden_size": "n_embd",
-        "num_attention_heads": "n_head",
-        "num_hidden_layers": "n_layer",
-    }
-
-    def __init__(
-        self,
-        vocab_size: int = 50304,
-        n_positions: int = 2048,
-        n_embd: int = 1024,
-        n_layer: int = 20,
-        n_inner: Optional[int] = None,
-        n_head: int = 16,
-        n_head_kv: Optional[int] = None,
-        rotary_dim: Optional[int] = 32,
-        activation_function: Optional[str] = "gelu_new",
-        flash_rotary: bool = False,
-        fused_dense: bool = False,
-        attn_pdrop: float = 0.0,
-        embd_pdrop: float = 0.0,
-        resid_pdrop: float = 0.0,
-        layer_norm_epsilon: float = 1e-5,
-        initializer_range: float = 0.02,
-        tie_word_embeddings: bool = False,
-        pad_vocab_size_multiple: int = 64,
-        **kwargs
-    ) -> None:
-        self.vocab_size = int(math.ceil(vocab_size / pad_vocab_size_multiple) * pad_vocab_size_multiple)
-        self.n_positions = n_positions
-        self.n_embd = n_embd
-        self.n_layer = n_layer
-        self.n_inner = n_inner
-        self.n_head = n_head
-        self.n_head_kv = n_head_kv
-        self.rotary_dim = min(rotary_dim, n_embd // n_head)
-        self.activation_function = activation_function
-        self.flash_rotary = flash_rotary
-        self.fused_dense = fused_dense
-        self.attn_pdrop = attn_pdrop
-        self.embd_pdrop = embd_pdrop
-        self.resid_pdrop = resid_pdrop
-        self.layer_norm_epsilon = layer_norm_epsilon
-        self.initializer_range = initializer_range
-
-        super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
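Note: the deleted __init__ derives two values rather than storing the arguments verbatim: vocab_size is rounded up to a multiple of pad_vocab_size_multiple, and rotary_dim is clamped to the per-head dimension. A minimal standalone sketch of those two computations, reproduced here for illustration only with the defaults shown in the diff above (it is not part of the repository):

import math

# Vocabulary padding as in the deleted config: round up to the nearest
# multiple of pad_vocab_size_multiple.
vocab_size = 50304
pad_vocab_size_multiple = 64
padded_vocab = int(math.ceil(vocab_size / pad_vocab_size_multiple) * pad_vocab_size_multiple)
print(padded_vocab)  # 50304 -- already a multiple of 64, so unchanged

# Rotary dimension clamp: never larger than the per-head embedding size.
n_embd, n_head, rotary_dim = 1024, 16, 32
print(min(rotary_dim, n_embd // n_head))  # 32 -- head dim is 1024 // 16 = 64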