"""RW (RefinedWeb) configuration, adapted from the Bloom configuration."""
from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging


logger = logging.get_logger(__name__)


class RWConfig(PretrainedConfig):
    model_type = "RefinedWeb"
    keys_to_ignore_at_inference = ["past_key_values"]
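    # Expose the generic Hugging Face names (num_hidden_layers, num_attention_heads)
    # as aliases for this config's n_layer / n_head fields.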
    attribute_map = {
        "num_hidden_layers": "n_layer",
        "num_attention_heads": "n_head",
    }

    def __init__(
        self,
        vocab_size=250880,
        hidden_size=64,
        n_layer=2,
        n_head=8,
        layer_norm_epsilon=1e-5,
        initializer_range=0.02,
        use_cache=True,
        bos_token_id=1,
        eos_token_id=2,
        apply_residual_connection_post_layernorm=False,
        hidden_dropout=0.0,
        attention_dropout=0.0,
        n_head_kv=None,
        alibi=False,
        **kwargs,
    ):
        self.vocab_size = vocab_size
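        # Backward compatibility: accept the legacy n_embed kwarg as an alias for hidden_size.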
        n_embed = kwargs.pop("n_embed", None)
        self.hidden_size = hidden_size if n_embed is None else n_embed
        self.n_layer = n_layer
        self.n_head = n_head
        self.layer_norm_epsilon = layer_norm_epsilon
        self.initializer_range = initializer_range
        self.use_cache = use_cache
        self.apply_residual_connection_post_layernorm = apply_residual_connection_post_layernorm
        self.hidden_dropout = hidden_dropout
        self.attention_dropout = attention_dropout

        self.bos_token_id = bos_token_id
        self.eos_token_id = eos_token_id
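        # The number of key/value heads defaults to n_head (standard multi-head
        # attention); a smaller n_head_kv yields multi-query / grouped-query attention.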
        self.n_head_kv = n_head if n_head_kv is None else n_head_kv
        self.alibi = alibi

        super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)

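    # Derived quantity: per-head dimensionality (hidden_size is assumed to be
    # evenly divisible by n_head).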
    @property
    def head_dim(self):
        return self.hidden_size // self.n_head

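    # The two positional schemes are mutually exclusive here: rotary embeddings
    # are used exactly when ALiBi is disabled.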
    @property
    def rotary(self):
        return not self.alibi
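

# A minimal usage sketch (illustrative, not part of the original module): construct
# the config by hand and read the derived properties. The hyperparameter values
# below are assumptions chosen for the example, not any released checkpoint's.
if __name__ == "__main__":
    config = RWConfig(
        vocab_size=65024,
        hidden_size=4544,
        n_layer=32,
        n_head=71,
        n_head_kv=1,  # single shared key/value head, i.e. multi-query attention
        alibi=False,  # with ALiBi off, rotary embeddings are used instead
    )
    print(config.head_dim)  # 4544 // 71 == 64
    print(config.rotary)    # True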