Commit d334358
Parent(s): 3d19935
Update configuration_aquila.py

configuration_aquila.py CHANGED (+15 -0)
@@ -83,6 +83,7 @@ class AquilaConfig(PretrainedConfig):
         intermediate_size=11008,
         num_hidden_layers=32,
         num_attention_heads=32,
+        num_key_value_heads=None,
         hidden_act="silu",
         max_position_embeddings=2048,
         initializer_range=0.02,
@@ -91,7 +92,10 @@ class AquilaConfig(PretrainedConfig):
         pad_token_id=0,
         bos_token_id=1,
         eos_token_id=2,
+        pretraining_tp=1,
         tie_word_embeddings=False,
+        rope_theta=10000.0,
+        rope_scaling=None,
         **kwargs,
     ):
         self.vocab_size = vocab_size
@@ -99,11 +103,22 @@ class AquilaConfig(PretrainedConfig):
         self.hidden_size = hidden_size
         self.intermediate_size = intermediate_size
         self.num_hidden_layers = num_hidden_layers
+
+        # for backward compatibility
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+
+        self.num_key_value_heads = num_key_value_heads
+
         self.num_attention_heads = num_attention_heads
         self.hidden_act = hidden_act
         self.initializer_range = initializer_range
         self.rms_norm_eps = rms_norm_eps
+        self.pretraining_tp = pretraining_tp
         self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+
         super().__init__(
             pad_token_id=pad_token_id,
             bos_token_id=bos_token_id,
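The commit brings AquilaConfig in line with LLaMA-2-style options: grouped-query attention via num_key_value_heads (defaulting to num_attention_heads for backward compatibility), a pretraining_tp hint, and configurable RoPE via rope_theta and rope_scaling. A minimal usage sketch, assuming configuration_aquila.py is importable and that rope_scaling follows the transformers LLaMA convention of a {"type", "factor"} dict (an assumption; this diff does not show any validation of that field):

# Sketch only: the {"type", "factor"} shape of rope_scaling is assumed
# from the LLaMA config this file mirrors, not confirmed by the diff.
from configuration_aquila import AquilaConfig

config = AquilaConfig(
    num_attention_heads=32,
    num_key_value_heads=8,   # grouped-query attention: 4 query heads per KV head
    rope_theta=10000.0,      # RoPE base frequency (the new default)
    rope_scaling={"type": "linear", "factor": 2.0},  # stretch positions 2x
)

# Omitting num_key_value_heads keeps the old multi-head behavior:
legacy = AquilaConfig()
assert legacy.num_key_value_heads == legacy.num_attention_heads

Because num_key_value_heads falls back to num_attention_heads, checkpoints and configs written before this commit keep their original multi-head attention semantics unchanged.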