Upload folder using huggingface_hub
- config.json +1 -1
- esm_nv.py +101 -49
config.json
CHANGED
@@ -35,7 +35,7 @@
   "position_embedding_type": "rotary",
   "qkv_weight_interleaved": true,
   "token_dropout": true,
-  "transformers_version": "4.
+  "transformers_version": "4.57.0",
   "use_cache": true,
   "vocab_list": null,
   "vocab_size": 33
esm_nv.py
CHANGED
@@ -39,7 +39,7 @@ from transformers.modeling_outputs import (
 )
 from transformers.modeling_utils import PreTrainedModel
 from transformers.models.esm.configuration_esm import EsmConfig
-from transformers.models.esm.modeling_esm import
+from transformers.models.esm.modeling_esm import EsmPooler
 from transformers.utils import logging


@@ -153,16 +153,6 @@ class NVEsmEncoder(nn.Module):
         if config.position_embedding_type == "rotary":
             self.rotary_embeddings = RotaryPositionEmbedding(config.hidden_size // config.num_attention_heads)

-            # Keep on CPU, pin for faster non_blocking H2D; don't persist in state_dict.
-            if config.attn_input_format == "bshd":
-                self.register_buffer(
-                    "te_rope_emb",
-                    self.rotary_embeddings(max_seq_len=config.max_position_embeddings).cpu().pin_memory(),
-                    persistent=False,
-                )
-            else:
-                self.te_rope_emb = None
-
     def forward(
         self,
         hidden_states: torch.Tensor,
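For readers unfamiliar with the idiom being removed above: the old code precomputed the rotary table once, kept it in pinned host memory for fast non_blocking copies, and excluded it from the checkpoint. A minimal standalone sketch of that pattern (the buffer name and table contents here are illustrative only, not the model's actual code):

import torch
import torch.nn as nn

class RopeCache(nn.Module):
    """Illustrative only: precompute a table, pin it, keep it out of state_dict."""

    def __init__(self, max_seq_len: int = 1024):
        super().__init__()
        table = torch.arange(max_seq_len, dtype=torch.float32)
        if torch.cuda.is_available():
            table = table.pin_memory()  # enables async non_blocking host-to-device copies
        # persistent=False keeps the tensor out of state_dict / saved checkpoints.
        self.register_buffer("table", table, persistent=False)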
@@ -195,6 +185,7 @@ class NVEsmEncoder(nn.Module):
                     "THD expects embeddings shaped [1, total_tokens, hidden_size]."
                 )
             hidden_states = hidden_states.squeeze(0)
+            attention_mask = None

         elif self.config.attn_input_format == "bshd":
             if any(x is not None for x in [cu_seq_lens_q, cu_seq_lens_k, max_length_q, max_length_k]):
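For context on the THD ("total tokens") path touched above: variable-length sequences are packed into a single [1, total_tokens] row and described by cumulative sequence lengths rather than by padding plus an attention mask, which is why the mask is dropped once the batch is squeezed. A minimal sketch of building such a packed batch (the sequence lengths and token ids are made up for illustration):

import torch

# Three unpadded sequences of made-up token ids (lengths 5, 3 and 7).
seqs = [torch.randint(4, 24, (n,)) for n in (5, 3, 7)]

# Pack into one [1, total_tokens] row, as the THD branch expects.
input_ids = torch.cat(seqs).unsqueeze(0)  # shape [1, 15]

# Cumulative sequence lengths mark each sequence's boundaries: [0, 5, 8, 15].
seq_lens = torch.tensor([s.numel() for s in seqs], dtype=torch.int32)
cu_seq_lens = torch.nn.functional.pad(seq_lens.cumsum(0), (1, 0)).to(torch.int32)
max_length = int(seq_lens.max())  # 7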
@@ -202,28 +193,14 @@ class NVEsmEncoder(nn.Module):
                     "cu_seq_lens_q, cu_seq_lens_k, max_length_q, and max_length_k are not allowed when using BSHD inputs."
                 )

-
-
-        if self.config.
-
-
-
-
-
-                raise RuntimeError(
-                    f"ROPE length {te_rope_emb.size(0)} < input seq length {seq_len}. "
-                    f"Increase max_position_embeddings."
-                )
-            te_rope_emb = te_rope_emb[:seq_len]
-
-        elif self.config.attn_input_format == "thd":
-            assert cu_seq_lens_q is not None
-            te_rope_emb = self.rotary_embeddings(max_seq_len=cu_seq_lens_q[-1]).to(
-                device=hidden_states.device, dtype=hidden_states.dtype, non_blocking=True
-            )
-
-        else:
-            raise ValueError(f"Unsupported attention input format: {self.config.attn_input_format}")
+        # Ensure that rotary embeddings are computed at a higher precision outside the torch autocast context.
+        with torch.autocast(device_type="cuda", enabled=False):
+            if self.config.position_embedding_type == "rotary":
+                if self.config.attn_input_format == "bshd":
+                    te_rope_emb = self.rotary_embeddings(max_seq_len=hidden_states.shape[1])
+                elif self.config.attn_input_format == "thd":
+                    te_rope_emb = self.rotary_embeddings(max_seq_len=cu_seq_lens_q[-1])
+            te_rope_emb = te_rope_emb.to(hidden_states.device, dtype=hidden_states.dtype, non_blocking=True)

         for layer_module in self.layers:
             if output_hidden_states:
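The autocast-disabled block added above is plain PyTorch: it keeps the rotary frequency table in full precision even when the surrounding forward pass runs under bf16/fp16 autocast, and casts only afterwards. A standalone sketch of the idea (the frequency formula here is a generic RoPE formulation, not necessarily what TE's RotaryPositionEmbedding computes internally):

import torch

def rope_freqs(max_seq_len: int, dim: int) -> torch.Tensor:
    # Generic rotary frequencies, computed in float32 for accuracy.
    inv_freq = 1.0 / (10000.0 ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
    positions = torch.arange(max_seq_len, dtype=torch.float32)
    return torch.outer(positions, inv_freq)  # [max_seq_len, dim // 2]

with torch.autocast(device_type="cuda", enabled=False):
    # Even inside an outer autocast region, this block stays in float32.
    freqs = rope_freqs(max_seq_len=1024, dim=64)

# Cast only at the end, mirroring the .to(..., dtype=hidden_states.dtype) above.
freqs = freqs.to(torch.bfloat16)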
@@ -305,15 +282,10 @@ class NVEsmModel(NVEsmPreTrainedModel):
         super().__init__(config)
         self.config = config

-        # Create EsmEmbeddings with temporarily modified config to use padded vocab size
-        # This ensures the word embeddings layer uses the padded vocabulary size for FP8 support
-        original_vocab_size = config.vocab_size
-        config.vocab_size = config.padded_vocab_size
         # Ensure pad_token_id is set properly, defaulting to 0 if not specified
         if not hasattr(config, "pad_token_id") or config.pad_token_id is None:
             config.pad_token_id = 0
-        self.embeddings = EsmEmbeddings(config)
-        config.vocab_size = original_vocab_size  # Restore original vocab_size
+        self.embeddings = NVEsmEmbeddings(config)
         self.encoder = NVEsmEncoder(config)
         self.pooler = EsmPooler(config) if add_pooling_layer else None

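The removed lines existed only to build the word-embedding table with config.padded_vocab_size (rather than the real vocab_size of 33) so the weight shapes meet FP8 alignment requirements; the new NVEsmEmbeddings class now does this directly. Vocabulary padding of this kind is typically a round-up to a fixed multiple, sketched below (the multiple of 64 is an illustrative assumption, not necessarily what padded_vocab_size uses):

def pad_vocab_size(vocab_size: int, multiple: int = 64) -> int:
    # Round the vocabulary size up to the next multiple, e.g. 33 -> 64, so the
    # embedding / tied output projection dimensions satisfy FP8 alignment rules.
    return ((vocab_size + multiple - 1) // multiple) * multiple

assert pad_vocab_size(33) == 64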
@@ -337,7 +309,6 @@ class NVEsmModel(NVEsmPreTrainedModel):
         input_ids: Optional[torch.Tensor] = None,
         attention_mask: Optional[torch.Tensor] = None,
         position_ids: Optional[torch.Tensor] = None,
-        head_mask: Optional[torch.Tensor] = None,
         inputs_embeds: Optional[torch.Tensor] = None,
         output_hidden_states: Optional[bool] = None,
         cu_seq_lens_q: torch.IntTensor | None = None,
@@ -351,7 +322,6 @@ class NVEsmModel(NVEsmPreTrainedModel):
             input_ids (torch.Tensor): The input ids.
             attention_mask (torch.Tensor): The attention mask.
             position_ids (torch.Tensor): The position ids.
-            head_mask (torch.Tensor): The head mask.
             inputs_embeds (torch.Tensor): The input embeddings.
             output_hidden_states (bool): Whether to output the hidden states.
             cu_seq_lens_q (torch.IntTensor): The cumulative sequence lengths for the query state, if using THD inputs.
@@ -389,18 +359,14 @@ class NVEsmModel(NVEsmPreTrainedModel):
         # TE expects a boolean attention mask, where 1s are masked and 0s are not masked
         extended_attention_mask = extended_attention_mask < -1

-        # Prepare head mask if needed
-        # 1.0 in head_mask indicate we keep the head
-        # attention_probs has shape bsz x n_heads x N x N
-        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
-        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
-        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
-
         embedding_output = self.embeddings(
             input_ids=input_ids,
-            position_ids=position_ids,
             attention_mask=attention_mask,
             inputs_embeds=inputs_embeds,
+            cu_seq_lens_q=cu_seq_lens_q,
+            cu_seq_lens_k=cu_seq_lens_k,
+            max_length_q=max_length_q,
+            max_length_k=max_length_k,
         )
         encoder_outputs = self.encoder(
             embedding_output,
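On the context line about TE's mask convention: Hugging Face's additive "extended" mask uses 0 for visible positions and a large negative value for masked ones, while TransformerEngine wants a boolean mask with True meaning "masked", hence the `< -1` comparison. A simplified sketch of that conversion (shapes reduced to 2-D for clarity):

import torch

# HF-style padding mask: 1 = real token, 0 = padding.
attention_mask = torch.tensor([[1, 1, 1, 0, 0]])

# Additive "extended" mask: 0.0 where visible, a large negative number where masked.
extended = (1.0 - attention_mask.float()) * torch.finfo(torch.float32).min

# TE-style boolean mask: True = masked, False = visible.
te_mask = extended < -1
print(te_mask)  # tensor([[False, False, False,  True,  True]])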
@@ -547,3 +513,89 @@ class NVEsmLMHead(nn.Module):
         x = torch.nn.functional.gelu(x)
         x = self.decoder(x)
         return x
+
+
+class NVEsmEmbeddings(nn.Module):
+    """Modified version of EsmEmbeddings to support THD inputs."""
+
+    def __init__(self, config):
+        """Initialize a NVEsmEmbeddings."""
+        super().__init__()
+        self.word_embeddings = nn.Embedding(
+            config.padded_vocab_size, config.hidden_size, padding_idx=config.pad_token_id
+        )
+
+        self.layer_norm = (
+            nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) if config.emb_layer_norm_before else None
+        )
+
+        if config.position_embedding_type != "rotary":
+            raise ValueError(
+                "The TE-accelerated ESM-2 model only supports rotary position embeddings, received "
+                f"{config.position_embedding_type}"
+            )
+
+        self.padding_idx = config.pad_token_id
+        self.token_dropout = config.token_dropout
+        self.mask_token_id = config.mask_token_id
+
+    def forward(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        inputs_embeds=None,
+        cu_seq_lens_q: torch.IntTensor | None = None,
+        cu_seq_lens_k: torch.IntTensor | None = None,
+        max_length_q: int | None = None,
+        max_length_k: int | None = None,
+    ):
+        """Forward pass of the NVEsmEmbeddings."""
+        if inputs_embeds is None:
+            inputs_embeds = self.word_embeddings(input_ids)
+
+        # Note that if we want to support ESM-1 (not 1b!) in future then we need to support an
+        # embedding_scale factor here.
+        embeddings = inputs_embeds
+
+        if all(x is not None for x in [cu_seq_lens_q, cu_seq_lens_k, max_length_q, max_length_k]):
+            using_thd = True
+            attention_mask = None
+        else:
+            using_thd = False
+
+        # Matt: ESM has the option to handle masking in MLM in a slightly unusual way. If the token_dropout
+        # flag is False then it is handled in the same way as BERT/RoBERTa. If it is set to True, however,
+        # masked tokens are treated as if they were selected for input dropout and zeroed out.
+        # This "mask-dropout" is compensated for when masked tokens are not present, by scaling embeddings by
+        # a factor of (fraction of unmasked tokens during training) / (fraction of unmasked tokens in sample).
+        # This is analogous to the way that dropout layers scale down outputs during evaluation when not
+        # actually dropping out values (or, equivalently, scale up their un-dropped outputs in training).
+        if self.token_dropout and input_ids is not None:
+            embeddings = embeddings.masked_fill((input_ids == self.mask_token_id).unsqueeze(-1), 0.0)
+            mask_ratio_train = 0.15 * 0.8  # Hardcoded as the ratio used in all ESM model training runs
+
+            if not using_thd:
+                # BSHD token dropout correction
+                src_lengths = attention_mask.sum(-1) if attention_mask is not None else input_ids.shape[1]
+                n_masked_per_seq = (input_ids == self.mask_token_id).sum(-1).float()
+                mask_ratio_observed = n_masked_per_seq / src_lengths
+                scale_factor = (1 - mask_ratio_train) / (1 - mask_ratio_observed)
+                embeddings = (embeddings * scale_factor[:, None, None]).to(embeddings.dtype)
+
+            else:
+                src_lengths = torch.diff(cu_seq_lens_q)
+                # We need to find the number of masked tokens in each sequence in the packed batch.
+                is_masked = (input_ids == self.mask_token_id).squeeze(0)
+                n_masked_per_seq = torch.nested.nested_tensor_from_jagged(is_masked, offsets=cu_seq_lens_q).sum(1)
+                mask_ratio_observed = n_masked_per_seq.float() / src_lengths
+                scale_factor = (1 - mask_ratio_train) / (1 - mask_ratio_observed)
+                reshaped_scale_factor = torch.repeat_interleave(scale_factor, src_lengths, dim=0)
+                embeddings = (embeddings * reshaped_scale_factor.unsqueeze(-1)).to(embeddings.dtype)
+
+        if self.layer_norm is not None:
+            embeddings = self.layer_norm(embeddings)
+
+        if attention_mask is not None:
+            embeddings = (embeddings * attention_mask.unsqueeze(-1)).to(embeddings.dtype)
+
+        return embeddings
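The THD branch of the new token-dropout correction computes one scale factor per packed sequence and repeats it across that sequence's tokens. The same arithmetic can be checked with plain indexing over the offsets, as in the sketch below (torch.nested is avoided to keep it dependency-light, and the mask token id of 32 is assumed here to match ESM-2's tokenizer):

import torch

mask_token_id = 32             # assumption: ESM-2's <mask> id
mask_ratio_train = 0.15 * 0.8  # same constant as in the diff above

# Packed THD batch: two sequences of lengths 4 and 6, with 1 and 3 masked tokens.
input_ids = torch.tensor([[5, 32, 6, 7, 8, 32, 32, 9, 32, 10]])
cu_seq_lens = torch.tensor([0, 4, 10], dtype=torch.int32)

src_lengths = torch.diff(cu_seq_lens)               # tensor([4, 6])
is_masked = (input_ids == mask_token_id).squeeze(0)

# Per-sequence masked-token counts via segment sums over the offsets.
n_masked = torch.stack([is_masked[s:e].sum() for s, e in zip(cu_seq_lens[:-1], cu_seq_lens[1:])])
mask_ratio_observed = n_masked.float() / src_lengths        # tensor([0.25, 0.50])
scale = (1 - mask_ratio_train) / (1 - mask_ratio_observed)  # ~tensor([1.173, 1.760])

# Repeat each sequence's factor over its own tokens, as the diff does.
per_token_scale = torch.repeat_interleave(scale, src_lengths.long(), dim=0)
print(per_token_scale.shape)  # torch.Size([10])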