Upload folder using huggingface_hub

Browse files

Files changed (5) hide show

config.json +1 -0
esm_nv.py +119 -54
model-00001-of-00003.safetensors +2 -2
model-00003-of-00003.safetensors +2 -2
model.safetensors.index.json +2 -2

config.json CHANGED Viewed

@@ -29,6 +29,7 @@
   "num_attention_heads": 40,
   "num_hidden_layers": 36,
   "pad_token_id": 1,
   "position_embedding_type": "rotary",
   "qkv_weight_interleaved": true,
   "token_dropout": true,

   "num_attention_heads": 40,
   "num_hidden_layers": 36,
   "pad_token_id": 1,
+  "padded_vocab_size": 64,
   "position_embedding_type": "rotary",
   "qkv_weight_interleaved": true,
   "token_dropout": true,

esm_nv.py CHANGED Viewed

@@ -23,7 +23,7 @@
 Adapted from `modeling_esm.py` in huggingface/transformers.
 """
-from typing import Optional, Tuple, Union
 # TODO: put import guard around transformer_engine here, with an informative error message around
 # installation and the nvidia docker container.
@@ -35,7 +35,6 @@ from transformer_engine.pytorch.attention.rope import RotaryPositionEmbedding
 from transformers.modeling_outputs import (
     BaseModelOutput,
     BaseModelOutputWithPooling,
-    BaseModelOutputWithPoolingAndCrossAttentions,
     MaskedLMOutput,
 )
 from transformers.modeling_utils import PreTrainedModel
@@ -56,10 +55,11 @@ class NVEsmConfig(EsmConfig):
         self,
         qkv_weight_interleaved: bool = True,
         encoder_activation: str = "gelu",
-        attn_input_format: str = "bshd",
         fuse_qkv_params: bool = True,
         micro_batch_size: Optional[int] = None,
         max_seq_length: Optional[int] = None,
         **kwargs,
     ):
         """Initialize the NVEsmConfig with additional TE-related config options.
@@ -87,6 +87,8 @@ class NVEsmConfig(EsmConfig):
             max_seq_length: The maximum sequence length to use for the attention. This is needed for
                 JIT Warmup, a technique where jit fused functions are warmed up before training to
                 ensure the same kernels are used for forward propagation and the activation recompute phase.
             **kwargs: Additional config options to pass to EsmConfig.
         """
         super().__init__(**kwargs)
@@ -98,6 +100,15 @@ class NVEsmConfig(EsmConfig):
         self.micro_batch_size = micro_batch_size
         self.max_seq_length = max_seq_length
 class NVEsmEncoder(nn.Module):
     """NVEsmEncoder is a TransformerEngine-optimized ESM encoder."""
@@ -138,15 +149,26 @@ class NVEsmEncoder(nn.Module):
         self.emb_layer_norm_after = transformer_engine.pytorch.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
         if config.position_embedding_type == "rotary":
             self.rotary_embeddings = RotaryPositionEmbedding(config.hidden_size // config.num_attention_heads)
-            self.te_rope_emb = self.rotary_embeddings(max_seq_len=config.max_position_embeddings)
-        else:
-            self.te_rope_emb = None
     def forward(
         self,
         hidden_states: torch.Tensor,
         attention_mask: Optional[torch.Tensor] = None,
         output_hidden_states: bool = False,
     ):
         """Forward pass of the NVEsmEncoder.
@@ -154,14 +176,51 @@ class NVEsmEncoder(nn.Module):
             hidden_states (torch.Tensor): The hidden states.
             attention_mask (torch.Tensor): The attention mask.
             output_hidden_states (bool): Whether to output the hidden states.
         """
-        all_hidden_states = () if output_hidden_states else None
-        if self.te_rope_emb is not None:
-            te_rope_emb = self.te_rope_emb.to(hidden_states.device, non_blocking=True)
-            te_rope_emb = te_rope_emb[: hidden_states.shape[1]]
-        else:
-            te_rope_emb = None
         for layer_module in self.layers:
             if output_hidden_states:
@@ -171,6 +230,10 @@ class NVEsmEncoder(nn.Module):
                 hidden_states,
                 attention_mask,
                 rotary_pos_emb=te_rope_emb,
             )
         hidden_states = self.emb_layer_norm_after(hidden_states)
@@ -180,7 +243,7 @@ class NVEsmEncoder(nn.Module):
         return BaseModelOutput(
             last_hidden_state=hidden_states,
-            hidden_states=all_hidden_states,
         )
@@ -239,7 +302,15 @@ class NVEsmModel(NVEsmPreTrainedModel):
         super().__init__(config)
         self.config = config
         self.embeddings = EsmEmbeddings(config)
         self.encoder = NVEsmEncoder(config)
         self.pooler = EsmPooler(config) if add_pooling_layer else None
@@ -266,7 +337,11 @@ class NVEsmModel(NVEsmPreTrainedModel):
         head_mask: Optional[torch.Tensor] = None,
         inputs_embeds: Optional[torch.Tensor] = None,
         output_hidden_states: Optional[bool] = None,
-    ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]:
         """Forward pass of the NVEsmModel.
         Args:
@@ -276,25 +351,14 @@ class NVEsmModel(NVEsmPreTrainedModel):
             head_mask (torch.Tensor): The head mask.
             inputs_embeds (torch.Tensor): The input embeddings.
             output_hidden_states (bool): Whether to output the hidden states.
         Returns:
             BaseModelOutputWithPooling: The output of the model.
         """
-        r"""
-        encoder_hidden_states  (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
-            Sequence of hidden-states at the output of the last layer of the encoder. Used in the
-            cross-attention if the model is configured as a decoder.
-        encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
-            Mask to avoid performing attention on the padding token indices of the encoder input.
-            This mask is used in the cross-attention if the model is configured as a decoder. Mask
-            values selected in `[0, 1]`:
-            - 1 for tokens that are **not masked**,
-            - 0 for tokens that are **masked**.
-            Note that this mask is inverted when it is passed to TransformerEngine, which expects a
-            boolean mask where 1s are masked and 0s are not masked.
-        """
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
         )
@@ -339,6 +403,10 @@ class NVEsmModel(NVEsmPreTrainedModel):
             embedding_output,
             attention_mask=extended_attention_mask,
             output_hidden_states=output_hidden_states,
         )
         sequence_output = encoder_outputs[0]
         pooled_output = self.pooler(sequence_output) if self.pooler is not None else None
@@ -391,7 +459,11 @@ class NVEsmForMaskedLM(NVEsmPreTrainedModel):
         inputs_embeds: Optional[torch.FloatTensor] = None,
         labels: Optional[torch.LongTensor] = None,
         output_hidden_states: Optional[bool] = None,
-    ) -> Union[Tuple, MaskedLMOutput]:
         """Forward pass of the NVEsmForMaskedLM.
         Args:
@@ -401,34 +473,39 @@ class NVEsmForMaskedLM(NVEsmPreTrainedModel):
             inputs_embeds (torch.FloatTensor): The input embeddings.
             labels (torch.LongTensor): The labels.
             output_hidden_states (bool): Whether to output the hidden states.
         Returns:
             MaskedLMOutput: The output of the model.
         """
-        r"""
-        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
-            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
-            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
-            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
-        kwargs (`Dict[str, any]`, *optional*, defaults to `{}`):
-            Used to hide legacy arguments that have been deprecated.
-        """
         outputs = self.esm(
             input_ids,
             attention_mask=attention_mask,
             position_ids=position_ids,
             inputs_embeds=inputs_embeds,
             output_hidden_states=output_hidden_states,
         )
         sequence_output = outputs[0]
         prediction_scores = self.lm_head(sequence_output)
         masked_lm_loss = None
         if labels is not None:
             loss_fct = CrossEntropyLoss()
-            labels = labels.to(prediction_scores.device)
-            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
         return MaskedLMOutput(
             loss=masked_lm_loss,
@@ -436,18 +513,6 @@ class NVEsmForMaskedLM(NVEsmPreTrainedModel):
             hidden_states=outputs.hidden_states,
         )
-    def predict_contacts(self, tokens: torch.Tensor, attention_mask: torch.Tensor):
-        """Predict the contacts of the model.
-        Args:
-            tokens (torch.Tensor): The tokens.
-            attention_mask (torch.Tensor): The attention mask.
-        Returns:
-            torch.Tensor: The predicted contacts.
-        """
-        return self.esm.predict_contacts(tokens, attention_mask=attention_mask)
 class NVEsmLMHead(nn.Module):
     """ESM Head for masked language modeling using TransformerEngine."""
@@ -463,7 +528,7 @@ class NVEsmLMHead(nn.Module):
         self.decoder = transformer_engine.pytorch.LayerNormLinear(
             config.hidden_size,
-            config.vocab_size,
             bias=True,
             eps=config.layer_norm_eps,
         )

 Adapted from `modeling_esm.py` in huggingface/transformers.
 """
+from typing import Literal, Optional
 # TODO: put import guard around transformer_engine here, with an informative error message around
 # installation and the nvidia docker container.
 from transformers.modeling_outputs import (
     BaseModelOutput,
     BaseModelOutputWithPooling,
     MaskedLMOutput,
 )
 from transformers.modeling_utils import PreTrainedModel
         self,
         qkv_weight_interleaved: bool = True,
         encoder_activation: str = "gelu",
+        attn_input_format: Literal["bshd", "thd"] = "bshd",
         fuse_qkv_params: bool = True,
         micro_batch_size: Optional[int] = None,
         max_seq_length: Optional[int] = None,
+        padded_vocab_size: Optional[int] = 64,
         **kwargs,
     ):
         """Initialize the NVEsmConfig with additional TE-related config options.
             max_seq_length: The maximum sequence length to use for the attention. This is needed for
                 JIT Warmup, a technique where jit fused functions are warmed up before training to
                 ensure the same kernels are used for forward propagation and the activation recompute phase.
+            padded_vocab_size: The padded vocabulary size to support FP8. Defaults to 64; if None is
+                passed, falls back to vocab_size. Must be greater than or equal to vocab_size.
             **kwargs: Additional config options to pass to EsmConfig.
         """
         super().__init__(**kwargs)
         self.micro_batch_size = micro_batch_size
         self.max_seq_length = max_seq_length
+        # Set padded_vocab_size with default fallback to vocab_size
+        self.padded_vocab_size = padded_vocab_size if padded_vocab_size is not None else self.vocab_size
+        # Ensure padded_vocab_size is at least as large as vocab_size
+        if self.padded_vocab_size is not None and self.vocab_size is not None:
+            assert self.padded_vocab_size >= self.vocab_size, (
+                f"padded_vocab_size ({self.padded_vocab_size}) must be greater than or equal to vocab_size ({self.vocab_size})"
+            )
 class NVEsmEncoder(nn.Module):
     """NVEsmEncoder is a TransformerEngine-optimized ESM encoder."""
         self.emb_layer_norm_after = transformer_engine.pytorch.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
         if config.position_embedding_type == "rotary":
             self.rotary_embeddings = RotaryPositionEmbedding(config.hidden_size // config.num_attention_heads)
+            # Keep on CPU, pin for faster non_blocking H2D; don't persist in state_dict.
+            if config.attn_input_format == "bshd":
+                self.register_buffer(
+                    "te_rope_emb",
+                    self.rotary_embeddings(max_seq_len=config.max_position_embeddings).cpu().pin_memory(),
+                    persistent=False,
+                )
+            else:
+                self.te_rope_emb = None
     def forward(
         self,
         hidden_states: torch.Tensor,
         attention_mask: Optional[torch.Tensor] = None,
         output_hidden_states: bool = False,
+        cu_seq_lens_q: torch.IntTensor | None = None,
+        cu_seq_lens_k: torch.IntTensor | None = None,
+        max_length_q: int | None = None,
+        max_length_k: int | None = None,
     ):
         """Forward pass of the NVEsmEncoder.
             hidden_states (torch.Tensor): The hidden states.
             attention_mask (torch.Tensor): The attention mask.
             output_hidden_states (bool): Whether to output the hidden states.
+            cu_seq_lens_q (torch.IntTensor): The cumulative sequence lengths for the query state, if using THD inputs.
+            cu_seq_lens_k (torch.IntTensor): The cumulative sequence lengths for the key state, if using THD inputs.
+            max_length_q (int): The maximum length for the query state, if using THD inputs.
+            max_length_k (int): The maximum length for the key state, if using THD inputs.
         """
+        all_hidden_states: tuple[torch.Tensor, ...] = ()
+        if self.config.attn_input_format == "thd":
+            if any(x is None for x in [cu_seq_lens_q, cu_seq_lens_k, max_length_q, max_length_k]):
+                raise ValueError(
+                    "cu_seq_lens_q, cu_seq_lens_k, max_length_q, and max_length_k must be provided when using THD inputs."
+                )
+            assert hidden_states.dim() == 3 and hidden_states.size(0) == 1, (
+                "THD expects embeddings shaped [1, total_tokens, hidden_size]."
+            )
+            hidden_states = hidden_states.squeeze(0)
+        elif self.config.attn_input_format == "bshd":
+            if any(x is not None for x in [cu_seq_lens_q, cu_seq_lens_k, max_length_q, max_length_k]):
+                raise ValueError(
+                    "cu_seq_lens_q, cu_seq_lens_k, max_length_q, and max_length_k are not allowed when using BSHD inputs."
+                )
+        te_rope_emb = None
+        if self.config.position_embedding_type == "rotary":
+            if self.config.attn_input_format == "bshd":
+                te_rope_emb = self.te_rope_emb.to(
+                    device=hidden_states.device, dtype=hidden_states.dtype, non_blocking=True
+                )
+                seq_len = hidden_states.shape[1]
+                if te_rope_emb.size(0) < seq_len:
+                    raise RuntimeError(
+                        f"ROPE length {te_rope_emb.size(0)} < input seq length {seq_len}. "
+                        f"Increase max_position_embeddings."
+                    )
+                te_rope_emb = te_rope_emb[:seq_len]
+            elif self.config.attn_input_format == "thd":
+                assert cu_seq_lens_q is not None
+                te_rope_emb = self.rotary_embeddings(max_seq_len=cu_seq_lens_q[-1]).to(
+                    device=hidden_states.device, dtype=hidden_states.dtype, non_blocking=True
+                )
+            else:
+                raise ValueError(f"Unsupported attention input format: {self.config.attn_input_format}")
         for layer_module in self.layers:
             if output_hidden_states:
                 hidden_states,
                 attention_mask,
                 rotary_pos_emb=te_rope_emb,
+                cu_seqlens_q=cu_seq_lens_q,
+                cu_seqlens_kv=cu_seq_lens_k,
+                max_seqlen_q=max_length_q,
+                max_seqlen_kv=max_length_k,
             )
         hidden_states = self.emb_layer_norm_after(hidden_states)
         return BaseModelOutput(
             last_hidden_state=hidden_states,
+            hidden_states=all_hidden_states if all_hidden_states else None,
         )
         super().__init__(config)
         self.config = config
+        # Create EsmEmbeddings with temporarily modified config to use padded vocab size
+        # This ensures the word embeddings layer uses the padded vocabulary size for FP8 support
+        original_vocab_size = config.vocab_size
+        config.vocab_size = config.padded_vocab_size
+        # Ensure pad_token_id is set properly, defaulting to 0 if not specified
+        if not hasattr(config, "pad_token_id") or config.pad_token_id is None:
+            config.pad_token_id = 0
         self.embeddings = EsmEmbeddings(config)
+        config.vocab_size = original_vocab_size  # Restore original vocab_size
         self.encoder = NVEsmEncoder(config)
         self.pooler = EsmPooler(config) if add_pooling_layer else None
         head_mask: Optional[torch.Tensor] = None,
         inputs_embeds: Optional[torch.Tensor] = None,
         output_hidden_states: Optional[bool] = None,
+        cu_seq_lens_q: torch.IntTensor | None = None,
+        cu_seq_lens_k: torch.IntTensor | None = None,
+        max_length_q: int | None = None,
+        max_length_k: int | None = None,
+    ) -> BaseModelOutputWithPooling:
         """Forward pass of the NVEsmModel.
         Args:
             head_mask (torch.Tensor): The head mask.
             inputs_embeds (torch.Tensor): The input embeddings.
             output_hidden_states (bool): Whether to output the hidden states.
+            cu_seq_lens_q (torch.IntTensor): The cumulative sequence lengths for the query state, if using THD inputs.
+            cu_seq_lens_k (torch.IntTensor): The cumulative sequence lengths for the key state, if using THD inputs.
+            max_length_q (int): The maximum length for the query state, if using THD inputs.
+            max_length_k (int): The maximum length for the key state, if using THD inputs.
         Returns:
             BaseModelOutputWithPooling: The output of the model.
         """
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
         )
             embedding_output,
             attention_mask=extended_attention_mask,
             output_hidden_states=output_hidden_states,
+            cu_seq_lens_q=cu_seq_lens_q,
+            cu_seq_lens_k=cu_seq_lens_k,
+            max_length_q=max_length_q,
+            max_length_k=max_length_k,
         )
         sequence_output = encoder_outputs[0]
         pooled_output = self.pooler(sequence_output) if self.pooler is not None else None
         inputs_embeds: Optional[torch.FloatTensor] = None,
         labels: Optional[torch.LongTensor] = None,
         output_hidden_states: Optional[bool] = None,
+        cu_seq_lens_q: torch.IntTensor | None = None,
+        cu_seq_lens_k: torch.IntTensor | None = None,
+        max_length_q: int | None = None,
+        max_length_k: int | None = None,
+    ) -> MaskedLMOutput:
         """Forward pass of the NVEsmForMaskedLM.
         Args:
             inputs_embeds (torch.FloatTensor): The input embeddings.
             labels (torch.LongTensor): The labels.
             output_hidden_states (bool): Whether to output the hidden states.
+            cu_seq_lens_q (torch.IntTensor): The cumulative sequence lengths for the query state, if using THD inputs.
+            cu_seq_lens_k (torch.IntTensor): The cumulative sequence lengths for the key state, if using THD inputs.
+            max_length_q (int): The maximum length for the query state, if using THD inputs.
+            max_length_k (int): The maximum length for the key state, if using THD inputs.
         Returns:
             MaskedLMOutput: The output of the model.
         """
         outputs = self.esm(
             input_ids,
             attention_mask=attention_mask,
             position_ids=position_ids,
             inputs_embeds=inputs_embeds,
             output_hidden_states=output_hidden_states,
+            cu_seq_lens_q=cu_seq_lens_q,
+            cu_seq_lens_k=cu_seq_lens_k,
+            max_length_q=max_length_q,
+            max_length_k=max_length_k,
         )
         sequence_output = outputs[0]
         prediction_scores = self.lm_head(sequence_output)
+        # Truncate logits back to original vocab_size if padding was used
+        if self.config.padded_vocab_size != self.config.vocab_size:
+            prediction_scores = prediction_scores[..., : self.config.vocab_size]
         masked_lm_loss = None
         if labels is not None:
             loss_fct = CrossEntropyLoss()
+            masked_lm_loss = loss_fct(
+                prediction_scores.view(-1, self.config.vocab_size),
+                labels.to(prediction_scores.device).view(-1),
+            )
         return MaskedLMOutput(
             loss=masked_lm_loss,
             hidden_states=outputs.hidden_states,
         )
 class NVEsmLMHead(nn.Module):
     """ESM Head for masked language modeling using TransformerEngine."""
         self.decoder = transformer_engine.pytorch.LayerNormLinear(
             config.hidden_size,
+            config.padded_vocab_size if config.padded_vocab_size is not None else config.vocab_size,
             bias=True,
             eps=config.layer_norm_eps,
         )

model-00001-of-00003.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:556b6aa9e921bf1caf2826232e2f1dd7bf40e62b3f07275fc37880c97e3c2745
-size 4930807200

 version https://git-lfs.github.com/spec/v1
+oid sha256:7e8931ca9ef4b515e3ad09daca46bec50a98bc580e2a8a87dcd2a73a8ebbc0a6
+size 4931124640

model-00003-of-00003.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:af3f10ee45ecb126c2b94a0c06688d63a3f3f2249cee1ea4e5e967ba9e70c850
-size 1494863401

 version https://git-lfs.github.com/spec/v1
+oid sha256:cd83d7ac3fe52dde4fc414511ee0babb770422a19b26a6f83b6f4ab39a6e0114
+size 1494863525

model.safetensors.index.json CHANGED Viewed

@@ -1,7 +1,7 @@
 {
   "metadata": {
-    "total_parameters": 2839004193,
-    "total_size": 11356016905
   },
   "weight_map": {
     "esm.embeddings.word_embeddings.weight": "model-00001-of-00003.safetensors",

 {
   "metadata": {
+    "total_parameters": 2839083584,
+    "total_size": 11356334469
   },
   "weight_map": {
     "esm.embeddings.word_embeddings.weight": "model-00001-of-00003.safetensors",