hf-audio
/

xcodec-wavlm-mls

+{
+  "acoustic_model_config": {
+    "codebook_dim": 8,
+    "codebook_loss_weight": 1.0,
+    "codebook_size": 1024,
+    "commitment_loss_weight": 0.25,
+    "decoder_hidden_size": 1024,
+    "downsampling_ratios": [
+      8,
+      5,
+      4,
+      2
+    ],
+    "encoder_hidden_size": 64,
+    "hidden_size": 256,
+    "hop_length": 320,
+    "model_type": "dac",
+    "n_codebooks": 9,
+    "quantizer_dropout": 0,
+    "sampling_rate": 16000,
+    "upsampling_ratios": [
+      8,
+      5,
+      4,
+      2
+    ]
+  },
+  "architectures": [
+    "XcodecModel"
+  ],
+  "block_dilations": [
+    1,
+    1
+  ],
+  "channel_ratios": [
+    1,
+    1
+  ],
+  "codebook_dim": 1024,
+  "codebook_size": 1024,
+  "decoder_channels": 768,
+  "encoder_channels": 768,
+  "initializer_range": 0.02,
+  "input_channels": 768,
+  "kernel_size": 3,
+  "model_type": "xcodec",
+  "output_channels": 768,
+  "sample_rate": 16000,
+  "semantic_model_config": {
+    "_name_or_path": "microsoft/wavlm-base-plus",
+    "activation_dropout": 0.0,
+    "adapter_kernel_size": 3,
+    "adapter_stride": 2,
+    "add_adapter": false,
+    "apply_spec_augment": true,
+    "architectures": [
+      "WavLMModel"
+    ],
+    "attention_dropout": 0.1,
+    "classifier_proj_size": 256,
+    "codevector_dim": 256,
+    "contrastive_logits_temperature": 0.1,
+    "conv_bias": false,
+    "conv_dim": [
+      512,
+      512,
+      512,
+      512,
+      512,
+      512,
+      512
+    ],
+    "conv_kernel": [
+      10,
+      3,
+      3,
+      3,
+      3,
+      2,
+      2
+    ],
+    "conv_stride": [
+      5,
+      2,
+      2,
+      2,
+      2,
+      2,
+      2
+    ],
+    "ctc_loss_reduction": "sum",
+    "ctc_zero_infinity": false,
+    "diversity_loss_weight": 0.1,
+    "do_stable_layer_norm": false,
+    "feat_extract_activation": "gelu",
+    "feat_extract_norm": "group",
+    "feat_proj_dropout": 0.1,
+    "feat_quantizer_dropout": 0.0,
+    "final_dropout": 0.0,
+    "freeze_feat_extract_train": true,
+    "hidden_act": "gelu",
+    "hidden_dropout": 0.1,
+    "hidden_size": 768,
+    "initializer_range": 0.02,
+    "intermediate_size": 3072,
+    "layer_norm_eps": 1e-05,
+    "layerdrop": 0.05,
+    "mask_channel_length": 10,
+    "mask_channel_min_space": 1,
+    "mask_channel_other": 0.0,
+    "mask_channel_prob": 0.0,
+    "mask_channel_selection": "static",
+    "mask_feature_length": 10,
+    "mask_feature_min_masks": 0,
+    "mask_feature_prob": 0.0,
+    "mask_time_length": 10,
+    "mask_time_min_masks": 2,
+    "mask_time_min_space": 1,
+    "mask_time_other": 0.0,
+    "mask_time_prob": 0.05,
+    "mask_time_selection": "static",
+    "max_bucket_distance": 800,
+    "model_type": "wavlm",
+    "no_mask_channel_overlap": false,
+    "no_mask_time_overlap": false,
+    "num_adapter_layers": 3,
+    "num_attention_heads": 12,
+    "num_buckets": 320,
+    "num_codevector_groups": 2,
+    "num_codevectors_per_group": 320,
+    "num_conv_pos_embedding_groups": 16,
+    "num_conv_pos_embeddings": 128,
+    "num_ctc_classes": 80,
+    "num_feat_extract_layers": 7,
+    "num_hidden_layers": 12,
+    "num_negatives": 100,
+    "output_hidden_size": 768,
+    "proj_codevector_dim": 256,
+    "replace_prob": 0.5,
+    "tdnn_dilation": [
+      1,
+      2,
+      3,
+      1,
+      1
+    ],
+    "tdnn_dim": [
+      512,
+      512,
+      512,
+      512,
+      1500
+    ],
+    "tdnn_kernel": [
+      5,
+      3,
+      3,
+      1,
+      1
+    ],
+    "tokenizer_class": "Wav2Vec2CTCTokenizer",
+    "torch_dtype": "float32",
+    "use_weighted_layer_sum": false,
+    "vocab_size": 32,
+    "xvector_output_dim": 512
+  },
+  "strides": [
+    1,
+    1
+  ],
+  "target_bandwidths": [
+    0.5,
+    1,
+    1.5,
+    2,
+    4
+  ],
+  "torch_dtype": "float32",
+  "transformers_version": "4.56.0.dev0",
+  "unit_kernel_size": 3
+}

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9ae25e265e56ad9916a4ff972a6262b8d03081cb10524795e0b83d9f8219af59
+size 710037116