"""
Stage 2: Expand Qwen3 from 64 to 80 layers using simple duplication

Mapping:
- Layers 0-23  → 0-23  (unchanged)
- Layers 24-39 → 24-55 (each layer duplicated once)
- Layers 40-63 → 56-79 (unchanged)
"""
import torch
import os
import json
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer
from safetensors.torch import load_file, save_file
import numpy as np
from collections import OrderedDict
import shutil


INPUT_DIR = "./Qwen3-58B-Embiggened"   # Stage 1 output
OUTPUT_DIR = "./Qwen3-72B-Embiggened"
TARGET_LAYERS = 80
SOURCE_LAYERS = 64
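

# Illustrative sketch (not used elsewhere in this script): the layer mapping from
# the module docstring, written as a function from an old layer index to the new
# index (or indices) it lands on. The name _mapping_sketch is ours, not the script's.
def _mapping_sketch(old_idx):
    if old_idx < 24:
        return [old_idx]                 # layers 0-23 unchanged
    if old_idx < 40:
        new = 24 + 2 * (old_idx - 24)
        return [new, new + 1]            # layers 24-39 each appear twice (24-55)
    return [old_idx + 16]                # layers 40-63 shift to 56-79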


def load_model_sharted(model_path):
    """Load model weights from sharted safetensors files."""
    print("\n💩 Loading sharted weights...")

    index_path = os.path.join(model_path, "model.safetensors.index.json")

    if not os.path.exists(index_path):
        raise FileNotFoundError(f"No index file found at {index_path}")

    with open(index_path, 'r') as f:
        index = json.load(f)

    # The index maps each tensor name to the file containing it; load each
    # unique file once and merge everything into a single dict.
    weight_map = index['weight_map']
    unique_files = set(weight_map.values())

    all_weights = {}
    for file in tqdm(unique_files, desc="Loading sharts"):
        file_path = os.path.join(model_path, file)
        weights = load_file(file_path)
        all_weights.update(weights)

    return all_weights
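

# For reference, the index file read above (and written below) follows the
# standard safetensors layout, e.g. (tensor and file names illustrative):
#   {"metadata": {"total_size": 123456789},
#    "weight_map": {"model.embed_tokens.weight": "model-00001-of-00014.safetensors", ...}}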


def save_model_sharted(state_dict, output_dir, max_shart_size="5GB"):
    """Save model in sharted safetensors format."""
    print("\n💩 Sharting model weights...")

    os.makedirs(output_dir, exist_ok=True)

    # Parse the size limit string (e.g. "5GB") into bytes.
    size_map = {'GB': 1e9, 'MB': 1e6}
    for unit, multiplier in size_map.items():
        if unit in max_shart_size:
            max_bytes = int(float(max_shart_size.replace(unit, '')) * multiplier)
            break
    else:
        # Guard against units the map doesn't cover, so max_bytes is never unbound.
        raise ValueError(f"Unrecognized shart size: {max_shart_size}")

    # Greedily pack tensors into sharts up to the size limit.
    sharts = []
    current_shart = {}
    current_size = 0

    for name, tensor in state_dict.items():
        tensor_size = tensor.numel() * tensor.element_size()

        if current_size + tensor_size > max_bytes and current_shart:
            sharts.append(current_shart)
            current_shart = {}
            current_size = 0

        current_shart[name] = tensor
        current_size += tensor_size

    if current_shart:
        sharts.append(current_shart)

    # Save each shart and build the tensor-name → file-name map for the index.
    weight_map = {}
    for i, shart in enumerate(tqdm(sharts, desc="Saving sharts")):
        shart_name = f"model-{i+1:05d}-of-{len(sharts):05d}.safetensors"
        save_file(shart, os.path.join(output_dir, shart_name))

        for name in shart:
            weight_map[name] = shart_name

    index = {
        "metadata": {"total_size": sum(t.numel() * t.element_size() for t in state_dict.values())},
        "weight_map": weight_map
    }

    with open(os.path.join(output_dir, "model.safetensors.index.json"), 'w') as f:
        json.dump(index, f, indent=2)

    print(f"💩 Successfully sharted into {len(sharts)} files!")
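

# Usage sketch (paths hypothetical): save_model_sharted(weights, "./out", max_shart_size="2GB")
# writes model-00001-of-000NN.safetensors sharts plus model.safetensors.index.json.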


def extract_layer_weights(weights, layer_idx):
    """Extract all weights for a specific layer."""
    layer_weights = OrderedDict()
    prefix = f"model.layers.{layer_idx}."

    for name, tensor in weights.items():
        if name.startswith(prefix):
            # Strip the layer prefix so the weights can be re-keyed later.
            component_name = name[len(prefix):]
            layer_weights[component_name] = tensor

    return layer_weights


def create_layer_weights(layer_weights, new_layer_idx):
    """Create weight dict with new layer index."""
    result = OrderedDict()
    prefix = f"model.layers.{new_layer_idx}."

    for component_name, tensor in layer_weights.items():
        full_name = prefix + component_name
        result[full_name] = tensor.clone()  # clone so duplicated layers don't share storage

    return result
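

# Together these helpers re-key a layer's tensors, e.g. when old layer 24 is
# written a second time as new layer 25:
#   "model.layers.24.self_attn.q_proj.weight" → "model.layers.25.self_attn.q_proj.weight"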


def verify_architecture(model_path):
    """Verify the model architecture matches expected Qwen3-72B dimensions."""
    print("\n" + "="*60)
    print("ARCHITECTURE VERIFICATION")
    print("="*60)

    print("\nLoading model for verification...")
    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        torch_dtype=torch.bfloat16,
        device_map="cpu",
        trust_remote_code=True
    )

    expected = {
        "lm_head.weight": (151936, 8192),
        "model.embed_tokens.weight": (151936, 8192),
        "model.layers.0.input_layernorm.weight": (8192,),
        "model.layers.0.mlp.down_proj.weight": (8192, 29568),
        "model.layers.0.mlp.gate_proj.weight": (29568, 8192),
        "model.layers.0.mlp.up_proj.weight": (29568, 8192),
        "model.layers.0.post_attention_layernorm.weight": (8192,),
        "model.layers.0.self_attn.k_norm.weight": (128,),
        "model.layers.0.self_attn.k_proj.weight": (1024, 8192),
        "model.layers.0.self_attn.o_proj.weight": (8192, 8192),
        "model.layers.0.self_attn.q_norm.weight": (128,),
        "model.layers.0.self_attn.q_proj.weight": (8192, 8192),
        "model.layers.0.self_attn.v_proj.weight": (1024, 8192),
        "model.norm.weight": (8192,),
    }
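
    # Shape sanity notes: with 64 attention heads and 8 KV heads at head_dim 128
    # (the architecture printed at the end of main()), q_proj has 64*128 = 8192
    # rows while k_proj/v_proj have 8*128 = 1024, matching the shapes above.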

    all_correct = True

    # Build the name → parameter map once rather than rebuilding it per check.
    param_dict = dict(model.named_parameters())

    # Spot-check layers at the boundaries of the duplicated 24-55 region.
    check_layers = [0, 24, 25, 39, 40, 56, 79]

    for layer_idx in check_layers:
        print(f"\n🔍 Checking layer {layer_idx}:")
        for base_name, expected_shape in expected.items():
            if "layers.0." in base_name:
                name = base_name.replace("layers.0.", f"layers.{layer_idx}.")
                if name in param_dict:
                    actual_shape = tuple(param_dict[name].shape)
                    if actual_shape == expected_shape:
                        print(f"   ✅ {name.split('.')[-1]}: {actual_shape}")
                    else:
                        print(f"   ❌ {name}: {actual_shape} (expected {expected_shape})")
                        all_correct = False

    num_layers = model.config.num_hidden_layers
    print(f"\nTotal layers: {num_layers} (expected: 80)")

    if all_correct and num_layers == 80:
        print("\n✅ Architecture verification PASSED!")
    else:
        print("\n❌ Architecture verification FAILED!")

    del model
    torch.cuda.empty_cache()
    return all_correct


def run_diagnostics(model_path):
    """Run comprehensive diagnostics on the expanded model."""
    print("\n" + "="*60)
    print("COMPREHENSIVE DIAGNOSTICS")
    print("="*60)

    print("\nLoading model for diagnostics...")
    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        trust_remote_code=True
    )
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

    print("\n🧪 Generation Quality Tests:")
    test_cases = [
        ("The capital of France is", ["Paris"]),
        ("2 + 2 =", ["4", "four"]),
        ("The quick brown fox", ["jumps", "jumped", "lazy", "dog"]),
        ("Hello, my name is", None),
        ("Water boils at", ["100", "212", "degrees"]),
        ("The Earth orbits the", ["Sun", "solar"]),
        ("Machine learning is a type of", ["artificial intelligence", "AI"]),
        ("Python is a", ["programming", "language", "snake"]),
        ("The largest planet is", ["Jupiter"]),
        ("DNA stands for", ["deoxyribonucleic", "acid"]),
        ("The derivative of x squared is", ["2x", "two"]),
        ("Shakespeare wrote", ["plays", "Hamlet", "Romeo"]),
        ("The speed of light is", ["299", "300", "fast"]),
        ("Photosynthesis converts", ["light", "energy", "carbon"]),
        ("The Pythagorean theorem states", ["a²", "squared", "hypotenuse"]),
    ]

    device = model.device
    coherent_count = 0
    total_tests = len(test_cases)

    for prompt, expected in test_cases:
        inputs = tokenizer(prompt, return_tensors="pt").to(device)

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=20,
                do_sample=True,
                temperature=0.7,
                top_k=50,
                top_p=0.95,
                pad_token_id=tokenizer.pad_token_id,
            )

        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        generated_only = generated_text[len(prompt):].strip()

        print(f"\n   Prompt: '{prompt}'")
        print(f"   Generated: '{generated_only}'")

        is_coherent = True

        # Crude repetition check: flag output where over half the words repeat.
        words = generated_only.split()
        if len(words) > 3:
            if len(set(words)) < len(words) / 2:
                print("   ⚠️  High repetition detected")
                is_coherent = False

        if expected and len(generated_only) > 0:
            found = any(kw.lower() in generated_only.lower() for kw in expected)
            if found:
                print("   ✅ Contains expected content")
            else:
                print("   ⚠️  Missing expected keywords")
                is_coherent = False

        if is_coherent and len(generated_only.split()) >= 2:
            coherent_count += 1

    coherence_rate = (coherent_count / total_tests) * 100
    print(f"\n📊 Overall coherence rate: {coherence_rate:.1f}%")

    print("\n📉 Perplexity Test:")
    test_texts = [
        "The quick brown fox jumps over the lazy dog.",
        "In the beginning was the Word, and the Word was with God.",
        "To be or not to be, that is the question.",
        "E equals m c squared is Einstein's famous equation.",
    ]
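
    # Perplexity note: with labels equal to the input ids, outputs.loss is the
    # mean next-token cross-entropy, so torch.exp(loss) below is the per-token
    # perplexity of each snippet under the model.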
    perplexities = []
    for test_text in test_texts:
        inputs = tokenizer(test_text, return_tensors="pt").to(device)

        with torch.no_grad():
            outputs = model(**inputs, labels=inputs["input_ids"])
            perplexity = torch.exp(outputs.loss).item()
            perplexities.append(perplexity)

        print(f"   '{test_text[:30]}...': {perplexity:.2f}")

    avg_perplexity = np.mean(perplexities)
    print(f"\n   Average perplexity: {avg_perplexity:.2f}")

    if avg_perplexity > 100:
        print("   ⚠️  Very high perplexity")
    elif avg_perplexity > 50:
        print("   ⚠️  Moderately high perplexity")
    else:
        print("   ✅ Reasonable perplexity")

    print("\n🧬 Duplicate Layer Analysis:")
    print("Checking if duplicated layers maintain reasonable behavior...")

    test_input = "The meaning of life is"
    inputs = tokenizer(test_input, return_tensors="pt").to(device)

    activations = {}
    hooks = []

    def get_activation(name):
        def hook(model, input, output):
            activations[name] = output[0].detach()
        return hook

    # Hook the first duplicate pair (24/25) and a non-duplicate pair (39/40).
    for layer_idx in [24, 25, 39, 40]:
        hook = model.model.layers[layer_idx].register_forward_hook(
            get_activation(f'layer_{layer_idx}')
        )
        hooks.append(hook)

    with torch.no_grad():
        _ = model(**inputs)

    for hook in hooks:
        hook.remove()

    if len(activations) >= 4:
        act_24 = activations['layer_24'].flatten()
        act_25 = activations['layer_25'].flatten()
        similarity_24_25 = torch.cosine_similarity(act_24.unsqueeze(0), act_25.unsqueeze(0)).item()

        act_39 = activations['layer_39'].flatten()
        act_40 = activations['layer_40'].flatten()
        similarity_39_40 = torch.cosine_similarity(act_39.unsqueeze(0), act_40.unsqueeze(0)).item()

        print(f"   Cosine similarity layer 24 vs 25 (duplicate): {similarity_24_25:.4f}")
        print(f"   Cosine similarity layer 39 vs 40 (different): {similarity_39_40:.4f}")
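
        # Adjacent hidden states in a residual network are already highly similar,
        # and the 24/25 pair additionally shares identical weights, so very high
        # similarity is expected here; 0.95 is a loose heuristic threshold.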
        if similarity_24_25 > 0.95:
            print("   ✅ Duplicate layers show expected high similarity")
        else:
            print("   ⚠️  Duplicate layers diverged more than expected")

    print("\n📊 Weight Statistics (checking for anomalies):")
    anomalies = 0

    for name, param in model.named_parameters():
        if torch.isnan(param).any():
            print(f"   ⚠️  {name}: Contains NaN!")
            anomalies += 1
        elif torch.isinf(param).any():
            print(f"   ⚠️  {name}: Contains Inf!")
            anomalies += 1
        elif param.std() < 1e-8:
            print(f"   ⚠️  {name}: Zero variance!")
            anomalies += 1

    if anomalies == 0:
        print("   ✅ No anomalies detected in weights")

    success = coherence_rate >= 60 and avg_perplexity < 100 and anomalies == 0

    print("\n" + "="*60)
    print("DIAGNOSTIC SUMMARY")
    print("="*60)

    if success:
        print("✅ Model passed all diagnostics!")
        print("   - Good coherence rate")
        print("   - Reasonable perplexity")
        print("   - No weight anomalies")
        print("   - Duplicate layers functioning correctly")
    else:
        print("⚠️  Some issues detected:")
        if coherence_rate < 60:
            print(f"   - Low coherence rate: {coherence_rate:.1f}%")
        if avg_perplexity >= 100:
            print(f"   - High average perplexity: {avg_perplexity:.2f}")
        if anomalies > 0:
            print(f"   - Weight anomalies: {anomalies}")

    del model
    torch.cuda.empty_cache()
    return success


def main():
    print("="*60)
    print("Stage 2: Simple Layer Duplication")
    print("64 layers → 80 layers")
    print("="*60)

    print(f"\n📥 Loading model from: {INPUT_DIR}")
    weights = load_model_sharted(INPUT_DIR)

    print(f"\n📊 Loaded {len(weights)} tensors")

    new_weights = OrderedDict()

    # Non-layer weights (embeddings, final norm, lm_head) carry over unchanged.
    print("\n📋 Copying non-layer weights...")
    for name, tensor in weights.items():
        if not name.startswith("model.layers."):
            new_weights[name] = tensor.clone()

    print("\n🔄 Expanding layers with simple duplication...")
    print("   Layers 0-23: Direct copy")
    print("   Layers 24-39: Each layer duplicated once")
    print("   Layers 40-63: Direct copy (shifted to 56-79)")

    new_layer_idx = 0

    with tqdm(total=TARGET_LAYERS, desc="Creating layers") as pbar:
        # Layers 0-23: copy through unchanged.
        for old_idx in range(24):
            layer_weights = extract_layer_weights(weights, old_idx)
            new_weights.update(create_layer_weights(layer_weights, new_layer_idx))
            new_layer_idx += 1
            pbar.update(1)

        # Layers 24-39: write each layer once, then write it again as its duplicate.
        for old_idx in range(24, 40):
            layer_weights = extract_layer_weights(weights, old_idx)
            new_weights.update(create_layer_weights(layer_weights, new_layer_idx))
            new_layer_idx += 1
            pbar.update(1)

            print(f"\n   Duplicating layer {old_idx} → layer {new_layer_idx}")
            new_weights.update(create_layer_weights(layer_weights, new_layer_idx))
            new_layer_idx += 1
            pbar.update(1)

        # Layers 40-63: copy through, shifted to 56-79.
        for old_idx in range(40, 64):
            layer_weights = extract_layer_weights(weights, old_idx)
            new_weights.update(create_layer_weights(layer_weights, new_layer_idx))
            new_layer_idx += 1
            pbar.update(1)

    print(f"\n✅ Created {new_layer_idx} layers")

    if new_layer_idx != TARGET_LAYERS:
        print(f"\n❌ ERROR: Created {new_layer_idx} layers but expected {TARGET_LAYERS}")
        print("Layer creation failed. Exiting.")
        return False

    print("\n📝 Updating model configuration...")
    config_path = os.path.join(INPUT_DIR, "config.json")
    with open(config_path, 'r') as f:
        config = json.load(f)

    config['num_hidden_layers'] = TARGET_LAYERS

    print(f"\n💾 Saving expanded model to: {OUTPUT_DIR}")
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    with open(os.path.join(OUTPUT_DIR, "config.json"), 'w') as f:
        json.dump(config, f, indent=2)

    # Copy the tokenizer and generation config alongside the weights.
    tokenizer_files = [
        'tokenizer.json', 'tokenizer_config.json',
        'special_tokens_map.json', 'generation_config.json'
    ]

    for file in tokenizer_files:
        src = os.path.join(INPUT_DIR, file)
        dst = os.path.join(OUTPUT_DIR, file)
        if os.path.exists(src):
            shutil.copy(src, dst)

    save_model_sharted(new_weights, OUTPUT_DIR)

    # Record how this checkpoint was produced.
    metadata = {
        "stage": "2-duplicate",
        "source_model": INPUT_DIR,
        "method": "Simple layer duplication",
        "layer_mapping": {
            "0-23": "0-23 (unchanged)",
            "24-39": "24-55 (each duplicated once)",
            "40-63": "56-79 (unchanged)"
        },
        "duplication_info": {
            "method": "exact_copy",
            "layers_duplicated": list(range(24, 40))
        },
        "final_layers": TARGET_LAYERS
    }

    with open(os.path.join(OUTPUT_DIR, "stage2_metadata.json"), 'w') as f:
        json.dump(metadata, f, indent=2)

    print("\n✅ Stage 2 duplication complete!")

    print("\n🔍 Quick verification:")
    print(f"   Total weights: {len(new_weights)}")

    # Count layers via a tensor that occurs exactly once per layer.
    layer_count = 0
    for name in new_weights.keys():
        if name.startswith("model.layers.") and ".input_layernorm.weight" in name:
            layer_count += 1

    print(f"   Layer count: {layer_count} (expected: {TARGET_LAYERS})")

    print("\n🧬 Checking layer duplication:")
    test_component = "self_attn.q_proj.weight"

    if f"model.layers.24.{test_component}" in new_weights and f"model.layers.25.{test_component}" in new_weights:
        layer24 = new_weights[f"model.layers.24.{test_component}"]
        layer25 = new_weights[f"model.layers.25.{test_component}"]

        if torch.equal(layer24, layer25):
            print("   ✅ Layer 24 and 25 are identical (as expected)")
        else:
            print("   ⚠️  Layer 24 and 25 differ (unexpected!)")

    print(f"\n🎉 SUCCESS! Model expanded to {TARGET_LAYERS} layers.")
    print(f"📁 Output saved to: {OUTPUT_DIR}")

    arch_ok = verify_architecture(OUTPUT_DIR)
    diag_ok = run_diagnostics(OUTPUT_DIR)

    if arch_ok and diag_ok:
        print("\n🎉 FINAL SUCCESS! Your Qwen3-72B-DupeLayers model is ready and verified!")
        print("\n📋 Final architecture:")
        print("   Hidden size: 8192")
        print("   Intermediate size: 29568")
        print("   Attention heads: 64")
        print("   KV heads: 8")
        print("   Layers: 80")
        print("   Vocabulary: 151936")
        print("\n💡 The model has passed all quality checks and is ready for use!")
    else:
        print("\n⚠️  Some verification issues detected. Please review the diagnostics above.")

    return arch_ok and diag_ok


if __name__ == "__main__":
    success = main()
    exit(0 if success else 1)
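
# Usage note (script filename hypothetical): run `python stage2_duplicate.py` from
# the directory containing ./Qwen3-58B-Embiggened. Both the source and expanded
# state dicts are held in RAM before sharting, so budget memory for roughly two
# full copies of the model's weights.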