"""
Stage 2: Expand Qwen3 from 64 to 80 layers using simple duplication

Mapping:
- Layers 0-23  → 0-23  (unchanged)
- Layers 24-39 → 24-55 (each layer duplicated once)
- Layers 40-63 → 56-79 (unchanged)
"""
import torch
import os
import json
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer
from safetensors.torch import load_file, save_file
import numpy as np
from collections import OrderedDict
import shutil


INPUT_DIR = "./Qwen3-58B-Embiggened"   # Stage 1 output
OUTPUT_DIR = "./Qwen3-72B-Embiggened"
TARGET_LAYERS = 80
SOURCE_LAYERS = 64
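

# Illustrative sketch (not used elsewhere in this script): the layer mapping from
# the module docstring, written as a function from an old layer index to the new
# index (or indices) it lands on. The name _mapping_sketch is ours, not the script's.
def _mapping_sketch(old_idx):
    if old_idx < 24:
        return [old_idx]                 # layers 0-23 unchanged
    if old_idx < 40:
        new = 24 + 2 * (old_idx - 24)
        return [new, new + 1]            # layers 24-39 each appear twice (24-55)
    return [old_idx + 16]                # layers 40-63 shift to 56-79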


def load_model_sharted(model_path):
    """Load model weights from sharted safetensors files."""
    print("\n💩 Loading sharted weights...")

    index_path = os.path.join(model_path, "model.safetensors.index.json")

    if not os.path.exists(index_path):
        raise FileNotFoundError(f"No index file found at {index_path}")

    with open(index_path, 'r') as f:
        index = json.load(f)

    # The index maps each tensor name to the file containing it; load each
    # unique file once and merge everything into a single dict.
    weight_map = index['weight_map']
    unique_files = set(weight_map.values())

    all_weights = {}
    for file in tqdm(unique_files, desc="Loading sharts"):
        file_path = os.path.join(model_path, file)
        weights = load_file(file_path)
        all_weights.update(weights)

    return all_weights
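

# For reference, the index file read above (and written below) follows the
# standard safetensors layout, e.g. (tensor and file names illustrative):
#   {"metadata": {"total_size": 123456789},
#    "weight_map": {"model.embed_tokens.weight": "model-00001-of-00014.safetensors", ...}}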


def save_model_sharted(state_dict, output_dir, max_shart_size="5GB"):
    """Save model in sharted safetensors format."""
    print("\n💩 Sharting model weights...")

    os.makedirs(output_dir, exist_ok=True)

    # Parse the size limit string (e.g. "5GB") into bytes.
    size_map = {'GB': 1e9, 'MB': 1e6}
    for unit, multiplier in size_map.items():
        if unit in max_shart_size:
            max_bytes = int(float(max_shart_size.replace(unit, '')) * multiplier)
            break
    else:
        # Guard against units the map doesn't cover, so max_bytes is never unbound.
        raise ValueError(f"Unrecognized shart size: {max_shart_size}")

    # Greedily pack tensors into sharts up to the size limit.
    sharts = []
    current_shart = {}
    current_size = 0

    for name, tensor in state_dict.items():
        tensor_size = tensor.numel() * tensor.element_size()

        if current_size + tensor_size > max_bytes and current_shart:
            sharts.append(current_shart)
            current_shart = {}
            current_size = 0

        current_shart[name] = tensor
        current_size += tensor_size

    if current_shart:
        sharts.append(current_shart)

    # Save each shart and build the tensor-name → file-name map for the index.
    weight_map = {}
    for i, shart in enumerate(tqdm(sharts, desc="Saving sharts")):
        shart_name = f"model-{i+1:05d}-of-{len(sharts):05d}.safetensors"
        save_file(shart, os.path.join(output_dir, shart_name))

        for name in shart:
            weight_map[name] = shart_name

    index = {
        "metadata": {"total_size": sum(t.numel() * t.element_size() for t in state_dict.values())},
        "weight_map": weight_map
    }

    with open(os.path.join(output_dir, "model.safetensors.index.json"), 'w') as f:
        json.dump(index, f, indent=2)

    print(f"💩 Successfully sharted into {len(sharts)} files!")
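

# Usage sketch (paths hypothetical): save_model_sharted(weights, "./out", max_shart_size="2GB")
# writes model-00001-of-000NN.safetensors sharts plus model.safetensors.index.json.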


def extract_layer_weights(weights, layer_idx):
    """Extract all weights for a specific layer."""
    layer_weights = OrderedDict()
    prefix = f"model.layers.{layer_idx}."

    for name, tensor in weights.items():
        if name.startswith(prefix):
            # Strip the layer prefix so the weights can be re-keyed later.
            component_name = name[len(prefix):]
            layer_weights[component_name] = tensor

    return layer_weights


def create_layer_weights(layer_weights, new_layer_idx):
    """Create weight dict with new layer index."""
    result = OrderedDict()
    prefix = f"model.layers.{new_layer_idx}."

    for component_name, tensor in layer_weights.items():
        full_name = prefix + component_name
        result[full_name] = tensor.clone()  # clone so duplicated layers don't share storage

    return result
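

# Together these helpers re-key a layer's tensors, e.g. when old layer 24 is
# written a second time as new layer 25:
#   "model.layers.24.self_attn.q_proj.weight" → "model.layers.25.self_attn.q_proj.weight"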


def verify_architecture(model_path):
    """Verify the model architecture matches expected Qwen3-72B dimensions."""
    print("\n" + "="*60)
    print("ARCHITECTURE VERIFICATION")
    print("="*60)

    print("\nLoading model for verification...")
    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        torch_dtype=torch.bfloat16,
        device_map="cpu",
        trust_remote_code=True
    )

    expected = {
        "lm_head.weight": (151936, 8192),
        "model.embed_tokens.weight": (151936, 8192),
        "model.layers.0.input_layernorm.weight": (8192,),
        "model.layers.0.mlp.down_proj.weight": (8192, 29568),
        "model.layers.0.mlp.gate_proj.weight": (29568, 8192),
        "model.layers.0.mlp.up_proj.weight": (29568, 8192),
        "model.layers.0.post_attention_layernorm.weight": (8192,),
        "model.layers.0.self_attn.k_norm.weight": (128,),
        "model.layers.0.self_attn.k_proj.weight": (1024, 8192),
        "model.layers.0.self_attn.o_proj.weight": (8192, 8192),
        "model.layers.0.self_attn.q_norm.weight": (128,),
        "model.layers.0.self_attn.q_proj.weight": (8192, 8192),
        "model.layers.0.self_attn.v_proj.weight": (1024, 8192),
        "model.norm.weight": (8192,),
    }
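
    # Shape sanity notes: with 64 attention heads and 8 KV heads at head_dim 128
    # (the architecture printed at the end of main()), q_proj has 64*128 = 8192
    # rows while k_proj/v_proj have 8*128 = 1024, matching the shapes above.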

    all_correct = True

    # Build the name → parameter map once rather than rebuilding it per check.
    param_dict = dict(model.named_parameters())

    # Spot-check layers at the boundaries of the duplicated 24-55 region.
    check_layers = [0, 24, 25, 39, 40, 56, 79]

    for layer_idx in check_layers:
        print(f"\n🔍 Checking layer {layer_idx}:")
        for base_name, expected_shape in expected.items():
            if "layers.0." in base_name:
                name = base_name.replace("layers.0.", f"layers.{layer_idx}.")
                if name in param_dict:
                    actual_shape = tuple(param_dict[name].shape)
                    if actual_shape == expected_shape:
                        print(f"   ✅ {name.split('.')[-1]}: {actual_shape}")
                    else:
                        print(f"   ❌ {name}: {actual_shape} (expected {expected_shape})")
                        all_correct = False

    num_layers = model.config.num_hidden_layers
    print(f"\nTotal layers: {num_layers} (expected: 80)")

    if all_correct and num_layers == 80:
        print("\n✅ Architecture verification PASSED!")
    else:
        print("\n❌ Architecture verification FAILED!")

    del model
    torch.cuda.empty_cache()
    return all_correct


def run_diagnostics(model_path):
    """Run comprehensive diagnostics on the expanded model."""
    print("\n" + "="*60)
    print("COMPREHENSIVE DIAGNOSTICS")
    print("="*60)

    print("\nLoading model for diagnostics...")
    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        trust_remote_code=True
    )
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

    print("\n🧪 Generation Quality Tests:")
    test_cases = [
        ("The capital of France is", ["Paris"]),
        ("2 + 2 =", ["4", "four"]),
        ("The quick brown fox", ["jumps", "jumped", "lazy", "dog"]),
        ("Hello, my name is", None),
        ("Water boils at", ["100", "212", "degrees"]),
        ("The Earth orbits the", ["Sun", "solar"]),
        ("Machine learning is a type of", ["artificial intelligence", "AI"]),
        ("Python is a", ["programming", "language", "snake"]),
        ("The largest planet is", ["Jupiter"]),
        ("DNA stands for", ["deoxyribonucleic", "acid"]),
        ("The derivative of x squared is", ["2x", "two"]),
        ("Shakespeare wrote", ["plays", "Hamlet", "Romeo"]),
        ("The speed of light is", ["299", "300", "fast"]),
        ("Photosynthesis converts", ["light", "energy", "carbon"]),
        ("The Pythagorean theorem states", ["a²", "squared", "hypotenuse"]),
    ]

    device = model.device
    coherent_count = 0
    total_tests = len(test_cases)

    for prompt, expected in test_cases:
        inputs = tokenizer(prompt, return_tensors="pt").to(device)

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=20,
                do_sample=True,
                temperature=0.7,
                top_k=50,
                top_p=0.95,
                pad_token_id=tokenizer.pad_token_id,
            )

        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        generated_only = generated_text[len(prompt):].strip()

        print(f"\n   Prompt: '{prompt}'")
        print(f"   Generated: '{generated_only}'")

        is_coherent = True

        # Crude repetition check: flag output where over half the words repeat.
        words = generated_only.split()
        if len(words) > 3:
            if len(set(words)) < len(words) / 2:
                print("   ⚠️  High repetition detected")
                is_coherent = False

        if expected and len(generated_only) > 0:
            found = any(kw.lower() in generated_only.lower() for kw in expected)
            if found:
                print("   ✅ Contains expected content")
            else:
                print("   ⚠️  Missing expected keywords")
                is_coherent = False

        if is_coherent and len(generated_only.split()) >= 2:
            coherent_count += 1

    coherence_rate = (coherent_count / total_tests) * 100
    print(f"\n📊 Overall coherence rate: {coherence_rate:.1f}%")

    print("\n📉 Perplexity Test:")
    test_texts = [
        "The quick brown fox jumps over the lazy dog.",
        "In the beginning was the Word, and the Word was with God.",
        "To be or not to be, that is the question.",
        "E equals m c squared is Einstein's famous equation.",
    ]
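
    # Perplexity note: with labels equal to the input ids, outputs.loss is the
    # mean next-token cross-entropy, so torch.exp(loss) below is the per-token
    # perplexity of each snippet under the model.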
    perplexities = []
    for test_text in test_texts:
        inputs = tokenizer(test_text, return_tensors="pt").to(device)

        with torch.no_grad():
            outputs = model(**inputs, labels=inputs["input_ids"])
            perplexity = torch.exp(outputs.loss).item()
            perplexities.append(perplexity)

        print(f"   '{test_text[:30]}...': {perplexity:.2f}")

    avg_perplexity = np.mean(perplexities)
    print(f"\n   Average perplexity: {avg_perplexity:.2f}")

    if avg_perplexity > 100:
        print("   ⚠️  Very high perplexity")
    elif avg_perplexity > 50:
        print("   ⚠️  Moderately high perplexity")
    else:
        print("   ✅ Reasonable perplexity")

    print("\n🧬 Duplicate Layer Analysis:")
    print("Checking if duplicated layers maintain reasonable behavior...")

    test_input = "The meaning of life is"
    inputs = tokenizer(test_input, return_tensors="pt").to(device)

    activations = {}
    hooks = []

    def get_activation(name):
        def hook(model, input, output):
            activations[name] = output[0].detach()
        return hook

    # Hook the first duplicate pair (24/25) and a non-duplicate pair (39/40).
    for layer_idx in [24, 25, 39, 40]:
        hook = model.model.layers[layer_idx].register_forward_hook(
            get_activation(f'layer_{layer_idx}')
        )
        hooks.append(hook)

    with torch.no_grad():
        _ = model(**inputs)

    for hook in hooks:
        hook.remove()

    if len(activations) >= 4:
        act_24 = activations['layer_24'].flatten()
        act_25 = activations['layer_25'].flatten()
        similarity_24_25 = torch.cosine_similarity(act_24.unsqueeze(0), act_25.unsqueeze(0)).item()

        act_39 = activations['layer_39'].flatten()
        act_40 = activations['layer_40'].flatten()
        similarity_39_40 = torch.cosine_similarity(act_39.unsqueeze(0), act_40.unsqueeze(0)).item()

        print(f"   Cosine similarity layer 24 vs 25 (duplicate): {similarity_24_25:.4f}")
        print(f"   Cosine similarity layer 39 vs 40 (different): {similarity_39_40:.4f}")
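
        # Adjacent hidden states in a residual network are already highly similar,
        # and the 24/25 pair additionally shares identical weights, so very high
        # similarity is expected here; 0.95 is a loose heuristic threshold.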
        if similarity_24_25 > 0.95:
            print("   ✅ Duplicate layers show expected high similarity")
        else:
            print("   ⚠️  Duplicate layers diverged more than expected")

    print("\n📊 Weight Statistics (checking for anomalies):")
    anomalies = 0

    for name, param in model.named_parameters():
        if torch.isnan(param).any():
            print(f"   ⚠️  {name}: Contains NaN!")
            anomalies += 1
        elif torch.isinf(param).any():
            print(f"   ⚠️  {name}: Contains Inf!")
            anomalies += 1
        elif param.std() < 1e-8:
            print(f"   ⚠️  {name}: Zero variance!")
            anomalies += 1

    if anomalies == 0:
        print("   ✅ No anomalies detected in weights")

    success = coherence_rate >= 60 and avg_perplexity < 100 and anomalies == 0

    print("\n" + "="*60)
    print("DIAGNOSTIC SUMMARY")
    print("="*60)

    if success:
        print("✅ Model passed all diagnostics!")
        print("   - Good coherence rate")
        print("   - Reasonable perplexity")
        print("   - No weight anomalies")
        print("   - Duplicate layers functioning correctly")
    else:
        print("⚠️  Some issues detected:")
        if coherence_rate < 60:
            print(f"   - Low coherence rate: {coherence_rate:.1f}%")
        if avg_perplexity >= 100:
            print(f"   - High average perplexity: {avg_perplexity:.2f}")
        if anomalies > 0:
            print(f"   - Weight anomalies: {anomalies}")

    del model
    torch.cuda.empty_cache()
    return success


def main():
    print("="*60)
    print("Stage 2: Simple Layer Duplication")
    print("64 layers → 80 layers")
    print("="*60)

    print(f"\n📥 Loading model from: {INPUT_DIR}")
    weights = load_model_sharted(INPUT_DIR)

    print(f"\n📊 Loaded {len(weights)} tensors")

    new_weights = OrderedDict()

    # Non-layer weights (embeddings, final norm, lm_head) carry over unchanged.
    print("\n📋 Copying non-layer weights...")
    for name, tensor in weights.items():
        if not name.startswith("model.layers."):
            new_weights[name] = tensor.clone()

    print("\n🔄 Expanding layers with simple duplication...")
    print("   Layers 0-23: Direct copy")
    print("   Layers 24-39: Each layer duplicated once")
    print("   Layers 40-63: Direct copy (shifted to 56-79)")

    new_layer_idx = 0

    with tqdm(total=TARGET_LAYERS, desc="Creating layers") as pbar:
        # Layers 0-23: copy through unchanged.
        for old_idx in range(24):
            layer_weights = extract_layer_weights(weights, old_idx)
            new_weights.update(create_layer_weights(layer_weights, new_layer_idx))
            new_layer_idx += 1
            pbar.update(1)

        # Layers 24-39: write each layer once, then write it again as its duplicate.
        for old_idx in range(24, 40):
            layer_weights = extract_layer_weights(weights, old_idx)
            new_weights.update(create_layer_weights(layer_weights, new_layer_idx))
            new_layer_idx += 1
            pbar.update(1)

            print(f"\n   Duplicating layer {old_idx} → layer {new_layer_idx}")
            new_weights.update(create_layer_weights(layer_weights, new_layer_idx))
            new_layer_idx += 1
            pbar.update(1)

        # Layers 40-63: copy through, shifted to 56-79.
        for old_idx in range(40, 64):
            layer_weights = extract_layer_weights(weights, old_idx)
            new_weights.update(create_layer_weights(layer_weights, new_layer_idx))
            new_layer_idx += 1
            pbar.update(1)

    print(f"\n✅ Created {new_layer_idx} layers")

    if new_layer_idx != TARGET_LAYERS:
        print(f"\n❌ ERROR: Created {new_layer_idx} layers but expected {TARGET_LAYERS}")
        print("Layer creation failed. Exiting.")
        return False

    print("\n📝 Updating model configuration...")
    config_path = os.path.join(INPUT_DIR, "config.json")
    with open(config_path, 'r') as f:
        config = json.load(f)

    config['num_hidden_layers'] = TARGET_LAYERS

    print(f"\n💾 Saving expanded model to: {OUTPUT_DIR}")
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    with open(os.path.join(OUTPUT_DIR, "config.json"), 'w') as f:
        json.dump(config, f, indent=2)

    # Copy the tokenizer and generation config alongside the weights.
    tokenizer_files = [
        'tokenizer.json', 'tokenizer_config.json',
        'special_tokens_map.json', 'generation_config.json'
    ]

    for file in tokenizer_files:
        src = os.path.join(INPUT_DIR, file)
        dst = os.path.join(OUTPUT_DIR, file)
        if os.path.exists(src):
            shutil.copy(src, dst)

    save_model_sharted(new_weights, OUTPUT_DIR)

    # Record how this checkpoint was produced.
    metadata = {
        "stage": "2-duplicate",
        "source_model": INPUT_DIR,
        "method": "Simple layer duplication",
        "layer_mapping": {
            "0-23": "0-23 (unchanged)",
            "24-39": "24-55 (each duplicated once)",
            "40-63": "56-79 (unchanged)"
        },
        "duplication_info": {
            "method": "exact_copy",
            "layers_duplicated": list(range(24, 40))
        },
        "final_layers": TARGET_LAYERS
    }

    with open(os.path.join(OUTPUT_DIR, "stage2_metadata.json"), 'w') as f:
        json.dump(metadata, f, indent=2)

    print("\n✅ Stage 2 duplication complete!")

    print("\n🔍 Quick verification:")
    print(f"   Total weights: {len(new_weights)}")

    # Count layers via a tensor that occurs exactly once per layer.
    layer_count = 0
    for name in new_weights.keys():
        if name.startswith("model.layers.") and ".input_layernorm.weight" in name:
            layer_count += 1

    print(f"   Layer count: {layer_count} (expected: {TARGET_LAYERS})")

    print("\n🧬 Checking layer duplication:")
    test_component = "self_attn.q_proj.weight"

    if f"model.layers.24.{test_component}" in new_weights and f"model.layers.25.{test_component}" in new_weights:
        layer24 = new_weights[f"model.layers.24.{test_component}"]
        layer25 = new_weights[f"model.layers.25.{test_component}"]

        if torch.equal(layer24, layer25):
            print("   ✅ Layer 24 and 25 are identical (as expected)")
        else:
            print("   ⚠️  Layer 24 and 25 differ (unexpected!)")

    print(f"\n🎉 SUCCESS! Model expanded to {TARGET_LAYERS} layers.")
    print(f"📁 Output saved to: {OUTPUT_DIR}")

    arch_ok = verify_architecture(OUTPUT_DIR)
    diag_ok = run_diagnostics(OUTPUT_DIR)

    if arch_ok and diag_ok:
        print("\n🎉 FINAL SUCCESS! Your Qwen3-72B-DupeLayers model is ready and verified!")
        print("\n📋 Final architecture:")
        print("   Hidden size: 8192")
        print("   Intermediate size: 29568")
        print("   Attention heads: 64")
        print("   KV heads: 8")
        print("   Layers: 80")
        print("   Vocabulary: 151936")
        print("\n💡 The model has passed all quality checks and is ready for use!")
    else:
        print("\n⚠️  Some verification issues detected. Please review the diagnostics above.")

    return arch_ok and diag_ok


if __name__ == "__main__":
    success = main()
    exit(0 if success else 1)
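
# Usage note (script filename hypothetical): run `python stage2_duplicate.py` from
# the directory containing ./Qwen3-58B-Embiggened. Both the source and expanded
# state dicts are held in RAM before sharting, so budget memory for roughly two
# full copies of the model's weights.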