from __future__ import annotations

import copy

import pytest
import torch
from safetensors.torch import load_file as safe_load_file
from transformers import AutoModelForCausalLM, AutoModelForSeq2SeqLM, AutoTokenizer

from peft import AutoPeftModel, LoraConfig, PeftModel, TrainableTokensConfig, get_peft_model
from peft.tuners.trainable_tokens.layer import TrainableTokensLayer
from peft.utils import TrainableTokensWrapper, get_peft_model_state_dict

from .testing_utils import hub_online_once
|
|
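# Toy models used below to check how the embedding layer is discovered (via get_input_embeddings or by name).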
class ModelEmb(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.emb = torch.nn.Embedding(100, 10)
        self.lin0 = torch.nn.Linear(10, 1)

    def forward(self, x):
        return self.lin0(self.emb(x))

    def get_input_embeddings(self):
        return self.emb
|
|
class ModelEmbedIn(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.embed_in = torch.nn.Embedding(100, 10)
        self.lin0 = torch.nn.Linear(10, 1)

    def forward(self, x):
        return self.lin0(self.embed_in(x))

    def get_input_embeddings(self):
        return self.embed_in
|
|
class ModelEmbedMultiple(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.embed_in = torch.nn.Embedding(100, 10)
        self.embed_in_2 = torch.nn.Embedding(100, 10)
        self.lin0 = torch.nn.Linear(10, 1)

    def forward(self, x):
        return self.lin0(self.embed_in(x) + self.embed_in_2(x))

    def get_input_embeddings(self):
        return self.embed_in
|
|
class ModelEmbedInNoGet(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.embed_in = torch.nn.Embedding(100, 10)
        self.lin0 = torch.nn.Linear(10, 1)

    def forward(self, x):
        return self.lin0(self.embed_in(x))
|
|
class TestTrainableTokens:
    @pytest.fixture
    def model_id(self):
        return "trl-internal-testing/tiny-random-LlamaForCausalLM"

    @pytest.fixture
    def model_multi_embedding(self):
        class MultiEmbeddingMLP(torch.nn.Module):
            def __init__(self):
                super().__init__()
                self.emb_text = torch.nn.Embedding(10, 5)
                self.emb_image = torch.nn.Embedding(8, 5)
                self.lin0 = torch.nn.Linear(5, 10)
                self.lin1 = torch.nn.Linear(10, 20)

            def forward(self, x_text, x_image):
                x_text = self.emb_text(x_text)
                x_image = self.emb_image(x_image)
                y = self.lin0(torch.concat([x_text, x_image], dim=1).view(-1, 5))
                y = self.lin1(y)
                return y, (x_text, x_image)

        return MultiEmbeddingMLP()
|
    @pytest.fixture
    def model(self, model_id):
        with hub_online_once(model_id):
            return AutoModelForCausalLM.from_pretrained(model_id)

    @pytest.fixture
    def tokenizer(self, model_id):
        return AutoTokenizer.from_pretrained(model_id)
|
    def simulate_training(self, trainable_tokens_layer, adapter_name="default"):
        """Simulates training of the trainable_tokens adapter layer by assigning random
        values to the token deltas.
        """
        trainable_tokens_layer.trainable_tokens_delta[adapter_name].data = torch.rand_like(
            trainable_tokens_layer.trainable_tokens_delta[adapter_name].data
        )
|
    def test_stand_alone_usage(self, model, tokenizer, tmp_path):
        original_model = copy.deepcopy(model)

        peft_config = TrainableTokensConfig(target_modules=["embed_tokens"], token_indices=[0, 1, 3])
        peft_model = get_peft_model(model, peft_config)
        save_path = tmp_path / "stand_alone_usage"
|
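        # an input that covers both the modified token indices (0, 1, 3) and an unmodified one (2)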
        X = {
            "input_ids": torch.tensor([[0, 1, 2, 3]]),
            "attention_mask": torch.tensor([[1, 1, 1, 1]]),
        }

        idcs_to_modify = peft_config.token_indices
        idcs_to_keep = [i for i in X["input_ids"][0].tolist() if i not in idcs_to_modify]

        self.simulate_training(peft_model.model.model.embed_tokens)
        output_train = peft_model(output_hidden_states=True, **X)

        peft_model.save_pretrained(save_path)
        peft_model_org = peft_model
|
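        # load the saved adapter into a fresh model and compare against the trained and the original model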
        peft_model = AutoPeftModel.from_pretrained(save_path)
        output_load = peft_model(output_hidden_states=True, **X)
        output_orig = original_model(output_hidden_states=True, **X)

        # the embedding weights of the loaded model must match those of the model it was saved from
        assert torch.allclose(
            peft_model.model.model.embed_tokens.weight,
            peft_model_org.model.model.embed_tokens.weight,
        )

        W_load = output_load.hidden_states[0]
        W_orig = output_orig.hidden_states[0]
        W_train = output_train.hidden_states[0]

        # the loaded model must behave exactly like the trained model
        assert torch.allclose(W_load, W_train)

        # only the targeted token embeddings may differ from the original model
        assert not torch.allclose(W_load[:, idcs_to_modify], W_orig[:, idcs_to_modify])
        assert torch.allclose(W_load[:, idcs_to_keep], W_orig[:, idcs_to_keep])
|
    @pytest.mark.parametrize(
        "peft_config",
        [
            LoraConfig(
                target_modules="all-linear",
                trainable_token_indices={"embed_tokens": [0, 1, 3]},
            ),
        ],
    )
    def test_combined_with_peft_method_usage(self, model, tokenizer, peft_config, tmp_path):
        original_model = copy.deepcopy(model)
        peft_model = get_peft_model(model, peft_config)
        save_path = tmp_path / "combined_usage"
|
        X = {
            "input_ids": torch.tensor([[0, 1, 2, 3, 4]]),
            "attention_mask": torch.tensor([[1, 1, 1, 1, 1]]),
        }

        idcs_to_modify = peft_config.trainable_token_indices["embed_tokens"]
        idcs_to_keep = [i for i in X["input_ids"][0].tolist() if i not in idcs_to_modify]

        self.simulate_training(peft_model.model.model.embed_tokens.token_adapter)
        output_train = peft_model(output_hidden_states=True, **X)

        peft_model.save_pretrained(save_path)
        peft_model_org = peft_model
|
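        # load the saved adapter into a fresh model and compare against the trained and the original model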
        peft_model = AutoPeftModel.from_pretrained(save_path)
        output_load = peft_model(output_hidden_states=True, **X)
        output_orig = original_model(output_hidden_states=True, **X)

        W_load = output_load.hidden_states[0]
        W_orig = output_orig.hidden_states[0]
        W_train = output_train.hidden_states[0]

        # the loaded model must behave exactly like the trained model
        assert torch.allclose(W_load, W_train)

        # only the targeted token embeddings may differ from the original model
        assert not torch.allclose(W_load[:, idcs_to_modify], W_orig[:, idcs_to_modify])
        assert torch.allclose(W_load[:, idcs_to_keep], W_orig[:, idcs_to_keep])
|
    def test_basic_training(self, model, tokenizer):
        config = TrainableTokensConfig(
            target_modules=["embed_tokens"],
            token_indices=[0, 10],
        )

        model = get_peft_model(model, config)
        optimizer = torch.optim.AdamW(model.parameters(), lr=1)

        initial_delta = model.model.model.embed_tokens.trainable_tokens_delta.default.clone()
        initial_originals = model.model.model.embed_tokens.trainable_tokens_original.default.clone()

        X = {
            "input_ids": torch.tensor([[0, 1, 2, 3, 4]]),
            "attention_mask": torch.tensor([[1, 1, 1, 1, 1]]),
        }

        for step in range(3):
            optimizer.zero_grad()
            y_pred = model(**X)
            loss = y_pred.logits.mean()
            loss.backward()
            optimizer.step()

        # only the token deltas are trained; the stored original values must stay untouched
        assert torch.allclose(
            model.model.model.embed_tokens.trainable_tokens_original.default,
            initial_originals,
        )
        assert not torch.allclose(
            model.model.model.embed_tokens.trainable_tokens_delta.default,
            initial_delta,
        )
|
    @pytest.mark.parametrize(
        "peft_config",
        [
            LoraConfig(
                target_modules="all-linear",
                trainable_token_indices={"embed_tokens": [0, 1, 3]},
            ),
        ],
    )
    def test_disable_adapters_with_merging(self, model, tokenizer, peft_config):
        X = {
            "input_ids": torch.tensor([[0, 1, 2, 3, 4]]),
            "attention_mask": torch.tensor([[1, 1, 1, 1, 1]]),
        }

        model = get_peft_model(model, peft_config)
        model.eval()

        outputs_before = model(**X).logits

        model.train()
        lr = 0.01
        optimizer = torch.optim.Adam(model.parameters(), lr=lr)

        for _ in range(3):
            optimizer.zero_grad()
            y_pred = model(**X)
            loss = y_pred.logits.mean()
            loss.backward()
            optimizer.step()

        model.eval()
        outputs_unmerged = model(**X).logits
        model.merge_adapter()
        outputs_after = model(**X).logits

        with model.disable_adapter():
            outputs_disabled = model(**X).logits

        outputs_enabled_after_disable = model(**X).logits

        atol, rtol = 1e-5, 1e-5

        # training must have changed the outputs
        assert not torch.allclose(outputs_before, outputs_after, atol=atol, rtol=rtol)

        # merged and unmerged results must match
        assert torch.allclose(outputs_after, outputs_unmerged, atol=atol, rtol=rtol)

        # disabling the adapters must restore the base model outputs
        assert torch.allclose(outputs_before, outputs_disabled, atol=atol, rtol=rtol)

        # re-enabling the adapters must restore the trained outputs
        assert torch.allclose(outputs_after, outputs_enabled_after_disable, atol=atol, rtol=rtol)
|
    @pytest.mark.parametrize(
        "peft_config",
        [
            LoraConfig(
                target_modules="all-linear",
                trainable_token_indices={"embed_tokens": [0, 1, 3]},
            ),
        ],
    )
    def test_safe_merge_with_adapter(self, model, tokenizer, peft_config):
        X = {
            "input_ids": torch.tensor([[0, 1, 2, 3]]),
            "attention_mask": torch.tensor([[1, 1, 1, 1]]),
        }

        model = model.eval()
        logits_base = model(**X).logits

        model = get_peft_model(model, peft_config).eval()
        logits_peft = model(**X).logits

        atol, rtol = 1e-6, 1e-6

        model_unloaded = model.merge_and_unload(safe_merge=True)
        logits_unloaded = model_unloaded(**X).logits

        # merging with safe_merge=True must not change the model outputs
        assert torch.allclose(logits_peft, logits_unloaded, atol=atol, rtol=rtol)
|
    @pytest.mark.parametrize(
        "peft_config",
        [
            LoraConfig(
                target_modules="all-linear",
                trainable_token_indices={"embed_tokens": [0, 1, 3]},
            ),
        ],
    )
    def test_load_multiple_adapters(self, model, peft_config, tmp_path):
        original_model = copy.deepcopy(model)
        model = get_peft_model(model, peft_config)

        model.save_pretrained(tmp_path)
        del model

        model = original_model
        model = PeftModel.from_pretrained(model, tmp_path)
        load_result1 = model.load_adapter(tmp_path, adapter_name="other")
        load_result2 = model.load_adapter(tmp_path, adapter_name="yet-another")

        assert load_result1.missing_keys == []
        assert load_result2.missing_keys == []
|
    @pytest.mark.parametrize(
        "peft_config_factory",
        [
            lambda token_indices: LoraConfig(
                target_modules="all-linear",
                trainable_token_indices={"embed_tokens": token_indices},
            ),
        ],
    )
    def test_multiple_adapters_different_token_indices(self, model, peft_config_factory, tmp_path):
        original_model = copy.deepcopy(model)

        token_indices_1 = [0, 1, 2]
        token_indices_2 = [2, 3, 4]

        peft_config_1 = peft_config_factory(token_indices_1)
        peft_config_2 = peft_config_factory(token_indices_2)

        model = get_peft_model(model, peft_config_1, adapter_name="adapter_1")
        model.add_adapter("adapter_2", peft_config_2)
|
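        # simulate training for each adapter separately so that both deltas differ from the base weights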
        model.set_adapter("adapter_1")
        self.simulate_training(model.model.model.embed_tokens.token_adapter, "adapter_1")

        model.set_adapter("adapter_2")
        self.simulate_training(model.model.model.embed_tokens.token_adapter, "adapter_2")
|
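        # the input covers the union of both adapters' token indices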
        X = {
            "input_ids": torch.tensor([list(set(token_indices_1 + token_indices_2))]),
            "attention_mask": torch.tensor([[1] * (len(set(token_indices_1 + token_indices_2)))]),
        }

        original_output = original_model(output_hidden_states=True, **X).hidden_states[0]

        # with adapter_1 active, only its token indices may change relative to the base model
        model.set_adapter("adapter_1")
        adapter_1_output = model(output_hidden_states=True, **X).hidden_states[0]

        idcs_to_modify = token_indices_1
        idcs_to_keep = [i for i in X["input_ids"][0].tolist() if i not in idcs_to_modify]

        assert not torch.allclose(adapter_1_output[:, idcs_to_modify], original_output[:, idcs_to_modify])
        assert torch.allclose(adapter_1_output[:, idcs_to_keep], original_output[:, idcs_to_keep])

        # the same holds for adapter_2 and its token indices
        model.set_adapter("adapter_2")
        adapter_2_output = model(output_hidden_states=True, **X).hidden_states[0]

        idcs_to_modify = token_indices_2
        idcs_to_keep = [i for i in X["input_ids"][0].tolist() if i not in idcs_to_modify]

        assert not torch.allclose(adapter_2_output[:, idcs_to_modify], original_output[:, idcs_to_modify])
        assert torch.allclose(adapter_2_output[:, idcs_to_keep], original_output[:, idcs_to_keep])
|
    @pytest.mark.parametrize(
        "peft_config_factory",
        [
            lambda token_indices: LoraConfig(
                target_modules="all-linear",
                trainable_token_indices={"embed_tokens": token_indices},
            ),
        ],
    )
    def test_multiple_adapters_overlapping_token_indices_merging(self, model, peft_config_factory, tmp_path):
        original_model = copy.deepcopy(model)

        token_indices_1 = [0, 1, 2]
        token_indices_2 = [2, 3, 4]

        peft_config_1 = peft_config_factory(token_indices_1)
        peft_config_2 = peft_config_factory(token_indices_2)

        model = get_peft_model(model, peft_config_1, adapter_name="adapter_1")
        model.add_adapter("adapter_2", peft_config_2)

        # merging adapters with overlapping token indices is ambiguous and must be rejected
        with pytest.raises(ValueError) as e:
            model.merge_and_unload(adapter_names=["adapter_1", "adapter_2"])
        assert "are already defined and would result in undefined merging behavior" in str(e)
|
    @pytest.mark.parametrize(
        "peft_config_factory",
        [
            lambda targets, token_indices: LoraConfig(
                target_modules=targets,
                trainable_token_indices={"embed_tokens": token_indices},
            ),
        ],
    )
    def test_multiple_adapters_mixed_forward(self, model, peft_config_factory, tmp_path):
        original_model = copy.deepcopy(model)

        token_indices_1 = [0, 1, 2]
        token_indices_2 = [2, 3, 4]

        peft_config_1 = peft_config_factory(".*q_proj", token_indices_1)
        peft_config_2 = peft_config_factory(".*o_proj", token_indices_2)

        model = get_peft_model(model, peft_config_1, adapter_name="adapter_1")
        model.add_adapter("adapter_2", peft_config_2)

        model.set_adapter("adapter_1")
        self.simulate_training(model.model.model.embed_tokens.token_adapter, "adapter_1")

        model.set_adapter("adapter_2")
        self.simulate_training(model.model.model.embed_tokens.token_adapter, "adapter_2")

        model.eval()

        # a two-sample batch where each sample is routed through a different adapter via adapter_names
        input_sequence = list(set(token_indices_1 + token_indices_2))
        X = {
            "input_ids": torch.tensor([input_sequence, input_sequence]),
            "attention_mask": torch.tensor([[1] * len(input_sequence), [1] * len(input_sequence)]),
        }
        batch_adapter_names = ["adapter_1", "adapter_2"]

        original_output = original_model(output_hidden_states=True, **X)
        mixed_output = model(output_hidden_states=True, adapter_names=batch_adapter_names, **X)
|
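        # the mixed-batch forward must not change the currently active adapter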
        assert model.model.model.embed_tokens.token_adapter.active_adapter == ["adapter_2"]

        adapter_1_output = mixed_output.hidden_states[0][0:1]
        original_output_1 = original_output.hidden_states[0][0:1]
        adapter_2_output = mixed_output.hidden_states[0][1:2]
        original_output_2 = original_output.hidden_states[0][1:2]

        # sample 1 was routed through adapter_1, so only its token indices may differ from the base model
        idcs_to_modify = token_indices_1
        idcs_to_keep = [i for i in X["input_ids"][0].tolist() if i not in idcs_to_modify]

        assert not torch.allclose(adapter_1_output[:, idcs_to_modify], original_output_1[:, idcs_to_modify])
        assert torch.allclose(adapter_1_output[:, idcs_to_keep], original_output_1[:, idcs_to_keep])

        # sample 2 was routed through adapter_2
        idcs_to_modify = token_indices_2
        idcs_to_keep = [i for i in X["input_ids"][0].tolist() if i not in idcs_to_modify]

        assert not torch.allclose(adapter_2_output[:, idcs_to_modify], original_output_2[:, idcs_to_modify])
        assert torch.allclose(adapter_2_output[:, idcs_to_keep], original_output_2[:, idcs_to_keep])
|
    def test_stand_alone_raises_target_layer_not_found(self, model):
        config = TrainableTokensConfig(target_modules=["doesnt_exist"], token_indices=[0, 1, 3])
        with pytest.raises(ValueError) as e:
            model = get_peft_model(model, config)
        assert "Target modules ['doesnt_exist'] not found in the base model." in str(e)
|
    @pytest.mark.parametrize(
        "peft_config, target_layer_name",
        [
            (LoraConfig(trainable_token_indices={"does-not-exist": [0, 1, 2]}), "does-not-exist"),
        ],
    )
    def test_combined_with_peft_raises_target_layer_not_found(self, model, peft_config, target_layer_name):
        with pytest.raises(ValueError) as e:
            model = get_peft_model(model, peft_config)
        assert f"Target modules {{{repr(target_layer_name)}}} not found in the base model." in str(e)
|
    def test_multiple_targets(self, model_multi_embedding):
        original_model = copy.deepcopy(model_multi_embedding)
        config = TrainableTokensConfig(target_modules=["emb_text", "emb_image"], token_indices=[0, 1])
        peft_model = get_peft_model(model_multi_embedding, config)

        self.simulate_training(peft_model.model.emb_text)
        self.simulate_training(peft_model.model.emb_image)

        X = {
            "x_text": torch.tensor([[0, 1, 2]]),
            "x_image": torch.tensor([[0, 1, 2]]),
        }

        _, (emb_text_orig, emb_image_orig) = original_model(**X)
        _, (emb_text_peft, emb_image_peft) = peft_model(**X)

        # both targeted embeddings must be modified at the trained indices and unchanged elsewhere
        assert not torch.allclose(emb_text_orig[:, [0, 1]], emb_text_peft[:, [0, 1]])
        assert torch.allclose(emb_text_orig[:, [2]], emb_text_peft[:, [2]])
        assert not torch.allclose(emb_image_orig[:, [0, 1]], emb_image_peft[:, [0, 1]])
        assert torch.allclose(emb_image_orig[:, [2]], emb_image_peft[:, [2]])
|
    @pytest.mark.parametrize(
        "peft_config",
        [
            LoraConfig(
                target_modules="all-linear",
                trainable_token_indices={"embed_tokens": [0, 1, 3]},
            ),
        ],
    )
    def test_no_embeddings_in_save_with_combined_usage(self, model, tokenizer, peft_config, tmp_path):
        peft_model = get_peft_model(model, peft_config)
        state_dict = get_peft_model_state_dict(
            model=peft_model,
            state_dict=None,
            adapter_name="default",
        )

        # only the token deltas are saved, not the full embedding matrix
        embedding_keys = [n for n in state_dict.keys() if "embed_tokens" in n]
        assert embedding_keys == ["base_model.model.model.embed_tokens.token_adapter.trainable_tokens_delta"]
|
    @pytest.fixture()
    def model_weight_untied(self, model):
        return model

    @pytest.fixture()
    def model_id_weight_tied(self):
        return "facebook/opt-125m"

    @pytest.fixture()
    def model_weight_tied(self, model_id_weight_tied):
        return AutoModelForCausalLM.from_pretrained(model_id_weight_tied)
|
    @pytest.mark.parametrize(
        "peft_config",
        [
            LoraConfig(
                target_modules="all-linear",
                trainable_token_indices={"embed_tokens": [0, 1, 3]},
            ),
        ],
    )
    def test_weight_tying_noop_when_model_is_untied(self, model_weight_untied, peft_config, tmp_path):
        assert model_weight_untied._tied_weights_keys
        assert not model_weight_untied.config.tie_word_embeddings

        # without weight tying, only the input embedding gets a token adapter; the lm_head stays untouched
        peft_model = get_peft_model(model_weight_untied, peft_config)
        assert hasattr(peft_model.model.model.embed_tokens, "token_adapter")
        assert not hasattr(peft_model.model.lm_head, "token_adapter")
|
    @pytest.mark.parametrize(
        "peft_config",
        [
            LoraConfig(
                target_modules="all-linear",
                trainable_token_indices={"embed_tokens": [0, 1, 3]},
            ),
        ],
    )
    def test_weight_tying_applied_when_model_is_tied(self, model_weight_tied, peft_config, tmp_path):
        assert model_weight_tied._tied_weights_keys
        assert model_weight_tied.config.tie_word_embeddings

        peft_model = get_peft_model(model_weight_tied, peft_config)

        self.simulate_training(peft_model.model.model.decoder.embed_tokens.token_adapter)
|
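        # if lm_head is still tied to the adapted embedding, then lm_head(1 / emb(x)) puts the embedding
        # dimension on its diagonal: sum_d W[t, d] * (1 / W[t, d]) = emb_dim for every probed token t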
        token_indices = [0, 1, 2, 3]
        emb_dim = 768
        emb_in = peft_model.model.model.decoder.embed_tokens(torch.tensor([token_indices]))
        emb_out = peft_model.model.lm_head(1 / emb_in)

        assert torch.allclose(torch.diag(emb_out[0]), torch.tensor([emb_dim] * len(token_indices)).float())

        # the tied lm_head weights must not end up in the adapter state dict
        state_dict = get_peft_model_state_dict(peft_model)
        assert not [key for key in state_dict if any(tied_key in key for tied_key in peft_model._tied_weights_keys)]

        # merging must keep the weight tying intact
        merged_model = peft_model.merge_and_unload()

        assert merged_model.model.decoder.embed_tokens.weight.data_ptr() == merged_model.lm_head.weight.data_ptr()
|
    def test_weight_tying_applied_when_model_is_tied_standalone(self, model_weight_tied):
        assert model_weight_tied._tied_weights_keys
        assert model_weight_tied.config.tie_word_embeddings

        peft_config = TrainableTokensConfig(
            target_modules=["embed_tokens"],
            token_indices=[0, 1, 3],
        )

        peft_model = get_peft_model(model_weight_tied, peft_config)

        self.simulate_training(peft_model.model.model.decoder.embed_tokens)
|
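        # if lm_head is still tied to the adapted embedding, then lm_head(1 / emb(x)) puts the embedding
        # dimension on its diagonal: sum_d W[t, d] * (1 / W[t, d]) = emb_dim for every probed token t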
        token_indices = [0, 1, 2, 3]
        emb_dim = 768
        emb_in = peft_model.model.model.decoder.embed_tokens(torch.tensor([token_indices]))
        emb_out = peft_model.model.lm_head(1 / emb_in)

        assert torch.allclose(torch.diag(emb_out[0]), torch.tensor([emb_dim] * len(token_indices)).float())

        # the tied lm_head weights must not end up in the adapter state dict
        state_dict = get_peft_model_state_dict(peft_model)
        assert not [key for key in state_dict if any(tied_key in key for tied_key in peft_model._tied_weights_keys)]

        # merging must keep the weight tying intact
        merged_model = peft_model.merge_and_unload()

        assert merged_model.model.decoder.embed_tokens.weight.data_ptr() == merged_model.lm_head.weight.data_ptr()
|
    def test_weight_tying_normally_issues_warning(self, model_weight_tied, recwarn):
        peft_config = LoraConfig(target_modules=["embed_tokens"])
        peft_model = get_peft_model(model_weight_tied, peft_config)

        warnings = [w.message.args[0] for w in recwarn]
        warnings = [msg for msg in warnings if "Model with `tie_word_embeddings=True` and the" in msg]
        assert warnings
|
    def test_weight_tying_state_dict_ignores_tied_weights(self, model_weight_tied):
        assert model_weight_tied._tied_weights_keys
        assert model_weight_tied.config.tie_word_embeddings

        peft_config = TrainableTokensConfig(
            target_modules=["embed_tokens"],
            token_indices=[0, 1, 3],
        )

        peft_model = get_peft_model(model_weight_tied, peft_config)

        state_dict = peft_model.state_dict()
        peft_state_dict = get_peft_model_state_dict(peft_model)

        # neither the full nor the adapter-only state dict should contain tied adapter keys
        state_dict_keys = [n for n, _ in state_dict.items() if "tied_adapter." in n]
        peft_state_dict_keys = [n for n, _ in peft_state_dict.items() if "tied_adapter." in n]

        assert not state_dict_keys
        assert not peft_state_dict_keys
|
    @pytest.mark.parametrize(
        "peft_config",
        [
            LoraConfig(
                target_modules="all-linear",
                trainable_token_indices={"shared": [0, 1, 3]},
            ),
        ],
    )
    def test_weight_tying_applied_when_model_is_tied_encoder_decoder(self, peft_config):
        model_id = "hf-internal-testing/tiny-random-t5"
        base_model = AutoModelForSeq2SeqLM.from_pretrained(model_id)

        peft_model = get_peft_model(base_model, peft_config)

        self.simulate_training(peft_model.model.shared.token_adapter)
|
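        # if lm_head is still tied to the adapted shared embedding, lm_head(1 / emb(x)) puts the
        # embedding dimension on its diagonal: sum_d W[t, d] * (1 / W[t, d]) = emb_dim for every probed token t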
        token_indices = [0, 1, 2, 3]
        emb_dim = base_model.config.d_model
        emb_in = peft_model.model.encoder.embed_tokens(torch.tensor([token_indices]))
        emb_out = peft_model.model.lm_head(1 / emb_in)

        assert torch.allclose(torch.diag(emb_out[0]), torch.tensor([emb_dim] * len(token_indices)).float())

        # encoder and decoder embeddings must stay tied as well
        emb_out = peft_model.model.decoder.embed_tokens(torch.tensor([token_indices]))

        assert torch.allclose(emb_in, emb_out)

        # the tied weights must not end up in the adapter state dict
        state_dict = get_peft_model_state_dict(peft_model)
        assert not [key for key in state_dict if any(tied_key in key for tied_key in peft_model._tied_weights_keys)]

        # merging must keep the weight tying intact
        merged_model = peft_model.merge_and_unload()

        assert merged_model.encoder.embed_tokens.weight.data_ptr() == merged_model.lm_head.weight.data_ptr()
        assert (
            merged_model.encoder.embed_tokens.weight.data_ptr() == merged_model.decoder.embed_tokens.weight.data_ptr()
        )
|
    @pytest.mark.parametrize(
        "peft_config",
        [
            LoraConfig(
                target_modules="all-linear",
                trainable_token_indices={"embed_tokens": [0, 1, 3]},
                modules_to_save=["embed_tokens"],
            ),
        ],
    )
    def test_modules_to_save_excludes_trainable_tokens(self, model, peft_config):
        # targeting the same embedding with both modules_to_save and trainable_token_indices is invalid
        with pytest.raises(ValueError) as e:
            get_peft_model(model, peft_config)
        assert "The embedding layer is already marked to be trained fully" in str(e)
|
    def test_merge_and_unload_standalone(self, model):
        token_indices = [0, 1, 3]

        peft_config = TrainableTokensConfig(
            target_modules=["embed_tokens"],
            token_indices=token_indices,
        )

        peft_model = get_peft_model(model, peft_config)

        self.simulate_training(peft_model.model.model.embed_tokens)
        expected_changed_weights = peft_model.model.model.embed_tokens.trainable_tokens_delta.default.data.clone()

        # after unloading, no TrainableTokensLayer may be left in the model
        merged_model = peft_model.merge_and_unload()
        for _, module in merged_model.named_modules():
            assert not isinstance(module, TrainableTokensLayer)

        # the trained deltas must have been merged into the embedding weights
        assert torch.allclose(merged_model.model.embed_tokens.weight.data[token_indices], expected_changed_weights)
|
    def test_original_module_not_in_state_dict(self, model):
        peft_config = LoraConfig(
            target_modules="all-linear",
            trainable_token_indices={"embed_tokens": [0, 1, 3]},
        )

        peft_model = get_peft_model(model, peft_config)

        # the wrapper keeps a reference to the original module ...
        assert peft_model.model.model.embed_tokens.original_module

        # ... but its weight must not show up in either state dict
        state_dict = get_peft_model_state_dict(peft_model)
        assert not [k for k in state_dict if ".original_module.weight" in k]

        state_dict = peft_model.state_dict()
        assert not [k for k in state_dict if ".original_module.weight" in k]
|
    @pytest.fixture
    def model_emb(self):
        return ModelEmb()

    @pytest.fixture
    def model_embed_in(self):
        return ModelEmbedIn()

    @pytest.fixture
    def model_embed_in_no_get(self):
        return ModelEmbedInNoGet()

    @pytest.fixture
    def model_embed_multiple(self):
        return ModelEmbedMultiple()
|
    @pytest.mark.parametrize(
        "model_fixture_name, getter",
        [
            ("model_emb", lambda model: model.emb),
            ("model_embed_in", lambda model: model.embed_in),
            ("model", lambda model: model.model.model.embed_tokens),
        ],
    )
    def test_default_embedding_name_is_inferred_standalone(self, model_fixture_name, getter, request):
        base_model = request.getfixturevalue(model_fixture_name)

        # with target_modules=None, the embedding layer is inferred via get_input_embeddings
        peft_config = TrainableTokensConfig(target_modules=None, token_indices=[0, 1, 3])
        peft_model = get_peft_model(base_model, peft_config)

        assert isinstance(getter(peft_model), TrainableTokensLayer)
|
    @pytest.mark.parametrize(
        "model_fixture_name, getter",
        [
            ("model_emb", lambda model: model.emb),
            ("model_embed_in", lambda model: model.embed_in),
            ("model", lambda model: model.model.model.embed_tokens),
        ],
    )
    def test_default_embedding_name_is_inferred_combined(self, model_fixture_name, getter, request):
        base_model = request.getfixturevalue(model_fixture_name)

        peft_config = LoraConfig(target_modules="all-linear", trainable_token_indices=[0, 1, 3])
        peft_model = get_peft_model(base_model, peft_config)

        assert isinstance(getter(peft_model), TrainableTokensWrapper)
|
    def test_default_embedding_name_cannot_be_inferred(self, model_embed_in_no_get):
        base_model = model_embed_in_no_get

        # the model does not implement get_input_embeddings, so falling back to the default name fails
        peft_config = TrainableTokensConfig(target_modules=None, token_indices=[0, 1, 3])

        with pytest.raises(ValueError) as e:
            peft_model = get_peft_model(base_model, peft_config)

        assert "Target modules embed_tokens not found in the base model." in str(e)
|
    def test_embedding_name_is_used_when_given_standalone(self, model_embed_multiple):
        peft_config = TrainableTokensConfig(target_modules="embed_in_2", token_indices=[0, 1, 3])
        peft_model = get_peft_model(model_embed_multiple, peft_config)

        assert isinstance(peft_model.model.embed_in_2, TrainableTokensLayer)
        assert not isinstance(peft_model.model.embed_in, TrainableTokensLayer)

    def test_embedding_name_is_used_when_given_combined(self, model_embed_multiple):
        peft_config = LoraConfig(target_modules="all-linear", trainable_token_indices={"embed_in_2": [0, 1, 3]})
        peft_model = get_peft_model(model_embed_multiple, peft_config)

        assert isinstance(peft_model.model.embed_in_2, TrainableTokensWrapper)
        assert not isinstance(peft_model.model.embed_in, TrainableTokensWrapper)
|
    @pytest.mark.parametrize("resize_embedding", [True, False])
    @pytest.mark.parametrize(
        "peft_config",
        [
            LoraConfig(target_modules="all-linear", trainable_token_indices=[1, 2, 3]),
            TrainableTokensConfig(target_modules=None, token_indices=[1, 2, 3]),
        ],
    )
    def test_save_pretrained_auto(self, model, resize_embedding, peft_config, tmp_path):
        if resize_embedding:
            model.resize_token_embeddings(model.config.vocab_size + 2)
        peft_model = get_peft_model(model, peft_config)

        peft_model.save_pretrained(tmp_path, save_embedding_layers="auto")
        state_dict = safe_load_file(tmp_path / "adapter_model.safetensors")

        # the key of the base embedding differs between standalone and combined usage
        if isinstance(peft_config, TrainableTokensConfig):
            contains_embedding = "base_model.model.model.embed_tokens.base_layer.weight" in state_dict
        else:
            contains_embedding = "base_model.model.model.embed_tokens.token_adapter.base_layer.weight" in state_dict
|
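        # with save_embedding_layers="auto", the embedding weights are only saved when the embedding was resized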
        if resize_embedding:
            assert contains_embedding
        else:
            assert not contains_embedding