{ "cells": [ { "cell_type": "markdown", "id": "9371cf89", "metadata": {}, "source": [ "# Loading Script\n", "\n", "Run this first to load local ChemQ3MTP libraries" ] }, { "cell_type": "code", "execution_count": 2, "id": "f52f283e", "metadata": {}, "outputs": [], "source": [ "import torch\n", "import sys\n", "import os\n", "from pathlib import Path\n", "import importlib.util\n", "from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer\n", "\n", "def load_custom_modules(library_path):\n", " \"\"\"Load all the custom modules required by the model from library directory\"\"\"\n", " \n", " library_path = Path(library_path)\n", " \n", " # Add the library directory to Python path\n", " if str(library_path) not in sys.path:\n", " sys.path.insert(0, str(library_path))\n", " \n", " print(f\"๐Ÿ”ง Loading custom modules from {library_path}...\")\n", " \n", " # Required module files\n", " required_files = {\n", " 'configuration_chemq3mtp.py': 'configuration_chemq3mtp',\n", " 'modeling_chemq3mtp.py': 'modeling_chemq3mtp', \n", " 'FastChemTokenizerHF.py': 'FastChemTokenizerHF'\n", " }\n", " \n", " loaded_modules = {}\n", " \n", " # Load each required module\n", " for filename, module_name in required_files.items():\n", " file_path = library_path / filename\n", " \n", " if not file_path.exists():\n", " print(f\"โŒ Required file not found: {filename}\")\n", " return None\n", " \n", " try:\n", " spec = importlib.util.spec_from_file_location(module_name, file_path)\n", " module = importlib.util.module_from_spec(spec)\n", " \n", " # Execute the module\n", " spec.loader.exec_module(module)\n", " loaded_modules[module_name] = module\n", " \n", " print(f\" โœ… Loaded {filename}\")\n", " \n", " except Exception as e:\n", " print(f\" โŒ Failed to load {filename}: {e}\")\n", " return None\n", " \n", " return loaded_modules\n", "\n", "def register_model_components(loaded_modules):\n", " \"\"\"Register the model components with transformers\"\"\"\n", " \n", " print(\"๐Ÿ”— Registering model components...\")\n", " \n", " try:\n", " # Get the classes from loaded modules\n", " ChemQ3MTPConfig = loaded_modules['configuration_chemq3mtp'].ChemQ3MTPConfig\n", " ChemQ3MTPForCausalLM = loaded_modules['modeling_chemq3mtp'].ChemQ3MTPForCausalLM\n", " FastChemTokenizerSelfies = loaded_modules['FastChemTokenizerHF'].FastChemTokenizerSelfies\n", " \n", " # Register with transformers\n", " AutoConfig.register(\"chemq3_mtp\", ChemQ3MTPConfig)\n", " AutoModelForCausalLM.register(ChemQ3MTPConfig, ChemQ3MTPForCausalLM)\n", " AutoTokenizer.register(ChemQ3MTPConfig, FastChemTokenizerSelfies)\n", " \n", " print(\"โœ… Model components registered successfully\")\n", " \n", " return ChemQ3MTPConfig, ChemQ3MTPForCausalLM, FastChemTokenizerSelfies\n", " \n", " except Exception as e:\n", " print(f\"โŒ Registration failed: {e}\")\n", " return None, None, None\n", "\n", "def load_model(model_path):\n", " \"\"\"Load the model using the registered components\"\"\"\n", " \n", " print(\"๐Ÿš€ Loading model...\")\n", " \n", " try:\n", " # Load config\n", " config = AutoConfig.from_pretrained(str(model_path), trust_remote_code=False)\n", " print(f\"โœ… Config loaded: {config.__class__.__name__}\")\n", " \n", " # Load model\n", " model = AutoModelForCausalLM.from_pretrained(\n", " str(model_path),\n", " config=config,\n", " torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,\n", " trust_remote_code=False # We've already registered everything\n", " )\n", " print(f\"โœ… Model loaded: {model.__class__.__name__}\")\n", " \n", " # Load tokenizer\n", " tokenizer = AutoTokenizer.from_pretrained(str(model_path), trust_remote_code=False)\n", " print(f\"โœ… Tokenizer loaded: {tokenizer.__class__.__name__}\")\n", " \n", " return model, tokenizer, config\n", " \n", " except Exception as e:\n", " print(f\"โŒ Model loading failed: {e}\")\n", " import traceback\n", " traceback.print_exc()\n", " return None, None, None\n", "\n", "def test_model(model, tokenizer, config):\n", " \"\"\"Test the loaded model\"\"\"\n", " \n", " print(\"\\n๐Ÿงช Testing model...\")\n", " \n", " # Setup device\n", " device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", " print(f\"๐Ÿ–ฅ๏ธ Using device: {device}\")\n", " \n", " model = model.to(device)\n", " model.eval()\n", " \n", " # Model info\n", " print(f\"\\n๐Ÿ“Š Model Information:\")\n", " print(f\" Model class: {model.__class__.__name__}\")\n", " print(f\" Config class: {config.__class__.__name__}\")\n", " print(f\" Tokenizer class: {tokenizer.__class__.__name__}\")\n", " print(f\" Model type: {config.model_type}\")\n", " print(f\" Vocab size: {config.vocab_size}\")\n", " \n", " # Set pad token if needed\n", " if not hasattr(tokenizer, 'pad_token') or tokenizer.pad_token is None:\n", " if hasattr(tokenizer, 'eos_token'):\n", " tokenizer.pad_token = tokenizer.eos_token\n", " print(\"โœ… Set pad_token to eos_token\")\n", " \n", " # Test tokenization\n", " print(\"\\n๐Ÿ”ค Testing tokenization...\")\n", " test_inputs = [\"[C][C][O]\", \"[C]\", \"[O]\"]\n", " \n", " for test_input in test_inputs:\n", " try:\n", " tokens = tokenizer(test_input, return_tensors=\"pt\")\n", " print(f\" '{test_input}' -> {tokens.input_ids.tolist()}\")\n", " except Exception as e:\n", " print(f\" โŒ Tokenization failed for '{test_input}': {e}\")\n", " continue\n", " \n", " # Test generation\n", " print(\"\\n๐ŸŽฏ Testing generation...\")\n", " test_prompts = [\"[C]\", \"[C][C]\"]\n", " \n", " for prompt in test_prompts:\n", " try:\n", " input_ids = tokenizer(prompt, return_tensors=\"pt\").input_ids.to(device)\n", " \n", " with torch.no_grad():\n", " outputs = model.generate(\n", " input_ids,\n", " max_length=input_ids.shape[1] + 20,\n", " temperature=0.8,\n", " top_p=0.9,\n", " top_k=50,\n", " do_sample=True,\n", " pad_token_id=tokenizer.pad_token_id if hasattr(tokenizer, 'pad_token_id') else 0,\n", " num_return_sequences=3\n", " )\n", " \n", " print(f\"\\n Prompt: '{prompt}'\")\n", " for i, output in enumerate(outputs):\n", " generated = tokenizer.decode(output, skip_special_tokens=True)\n", " print(f\" {i+1}: {generated}\")\n", " \n", " except Exception as e:\n", " print(f\" โŒ Generation failed for '{prompt}': {e}\")\n", " \n", " # Test MTP functionality if available\n", " print(\"\\n๐Ÿ”ฌ Testing MTP functionality...\")\n", " try:\n", " if hasattr(model, 'set_mtp_training'):\n", " print(\" โœ… MTP training methods available\")\n", " if hasattr(model, 'generate_with_logprobs'):\n", " print(\" โœ… MTP generation methods available\")\n", " else:\n", " print(\" โ„น๏ธ Standard model - no MTP methods detected\")\n", " except Exception as e:\n", " print(f\" โš ๏ธ MTP test error: {e}\")\n" ] }, { "cell_type": "markdown", "id": "b16c5461", "metadata": {}, "source": [ "# Testing MTP Head Generation with RL-checkpoints (Local)\n", "\n", "- Download checkpoints at https://huggingface.co/gbyuvd/ChemMiniQ3-SAbRLo-RL-checkpoints\n", "- Make sure to change this in loading script: \n", "```\n", " # Load model from checkpoint directory\n", " checkpoint_dir = \"./ppo_checkpoints_45/model_step_4500\"\n", "```" ] }, { "cell_type": "code", "execution_count": 3, "id": "cefc1a68", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "๐Ÿš€ ChemQ3-MTP Model Loader Starting...\n", "\n", "๐Ÿ“ Loading library from: ./ChemQ3MTP\n", "๐Ÿ”ง Loading custom modules from ChemQ3MTP...\n", " โœ… Loaded configuration_chemq3mtp.py\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "`torch_dtype` is deprecated! Use `dtype` instead!\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ " โœ… Loaded modeling_chemq3mtp.py\n", " โœ… Loaded FastChemTokenizerHF.py\n", "\n", "๐Ÿ”— Registering model components...\n", "โœ… Model components registered successfully\n", "\n", "๐Ÿ“ Loading model weights from checkpoint: ./checkpoints-1/model_step_4500\n", "๐Ÿ“ Checkpoint files:\n", " config.json (1161 bytes)\n", " generation_config.json (174 bytes)\n", " model.safetensors (39437252 bytes)\n", " tokenizer_config.json (302 bytes)\n", " training_state.pt (78926669 bytes)\n", " vocab.json (21574 bytes)\n", "\n", "๐Ÿš€ Loading model...\n", "โœ… Config loaded: ChemQ3MTPConfig\n", "โœ… Model loaded: ChemQ3MTPForCausalLM\n", "โœ… Tokenizer loaded: FastChemTokenizerSelfies\n", "\n", "๐Ÿงช Testing model...\n", "๐Ÿ–ฅ๏ธ Using device: cuda\n", "\n", "๐Ÿ“Š Model Information:\n", " Model class: ChemQ3MTPForCausalLM\n", " Config class: ChemQ3MTPConfig\n", " Tokenizer class: FastChemTokenizerSelfies\n", " Model type: chemq3_mtp\n", " Vocab size: 782\n", "\n", "๐Ÿ”ค Testing tokenization...\n", " '[C][C][O]' -> [[0, 379, 379, 377, 1]]\n", " '[C]' -> [[0, 379, 1]]\n", " '[O]' -> [[0, 377, 1]]\n", "\n", "๐ŸŽฏ Testing generation...\n", "\n", " Prompt: '[C]'\n", " 1: [C]\n", " 2: [C] [=C] [C] [=C] [C] [=C] [Branch1] [#C] [C] [=C] [C] [Branch1] [C] [O] [=N] [C] [Branch1] [Ring1] [C] [=O] [=C] [Ring1] [=Branch2] [C] [=C] [Ring1] [=C]\n", " 3: [C]\n", "\n", " Prompt: '[C][C]'\n", " 1: [C] [C]\n", " 2: [C] [C]\n", " 3: [C] [C] .[C] [C] [C] [N] [Branch1] [C] [C] [C] [C] [N] [C] [C] [=C] [C] [=C] [C] [=C] [Ring1] [=Branch1] [N] [C] [Ring1] [#Branch2] [=O]\n", "\n", "๐Ÿ”ฌ Testing MTP functionality...\n", " โœ… MTP training methods available\n", " โœ… MTP generation methods available\n", "\n", "๐ŸŽ‰ Model loading and testing completed successfully!\n" ] } ], "source": [ "def main():\n", " print(\"๐Ÿš€ ChemQ3-MTP Model Loader Starting...\\n\")\n", " \n", " # Library directory (contains the .py files)\n", " library_dir = \"./ChemQ3MTP\"\n", " \n", " # Check if library directory exists\n", " if not Path(library_dir).exists():\n", " print(f\"โŒ Library directory does not exist: {library_dir}\")\n", " return None, None, None\n", " \n", " print(f\"๐Ÿ“ Loading library from: {library_dir}\")\n", " \n", " # Load custom modules from library directory\n", " loaded_modules = load_custom_modules(Path(library_dir))\n", " if loaded_modules is None:\n", " return None, None, None\n", " \n", " print()\n", " \n", " # Register components\n", " config_class, model_class, tokenizer_class = register_model_components(loaded_modules)\n", " if config_class is None:\n", " return None, None, None\n", " \n", " print()\n", " \n", " # Load model from checkpoint directory\n", " checkpoint_dir = \"./checkpoints-1/model_step_4500\" # <======\n", " \n", " # Check if checkpoint directory exists\n", " if not Path(checkpoint_dir).exists():\n", " print(f\"โŒ Checkpoint directory does not exist: {checkpoint_dir}\")\n", " return None, None, None\n", " \n", " print(f\"๐Ÿ“ Loading model weights from checkpoint: {checkpoint_dir}\")\n", " \n", " # List checkpoint files\n", " print(\"๐Ÿ“ Checkpoint files:\")\n", " for file in Path(checkpoint_dir).iterdir():\n", " if file.is_file():\n", " print(f\" {file.name} ({file.stat().st_size} bytes)\")\n", " \n", " print()\n", " \n", " # Load the model from checkpoint\n", " model, tokenizer, config = load_model(Path(checkpoint_dir))\n", " if model is None:\n", " return None, None, None\n", " \n", " # Test the model\n", " test_model(model, tokenizer, config)\n", " \n", " print(\"\\n๐ŸŽ‰ Model loading and testing completed successfully!\")\n", " \n", " return model, tokenizer, config\n", "\n", "if __name__ == \"__main__\":\n", " model, tokenizer, config = main()" ] }, { "cell_type": "code", "execution_count": 4, "id": "56628930", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Using MTP-specific generation...\n", "Generated SELFIES: [=C][C][Branch2][Ring1][Ring1][C][=C][C][=C][Branch1][=Branch2][C][=C][C][=C][C][=N][Ring1][=Branch1][S][Ring1][O][=C][C][=C][Ring1][S][N][C][C][C][C][C][Ring1][=Branch1]\n", "Decoded SMILES: C1C(C2=CC=C(C3=CC=CC=N3)S2=C)C=C1N4CCCCC4\n" ] }, { "data": { "image/jpeg": "/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBwcJCQgKDBQNDAsLDBkSEw8UHRofHh0aHBwgJC4nICIsIxwcKDcpLDAxNDQ0Hyc5PTgyPC4zNDL/2wBDAQkJCQwLDBgNDRgyIRwhMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjL/wAARCAEsASwDASIAAhEBAxEB/8QAHwAAAQUBAQEBAQEAAAAAAAAAAAECAwQFBgcICQoL/8QAtRAAAgEDAwIEAwUFBAQAAAF9AQIDAAQRBRIhMUEGE1FhByJxFDKBkaEII0KxwRVS0fAkM2JyggkKFhcYGRolJicoKSo0NTY3ODk6Q0RFRkdISUpTVFVWV1hZWmNkZWZnaGlqc3R1dnd4eXqDhIWGh4iJipKTlJWWl5iZmqKjpKWmp6ipqrKztLW2t7i5usLDxMXGx8jJytLT1NXW19jZ2uHi4+Tl5ufo6erx8vP09fb3+Pn6/8QAHwEAAwEBAQEBAQEBAQAAAAAAAAECAwQFBgcICQoL/8QAtREAAgECBAQDBAcFBAQAAQJ3AAECAxEEBSExBhJBUQdhcRMiMoEIFEKRobHBCSMzUvAVYnLRChYkNOEl8RcYGRomJygpKjU2Nzg5OkNERUZHSElKU1RVVldYWVpjZGVmZ2hpanN0dXZ3eHl6goOEhYaHiImKkpOUlZaXmJmaoqOkpaanqKmqsrO0tba3uLm6wsPExcbHyMnK0tPU1dbX2Nna4uPk5ebn6Onq8vP09fb3+Pn6/9oADAMBAAIRAxEAPwD3+iiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAoornPGviyLwdoa35s5L65mnS2trSJsPNKx4UcHsCeh6UAdHRXnw+J15bKP7S8A+KoD3MFoJ1H4gilHxm8IREC/k1HTj6XdhKuPyBoA9Aork7T4n+CL3HleJ9OXP/PaXyv/AEPFdBp+rabq8byabqFpeohwzW0yyBT6EqTigC5RRRQAUUUUAFFFFABRRRQAUUUUAFFFFABRRRQAUUUUAFFFFABRRRQAUUUUAFFFFABRRWX4i16z8M+H73WL9sQWsZcgHlz/AAqPcnAH1oA1KK4nwj8SbHxFMmm6laTaJrpUP/Z94CrSKeQ0bEDcCO2M9eOM121ABRRRQAUUUUAFeQ+KNfguPG+pa5cYk0jwXbHYpPyzahKMKvvjgex+teg+MfEUXhTwnqGsSAM0Ef7lD/HKeEX8WI/DNeW6N4dludX8O+DLkmVrUnxB4ic8+ZcOcxxt6nJGR3ABoA9R8FxaxF4Q046/cvcapJF5s7OoUqWO4JgD+EED8K3iMjBoooA5nxdaaLp/hvUtXuvDlhqX2OBp2hkgQlwoyeSpxxk9K4/wMunaV8TryLSrWKz03X9DttUghiACKVO0gAcfxE8V6df2ceoaddWU3+quInif6MCD/OvDfC95JaJ8NtSn+WWzvLrQLv6tkRr+GAaAPeqKKKACiiigAooooAKKKKACiiigAooooAKKKKACiuS+JOv3fh3wTd3OnPt1Od0tbL5QxMsjADAPGQMn8Kx7f4WSXtvH/wAJL4u8QapIVHmwreGKAnvhF5/WgDr9U8UaBogP9p6zYWhH8M1wqt+Azk1y0vxi8LSSmHSV1PWphx5em2Mkhz9SAP1rU0z4Z+C9IwbTw5YFl6PPH5zA+uXya6iKKOCNY4o0jjXgKgwB+FAHD6J8STfeJI9G1rw/faBJdrmwe+OPtJHVemFbkcZP54B7usTxV4XsPF2iSabfBlOd8E6cSQSD7rqexH69KxPBviLUYrt/Cnil0XXrVcxTjhb6EdJF/wBrA5H/ANcC4U5zvyq9ld+gXsdtRRRUAFFFFABXmb/8XJ8e+UPn8L+HJ8ueqXl6O3uqfz9Qa1fiH4gvYIrTwxoLf8T/AFomKFh/y7Q/8tJj6YGce/TpXReG/D9l4X8P2mj2CYgtk27j1durMfcnJoAh8TeEtG8W2AtdWtBJs5imQ7ZYW/vI3UH9PXNccNW8VfDg7NeE3iHw0vC6nCmbq1X/AKbL/Go/vDnuewr0yggEYPIoApaVq+n65p8d/pd5Fd2sg+WWJsj6H0PseRV2vJvHei2/gFf+Ep8KXbaXqVxOkR0yJN8GoyMcBPKyMN1OV9D3Oa1lsPipq8am61rQ9CRgCVsrVriRfY+YdufoaAPQ6ydT8UaBouRqetafaMP4JrhFb8icmuVPwtF+M+IPFviLVM/eh+1eTCf+AIP61qaX8MvBWjkNaeHLEuOjzoZmHvlyTQBx174n0b4kfEbQtF02/iudK03fqU+coLmdOI41DY3bc7j1BBPpXWfD/wAPahpdvqmra5Gqa3q949xcqrhxGgJWOMMOoC9PrVzxT4J0zxPpkUBBsry0O+xvbYbJLV+xXGOOmR39jgip4P1zXTPPoXiqz8rU7TAS+jGIL5DnDKezccr/AC6UpSUVduwHYUUUUwCvAvEVvJp1n8QbKEfvtI1m11+1H/XQgsR9BmvfGZUUsxAUDJJPAFeT+Jdf0jxDfaxpfg/RjrutajaGxvL6FtttBGQQN8h+UkZyAOuOvagD1S2uI7u1huYTuimRZEPqCMipa8u0rxXrngDSbPSvGOgznT7OFII9Y00m4i2KAAZFwGXAHJxz6V6Bo+vaV4gshd6RqFveQHq0Lg7fYjqD7GgDRooooAKxfFGqX2kaSlxptvFcXT3EUKRSkgNvcKRkdDzwag8TeMdN8MpHFL5l1qM/FtYWw3zTHtgDoPc/r0rL0fRPEesXI1XxTfG2G+OW30m1KmODY4dS7EHc2Rg47E8+ndQw/KlXrWUfP7Xot/V7edyW+iNnQ/EC63f3awBfsqQQSxEgh8vv3KwzwQUxjsc1uV5X400ew0zWbm7uNH1xLC6ZLltU0u45tJxuBYRjkDDEljnljgdcyaJ4l8SW9uLjTb+z8ZaOpG5oT5V9Cv8AtJ/Fj0xuNdVXLo1KarUXZNLR+n82179HyvsiVOzsz1CiuW8O/ELw74kmNrb3ZtdRU7X0+9XybhT6bD1/DNdTXjmgUUUUAFFFBIAyTgUAeeeI/wDiovi14c0IfNa6RE+r3Q7b/uQj6g5P0Neh1558MQdZu/EfjJxkavfGO1Y/8+0PyJj6ndn6V6HQAUUUUAFc94u8KxeJtPj8uU2mp2jebY3qfehkH81OOR/hXQ0VpSqzozVSDs0Jq6sznfD2u3E1umna8IbbX4bfzri3jbcCgYqJBjjBxnA6ZFaja1piafDfy39vDaTxrLHLNIIwykZB+bHY1y2tabJd+KtTks5oF1aKxt57KNpVDSBWmWRSuchGDhSSMZIPUUaR8NvDhsrG71XRIZtT+yQJcec5kAZY1Ujbkr27CvQqUsL/ABJyavbRJPdX0u1p0T17Epy2RNffFHwhZblTVBeyqM+VYxtOx5A42jHUjvWd/wALD17Ujt0H4f63Pn7smolLNPr82ciu5s9OstOi8qxs7e2j/uwRKg/ICrNcFZ0nL90ml5u/5JFK/U8V0XVr3Q/FeuXPi+FNK8V6snlaZeXfzWKxgfJErqTjDcnPXA7nnrdG+IbW+ox6F4zs10XWG4imLZtLv3jkPA/3T6gdeK7DVdI0/XNOl0/VLOK7tJRhopVyPqPQ+45FeYa14N1bwzp0lnb2Z8V+EW5fSLo7rq0HrA/U47Dr6dzWQz1ymu6RRtJIyoigszMcAAdSTXj3hDWdV05In8KXcnifw55ixS6dcuE1DTCTjHzfeUe/pxwCa3/Hl9c+JdZtvh/pEzRvdIJtXuY+tvad1/3n6Y9D6GgCDwyj/EDxi/jG6Vv7E01nt9EiYcSNnElwR7kYH09RXpdV7CxttMsLexs4VhtreMRxRr0VQMAVYoAKKKKACq95Zw3sBilX3Vh1U+oqxRUzhGcXGSumCdjNtri5tN8F8GdUA2TqM7wTgAj+9UlxqcUARRFM8z/ciCEMfz7VLevF5axSLvMhwqbsEkc8H1FJYFmgJZw+HYD5gxUehI71wpVIy9hCfTd6tf0tt31Zem7MnUvD58T6fLZ67n7DLjdaQuVyAQfmYc9u1aum6XYaPYx2Om2kNraxjCxQoFUf/X96t0V10qSpRsm35t3/AK+WhLdwIBGDyK4rWPhhod9eHUtJe40HVu15pj+VuP8AtIPlYevGT612tFaiPOf7b8e+D+Nd0tPEumL/AMv+lJtuVHq8PQ/8B496im+IeoeL3/sz4f2byTED7Vqd7C0cNlnttIy8g9On1Gcel14Tqep33hzT/iFJYXUttJZeIbe8do2wSkxTcD7GgD0/wp4IsvDIlupJ5tS1e4O651G6O6RycZC/3V4HA9BnOK6eiim23uAVyWu/DvQ9ZmN5bifSdTDb1vtNfyZA3qccN15yPxrraKcZyimk7XA8qufBOv6jqVtpniyw0vxLpjEqmsJ/ot5agAkFtvXoOFPJPNafwye+tNQ8U6BealdX0WlX6x2z3T75FiZMqC3fgV6FXn/hr/RfjJ43t+guoLG5UfSMof1qQPQKKKKACuY+It1qNn8Pdcm0q3kuL37MyIkYywDYVmGO6qS34V09FAHNfD1tNPw/0NdJmWW0S0RQy/3gPnz6NuzketdLXmOr2Vz8Mtcm8SaRC8vhi8k3atp8Yz9lY/8ALxGPT+8P6Y2+j2V5bajZQ3lnOk9tOgkjlQ5VlPQigCeiiigAoorm/H2vnw14G1bVIzidISkGOplf5Ux6/MQfwoAx/A//ABOvFPibxQ3zRyXA0+zP/TKLgkezNz+Fd5WF4M0U+HvB2l6W4HnQwKZsDGZG+Z//AB4mt2unF1o1qrlD4dEvRKy/DfzFFWQUUUVzDCiiigDjfE/w/tdXvf7a0e5fRvEUYJjv7bjzD/dlXo6n35/DipvAfhK48M6fdXGq3SXuvajMZ7+8UcO38KrwPlUdBgdTwOldZRQAUUUUAFFFFABRRRQBmXpDa3p6noiyOfyxS6CD/ZMbnq7Mx/76NcN4l8RX1l4811YZ8WemeFZbvyyoI8/edpzjPQdK6/wZNcXPgnRLm7INxPZRTSYGBuZQx4/GuGnhZxxLrN6O/wCKiv8A238SnLSxuUUUV3EhRRRQAV4d40tc6t8VLPHF1pdlegf9chgn9K9xryfxda7/AImazbY+XU/B1xEB6urt/Q0Aek6JdfbdA067znz7WKTP+8oP9av1y/w3uvtfw28OS5ziwijz/urt/pXUUAFFFFABXn4/0X4/MOiXvh0H6uk/+Favivx5Y+G5o9NtoJdU124H+j6Za8yN7sf4F9z/AENczeavDd/G3wnFHt/tBdOuE1C3icSfZcpvCuw4+8CPy9RQB6jRRRQAUUUUANkjSWNo5EV0YFWVhkEHqCK8v/ffCTW/45PA+oTe5OlTMf8A0Ux/L6/e9SqC9srbUbKeyvIEntp0McsTjIZT1BoAlR0kjWSNldGAKspyCD3BrH1Pxd4c0XI1LXNPtmHVJLhQ/wD3znJ/KuRtPgzo8VsLO91zxDfWCEiKylv2WGNOyhVx/Suh0v4deDtG2my8OaerL0eSISuP+BPk/rQBjP8AGHw7cSGLRbTWNdkBxjTbB3Gfq22s69XxR4/1vQ4b7wtNo/h+zvVvbiS6uYzJMyAlEMY5Ayeev6V6eiLGgRFCqBgKowBTqACiiigAooooAKKKKACiiigAooooAKKKKACiiigDzLxR4K12/uvHN7aLBK+rWNtaWCebhsKP3m7OAPbnmvQ9MtPsGk2dn/z7wJFx/sqB/SrVFABRRRQAUUUUAFcL408MeILvxLpXibw1cWRvdPhkge0vQRHPG/Ubh0P146c+vdUUAef6d8TbCwni0vxVpVx4XvD8qC4UG1c/7Eq/Lj64HvXewzRXEKTQyJJE43K6MCrD1BHWor2ws9StHtL61hurdxh4pkDq31BrhJvhrcaJM934F1y40WQnc1jMTPZyH/cbJXPqM47CgD0GSRIYmlldUjQFmZjgKB1JNeZ6v471DxKbqz8Gyw2umW+RfeJLsYt4AOvlZ++3v0/MGs7xN4qvBo50P4heDNRkE0iCKTRpDJBduDkLkMGXOPunJrQ0jwHqHiYWtz4wghstJt8Gx8NWZ2wQgdDMR99vbp+ZFAHP+GdLu9YE9r4HNxZ2E7Eaj4tvlLXV6c/MIc84z34x7Ec+p+GPCWkeEdPNppVvtLndNPId0s7f3nbqT19ueBWzFFHBEkUMaxxooVEQYCgdAAOgpf/2Q==", "image/png": "", "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Generate Mol Viz with MTP-specific generation\n", "from rdkit import Chem\n", "from rdkit.Chem import Draw\n", "import selfies as sf\n", "import torch\n", "\n", "# Setup device first\n", "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", "\n", "# Check if MTP-specific generation is available\n", "if hasattr(model, 'generate_with_logprobs'):\n", " print(\"Using MTP-specific generation...\")\n", " input_ids = tokenizer(\"\", return_tensors=\"pt\").input_ids.to(device)\n", " \n", " # Try MTP-specific generation with log probabilities\n", " try:\n", " outputs = model.generate_with_logprobs(\n", " input_ids,\n", " max_new_tokens=25, # Correct parameter name\n", " temperature=1,\n", " top_k=50,\n", " do_sample=True,\n", " return_probs=True, # This returns action probabilities\n", " tokenizer=tokenizer # Pass tokenizer for decoding\n", " )\n", " \n", " # Handle the output (returns: decoded_list, logprobs, tokens, probs)\n", " gen = outputs[2] # Get the generated token IDs (index 2)\n", " except Exception as e:\n", " print(f\"MTP generation failed: {e}, falling back to standard generation\")\n", " gen = model.generate(input_ids, max_length=25, top_k=50, temperature=1, do_sample=True, pad_token_id=tokenizer.pad_token_id)\n", "else:\n", " print(\"Using standard generation...\")\n", " input_ids = tokenizer(\"\", return_tensors=\"pt\").input_ids.to(device)\n", " gen = model.generate(input_ids, max_length=25, top_k=50, temperature=1, do_sample=True, pad_token_id=tokenizer.pad_token_id)\n", "\n", "# Decode and process the generated molecule\n", "generatedmol = tokenizer.decode(gen[0], skip_special_tokens=True)\n", "test = generatedmol.replace(' ', '')\n", "csmi_gen = sf.decoder(test)\n", "print(f\"Generated SELFIES: {test}\")\n", "print(f\"Decoded SMILES: {csmi_gen}\")\n", "\n", "mol = Chem.MolFromSmiles(csmi_gen)\n", "\n", "# Draw the molecule\n", "if mol is not None:\n", " img = Draw.MolToImage(mol)\n", " display(img) # Use display() in Jupyter notebooks\n", "else:\n", " print(\"โŒ Could not create molecule from generated SMILES\")\n" ] }, { "cell_type": "code", "execution_count": 5, "id": "0dc9e278", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "--- Standard Generation Test ---\n", "Generated SELFIES 1: [C]\n", "Generated SELFIES 2: [C] .[N] [=C] [Branch1] [C] [N] [S] [N] [C] [C] [C] [C] [=C] [C] [Branch1] [C] [Br] [=C] [C] [=C] [Ring1] [#Branch1] [C] [Ring1] [N]\n", "Generated SELFIES 3: [C] [Ring1] [Ring1] [C] [C] [C] [C] [C] [Ring1] [=Branch1]\n" ] } ], "source": [ "print(\"\\n--- Standard Generation Test ---\")\n", "input_ids = tokenizer(\" [C]\", return_tensors=\"pt\").input_ids.to(device)\n", "with torch.no_grad():\n", " model.set_mtp_training(False)\n", " gen = model.generate(\n", " input_ids,\n", " max_length=25,\n", " top_k=50,\n", " top_p=0.9,\n", " temperature=1.0,\n", " do_sample=True,\n", " pad_token_id=tokenizer.pad_token_id,\n", " eos_token_id=tokenizer.eos_token_id,\n", " num_return_sequences=3,\n", " )\n", " for i, sequence in enumerate(gen):\n", " result = tokenizer.decode(sequence, skip_special_tokens=True)\n", " print(f\"Generated SELFIES {i+1}: {result}\")" ] }, { "cell_type": "code", "execution_count": 6, "id": "366bd9c2", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Device set to use cuda:0\n" ] }, { "data": { "text/plain": [ "[{'label': 'Easy', 'score': 0.9802612066268921}]" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from transformers import pipeline\n", "\n", "classifier = pipeline(\"text-classification\", model=\"gbyuvd/synthaccess-chemselfies\")\n", "classifier(\".[C] [C] [=C] [C] [=C] [Branch1] [P] [C] [N] [C] [C] [C@@H1] [C] [C] [C@@H1] [C] [C@H1] [Ring1] [=Branch1] [C@H1] [Ring1] [=Branch2] [C] [Ring1] [Ring2] [S] [Ring1] [#C]\") # Gabapentin\n", "# [{'label': 'Easy', 'score': 0.9187200665473938}]\n" ] } ], "metadata": { "kernelspec": { "display_name": "base", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.13.0" } }, "nbformat": 4, "nbformat_minor": 5 }