Improve model card: Add metadata, abstract, update GitHub link, and refine usage
This PR enhances the model card for ProLLaMA by:
- Adding `pipeline_tag: text-generation` to improve discoverability on the Hugging Face Hub.
- Adding `library_name: transformers` to enable the automated "how to use" widget.
- Adding the `protein-language-model` tag for better categorization.
- Updating the GitHub repository link to `https://github.com/PKU-YuanGroup/ProLLaMA` as indicated in the project's official GitHub README.
- Including the paper's abstract for a comprehensive overview of the model.
- Refining the usage section to include the input format and a concise Python inference example from the GitHub README's "Quick Inference" section, providing a clear and functional `transformers` snippet.
These changes will significantly improve the model's visibility, context, and user experience on the Hugging Face Hub.
The previous card linked to the old `https://github.com/Lyu6PosHao/ProLLaMA` repository and embedded the full `main.py` inference script (interactive and batch-file modes, launched via `CUDA_VISIBLE_DEVICES=0 python main.py --model "GreatCaptainNemo/ProLLaMA_Stage_1" --interactive`). The updated model card in full:
---
license: apache-2.0
pipeline_tag: text-generation
library_name: transformers
tags:
- protein-language-model
---

# ProLLaMA: A Protein Large Language Model for Multi-Task Protein Language Processing

See the [paper on arXiv](https://arxiv.org/abs/2402.16445) and the [GitHub repository](https://github.com/PKU-YuanGroup/ProLLaMA) for more information.

ProLLaMA_Stage_1 is based on Llama-2-7b, so please also follow the Llama 2 license.
## Abstract

Recent advances in Protein Language Models (PLMs) have transformed protein engineering, yet unlike their counterparts in Natural Language Processing (NLP), current PLMs exhibit a fundamental limitation: they excel in either Protein Language Understanding (PLU) or Protein Language Generation (PLG), but rarely both. This fragmentation hinders progress in protein engineering. To bridge this gap, we introduce ProLLaMA, a multitask protein language model enhanced by the Evolutionary Protein Generation Framework (EPGF). We construct a comprehensive instruction dataset containing approximately 13 million samples with over 11,000 superfamily annotations to facilitate better modeling of sequence-function landscapes. We leverage a two-stage training approach to develop ProLLaMA, a multitask LLM with protein domain expertise. Our EPGF addresses the mismatch between statistic language modeling and biological constraints through three innovations: a multi-dimensional interpretable scorer, hierarchical efficient decoding, and a probabilistic-biophysical joint selection mechanism. Extensive experiments demonstrate that ProLLaMA excels in both unconditional and controllable protein generation tasks, achieving superior structural quality metrics compared to existing PLMs. Additionally, ProLLaMA demonstrates strong understanding capabilities with a 67.1% exact match rate in superfamily prediction. EPGF significantly enhances the biological viability of generated sequences, as evidenced by improved biophysical scores (+4.3%) and structural metrics (+14.5%).
## Usage

This model is compatible with the `transformers` library. Below is a quick example of how to perform inference.

### Input Format

Instructions you give the model should follow this format:

```text
[Generate by superfamily] Superfamily=<xxx>
or
[Determine superfamily] Seq=<yyy>
```

Here are some examples of the input:

```text
[Generate by superfamily] Superfamily=<Ankyrin repeat-containing domain superfamily>
```

```text
# You can also specify the first few amino acids of the protein sequence:
[Generate by superfamily] Superfamily=<Ankyrin repeat-containing domain superfamily> Seq=<MKRVL
```

```text
[Determine superfamily] Seq=<MAPGGMPREFPSFVRTLPEADLGYPALRGWVLQGERGCVLYWEAVTEVALPEHCHAECWGVVVDGRMELMVDGYTRVYTRGDLYVVPPQARHRARVFPGFRGVEHLSDPDLLPVRKR>
```

For a full list of supported superfamilies, refer to [this file](https://github.com/PKU-YuanGroup/ProLLaMA/blob/main/superfamilies.txt) in the GitHub repository.
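The instruction strings are position-sensitive, so it can be convenient to build them programmatically. The helper below is a minimal sketch and not part of the official repository; the function names are illustrative:

```python
# Illustrative helpers for composing ProLLaMA instructions.
# These are convenience assumptions, not part of the official codebase.

def generation_prompt(superfamily: str, seq_prefix: str = "") -> str:
    """Build a '[Generate by superfamily]' instruction.

    If seq_prefix is given, the prompt asks the model to continue a protein
    sequence beginning with those residues; the Seq field is deliberately
    left unterminated, matching the README examples.
    """
    prompt = f"[Generate by superfamily] Superfamily=<{superfamily}>"
    if seq_prefix:
        prompt += f" Seq=<{seq_prefix}"
    return prompt


def understanding_prompt(sequence: str) -> str:
    """Build a '[Determine superfamily]' instruction for a complete sequence."""
    return f"[Determine superfamily] Seq=<{sequence}>"


print(generation_prompt("Ankyrin repeat-containing domain superfamily", "MKRVL"))
# [Generate by superfamily] Superfamily=<Ankyrin repeat-containing domain superfamily> Seq=<MKRVL
```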
### Quick Inference Example

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig

device = torch.device('cuda:0')

# You can replace the model path with a local path.
tokenizer = AutoTokenizer.from_pretrained("GreatCaptainNemo/ProLLaMA", use_fast=False, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    "GreatCaptainNemo/ProLLaMA",
    device_map="auto",
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
)

# Sampling parameters as in the upstream README's Quick Inference script.
generation_config = GenerationConfig(
    temperature=0.2,
    top_k=40,
    top_p=0.9,
    do_sample=True,
    num_beams=1,
    repetition_penalty=1.2,
    max_new_tokens=400,
)

model.eval()
print("####Enter 'exit' to exit.")
with torch.no_grad():
    while True:
        # Type an instruction in the format shown above; 'exit' quits.
        user = str(input("Input:"))
        if user.strip() == "exit":
            break
        inputs = tokenizer(user, return_tensors="pt").to(device)
        generate_ids = model.generate(
            inputs.input_ids,
            attention_mask=inputs.attention_mask,
            generation_config=generation_config,
        )
        response = tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        print("Output:", response)
```
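For scripted rather than interactive use, the same pieces can be wrapped in a function. The sketch below is an assumption, not repository code: it reuses `model`, `tokenizer`, `generation_config`, and `device` from the snippet above, and the regex for pulling a sequence out of the `Seq=<...>` field is our own heuristic:

```python
import re

def run_prollama(prompt: str) -> str:
    """Run a single instruction through the model and return the decoded text."""
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    with torch.no_grad():
        generate_ids = model.generate(
            inputs.input_ids,
            attention_mask=inputs.attention_mask,
            generation_config=generation_config,
        )
    return tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]

output = run_prollama("[Generate by superfamily] Superfamily=<Ankyrin repeat-containing domain superfamily>")
print(output)

# Heuristic post-processing (assumes the model echoes the prompt and completes
# the Seq=<...> field, as in the README examples).
match = re.search(r"Seq=<([A-Z]+)", output)
if match:
    print("Generated sequence:", match.group(1))
```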
## Citation

```
@article{lv2025prollama,
title={Prollama: A protein large language model for multi-task protein language processing},