Upload README.md with huggingface_hub
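For context, a commit like this can be produced programmatically; a minimal sketch using huggingface_hub's `upload_file` (the repo id is an assumption, taken from the model path in the README diff below):

```python
from huggingface_hub import HfApi

# Upload a local README.md to the model repo.
# repo_id is assumed from the model path referenced in the diff below.
api = HfApi()
api.upload_file(
    path_or_fileobj='README.md',
    path_in_repo='README.md',
    repo_id='vita-group/llama-2-7b_wanda_2_4_gptq_4bit_128g',
    repo_type='model',
    commit_message='Upload README.md with huggingface_hub',
)
```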
README.md CHANGED
@@ -37,16 +37,43 @@ outputs = model.generate(input_ids, max_new_tokens=128)
 print(tokenizer.decode(outputs[0]))
 ```
 
-How to use
+How to use wanda+gptq models
 ```python
-from
+from transformers import AutoTokenizer
+from auto_gptq import AutoGPTQForCausalLM
 model_path = 'vita-group/llama-2-7b_wanda_2_4_gptq_4bit_128g'
+tokenizer_path = 'meta-llama/Llama-2-7b-hf'
 model = AutoGPTQForCausalLM.from_quantized(
     model_path,
     # inject_fused_attention=False, # or
     disable_exllama=True,
     device_map='auto',
+    revision='2bit_128g',
 )
+tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, trust_remote_code=True)
+input_ids = tokenizer('Hello! I am a VITA-compressed-LLM chatbot!', return_tensors='pt').input_ids.to('cuda')
+outputs = model.generate(input_ids=input_ids, max_length=128)
+tokenizer.decode(outputs[0])
+```
+
+How to use gptq models
+```python
+from transformers import AutoTokenizer
+from auto_gptq import AutoGPTQForCausalLM
+model_path = 'vita-group/vicuna-7b-v1.3_gptq'
+tokenizer_path = 'lmsys/vicuna-7b-v1.3'
+revision = '2bit_128g'
+model = AutoGPTQForCausalLM.from_quantized(
+    model_path,
+    # inject_fused_attention=False, # or
+    disable_exllama=True,
+    device_map='auto',
+    revision=revision,
+)
+tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, trust_remote_code=True)
+input_ids = tokenizer('Hello! I am a VITA-compressed-LLM chatbot!', return_tensors='pt').input_ids.to('cuda')
+outputs = model.generate(input_ids=input_ids, max_length=128)
+tokenizer.decode(outputs[0])
 ```
 
 
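The added examples select a quantization variant through the `revision` argument, which refers to a branch of the model repo. A minimal sketch for discovering the available revisions, assuming each variant (e.g. '2bit_128g') lives on its own branch:

```python
from huggingface_hub import HfApi

# List the branches of the repo; each quantization variant is
# assumed to be stored as a separate branch.
api = HfApi()
refs = api.list_repo_refs('vita-group/llama-2-7b_wanda_2_4_gptq_4bit_128g')
print([branch.name for branch in refs.branches])
```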