import os
import streamlit as st
import torch

from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain.llms import HuggingFacePipeline
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, BitsAndBytesConfig

model_name_or_path = "meta-llama/Llama-2-13b-chat-hf"
# Count the number of GPUs available
gpu_count = torch.cuda.device_count()

# Determine the device placement based on GPU availability and count:
# - more than one GPU: 'auto' lets accelerate spread the layers across GPUs
# - exactly one GPU:   'cuda:0' places the whole model on the first GPU
# - no GPU:            fall back to the CPU
if torch.cuda.is_available() and gpu_count > 1:
    device = 'auto'
elif torch.cuda.is_available():
    device = 'cuda:0'
else:
    device = 'cpu'

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
model = AutoModelForCausalLM.from_pretrained(
    model_name_or_path,
    # quantization_config=bnb_config,
    torch_dtype=torch.float16,
    device_map=device,  # 'auto', 'cuda:0', or 'cpu' as selected above
)
print(model.hf_device_map)
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=2500,
    return_full_text=True,
    do_sample=True,
    repetition_penalty=1.15,
    num_return_sequences=1,
    pad_token_id=2,
    model_kwargs={
        "temperature": 0.3,
        "top_p": 0.95,
        "top_k": 40,
        "max_new_tokens": 2500,
    },
)

llm = HuggingFacePipeline(pipeline=pipe)
template = """Prompt: {query}
Answer: """

prompt_template = PromptTemplate(
    input_variables=["query"],
    template=template,
)

# Instantiate the chain
llm_chain = LLMChain(prompt=prompt_template, llm=llm)
st.title('Test Multi GPU')
md = st.text_area('Type in your markdown string (without outer quotes)')

if st.button("Enter"):
    with st.spinner(text="In progress..."):
        resp = llm_chain.invoke(md)['text']
        st.write(resp)
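
To confirm that the 13B checkpoint really was sharded across GPUs, a minimal sketch (assuming it is pasted right after the print(model.hf_device_map) line above, in the same process) is to report how much memory each GPU ends up holding:

# Sketch for checking model placement; assumes it runs after the
# from_pretrained call above so the weights are already loaded.
import torch

for i in range(torch.cuda.device_count()):
    allocated_gib = torch.cuda.memory_allocated(i) / 1024**3
    print(f"GPU {i} ({torch.cuda.get_device_name(i)}): {allocated_gib:.1f} GiB allocated")

With device_map='auto' and two GPUs, both indices should report several GiB for the fp16 13B weights; if only GPU 0 shows any allocation, the model ended up on a single device. Assuming the script is saved as, say, app.py, it is launched with `streamlit run app.py`.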