```python
# pip install llama-cpp-python --prefer-binary --extra-index-url=https://jllllll.github.io/llama-cpp-python-cuBLAS-wheels/AVX2/cu118
from llama_cpp import Llama

# Load the GGUF model
llm = Llama(
    model_path="recurv_llama_13B.gguf",
    n_ctx=2048,    # Context window
    n_threads=4    # Number of CPU threads to use
)

prompt = "What is Paracetamol?"

# Run a single completion
output = llm(
    prompt,
    max_tokens=256,    # Maximum number of tokens to generate
    temperature=0.5,   # Controls randomness (0.0 = deterministic, 1.0 = creative)
    top_p=0.95,        # Nucleus sampling parameter
    stop=["###"],      # Optional stop sequences
    echo=True          # Include the prompt in the output
)

# Print the generated text
print(output['choices'][0]['text'])
```
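Because the wheel installed above is built against cuBLAS (CUDA 11.8), inference can also be offloaded to the GPU and streamed token by token. The sketch below is a minimal variant, assuming the same `recurv_llama_13B.gguf` file is present; `n_gpu_layers` and `stream=True` are standard llama-cpp-python options, and the layer count should be tuned to your available VRAM.

```python
from llama_cpp import Llama

# A minimal GPU + streaming sketch, assuming the cuBLAS wheel installed above
llm = Llama(
    model_path="recurv_llama_13B.gguf",  # Same model file as the example above
    n_ctx=2048,
    n_gpu_layers=-1,   # Offload all layers to the GPU; lower this if VRAM is tight
)

# stream=True yields completion chunks as they are generated
for chunk in llm(
    "What is Paracetamol?",
    max_tokens=256,
    temperature=0.5,
    stream=True,
):
    print(chunk["choices"][0]["text"], end="", flush=True)
print()
```

Streaming prints tokens as they arrive, which suits interactive use; set `n_gpu_layers=0` to fall back to CPU-only inference.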