Oleg Lavrovsky committed
Commit 5eb40a3 · unverified · 0 Parent(s)

Initial release 🫂

Files changed (5)
  1. .gitignore +38 -0
  2. Dockerfile +18 -0
  3. README.md +8 -0
  4. app.py +129 -0
  5. requirements.txt +5 -0
.gitignore ADDED
@@ -0,0 +1,38 @@
+ *.py[cod]
+ .flaskenv
+ coverage.xml
+ profile/*.prof
+
+ build
+ eggs
+ parts
+ bin
+ var
+ sdist
+ develop-eggs
+ .installed.cfg
+ lib
+ lib64
+ .coverage
+ .tox
+ nosetests.xml
+ .cache
+ tests/.cache
+ .pytest_cache/
+ .vscode/
+
+ *.mo
+
+ .project
+ .pydevproject
+ /*.db
+ .idea/
+ .ropeproject/
+
+ .env
+ env/
+ debug.sh
+ .db/
+ data/
+ pip-log.txt
+ start.sh
Dockerfile ADDED
@@ -0,0 +1,18 @@
+ # Use the official lightweight Python image
+ FROM python:3-slim
+
+ # Set the working directory
+ WORKDIR /app
+
+ # Install dependencies
+ COPY requirements.txt .
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ # Copy project files
+ COPY . .
+
+ # Expose port
+ EXPOSE 8000
+
+ # Run the FastAPI app with Uvicorn (the application lives in app.py, so the module is "app")
+ CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8000"]
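For local development outside the container, the same server can be started directly from Python. A minimal launcher sketch, assuming app.py sits in the working directory and mirroring the host and port from the CMD above:

```python
# run_local.py — hypothetical helper, not part of this commit
import uvicorn

if __name__ == "__main__":
    # Same module:variable target, host, and port as the container CMD
    uvicorn.run("app:app", host="0.0.0.0", port=8000)
```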
README.md ADDED
@@ -0,0 +1,8 @@
+ Apertus transformer on FastAPI
+ ------------------------------
+
+ A FastAPI-based Python application that provides an API to interface with the Apertus LLM from the Swiss AI Initiative.
+
+ ## TODOs
+
+ - Implement the Apertus Format API: https://github.com/swiss-ai/apertus_format
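To illustrate the API described above, a minimal client sketch follows. It assumes the server is running locally on port 8000 and that the `requests` package is installed (it is not listed in requirements.txt):

```python
# query_api.py — hypothetical client, not part of this commit
import requests

# /predict takes the prompt as the `q` query parameter
resp = requests.get(
    "http://localhost:8000/predict",
    params={"q": "Summarise the Swiss AI Initiative in one sentence."},
    timeout=600,  # generation can take a while, especially on CPU
)
resp.raise_for_status()
print(resp.json()["text"])
```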
app.py ADDED
@@ -0,0 +1,129 @@
+ from contextlib import asynccontextmanager
+ from fastapi import FastAPI, HTTPException
+ from pydantic import BaseModel
+
+ from torch import cuda
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+
+ import logging
+ import time
+
+ # Configure logging
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+
+
+ model = None
+ tokenizer = None
+
+ class TextInput(BaseModel):
+     text: str
+     min_length: int = 3
+     # Apertus supports a context length of up to 65,536 tokens by default.
+     max_length: int = 65536
+
+ class ModelResponse(BaseModel):
+     text: str
+     confidence: float
+     processing_time: float
+
+
+ @asynccontextmanager
+ async def lifespan(app: FastAPI):
+     """Load the transformer model on startup"""
+     global model, tokenizer
+     try:
+         logger.info("Loading Apertus model...")
+         # TODO: make this configurable
+         model_name = "swiss-ai/Apertus-8B-Instruct-2509"
+
+         # Automatically select device based on availability
+         device = "cuda" if cuda.is_available() else "cpu"
+
+         # Load the tokenizer and the model
+         tokenizer = AutoTokenizer.from_pretrained(model_name)
+         model = AutoModelForCausalLM.from_pretrained(
+             model_name,
+         ).to(device)
+         logger.info("Model loaded successfully!")
+     except Exception as e:
+         logger.error(f"Failed to load model: {e}")
+         raise
+     # Release resources when the app is stopped
+     yield
+     model = None
+     tokenizer = None
+     if cuda.is_available():
+         cuda.empty_cache()
+
+
+ # Set up our app
+ app = FastAPI(
+     title="Apertus API",
+     description="REST API for serving Apertus models via Hugging Face transformers",
+     version="0.1.0",
+     lifespan=lifespan
+ )
+
+ @app.get("/predict", response_model=ModelResponse)
+ async def predict(q: str):
+     """Generate a model response for input text"""
+     if model is None or tokenizer is None:
+         raise HTTPException(status_code=503, detail="Model not loaded")
+
+     try:
+         start_time = time.time()
+
+         input_data = TextInput(text=q)
+
+         # Truncate text if too long (character count as a rough proxy for tokens)
+         text = input_data.text[:input_data.max_length]
+         if len(text) == input_data.max_length:
+             logger.warning("Input text truncated to max_length")
+         if len(text) < input_data.min_length:
+             logger.warning("Input text too short, aborting")
+             raise HTTPException(status_code=400, detail="Input text too short")
+
+         # Prepare the model input using the chat template
+         messages = [
+             {"role": "user", "content": text}
+         ]
+         text = tokenizer.apply_chat_template(
+             messages,
+             tokenize=False,
+             add_generation_prompt=True,
+         )
+         model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
+
+         # Generate the output
+         generated_ids = model.generate(**model_inputs, max_new_tokens=32768)
+
+         # Strip the prompt tokens and decode only the newly generated ones
+         output_ids = generated_ids[0][len(model_inputs.input_ids[0]):]
+         result = tokenizer.decode(output_ids, skip_special_tokens=True)
+
+         # Measure elapsed time
+         processing_time = time.time() - start_time
+
+         return ModelResponse(
+             text=result,
+             # Plain generation yields no score, so report a fixed placeholder
+             confidence=1.0,
+             processing_time=processing_time
+         )
+
+     except HTTPException:
+         # Let deliberate HTTP errors (such as the 400 above) propagate unchanged
+         raise
+     except Exception as e:
+         logger.error(f"Evaluation error: {e}")
+         raise HTTPException(status_code=500, detail="Evaluation failed")
+
+ @app.get("/health")
+ async def health_check():
+     """Health check and basic configuration"""
+     return {
+         "status": "healthy",
+         "model_loaded": model is not None,
+         "gpu_available": cuda.is_available()
+     }
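A quick way to exercise these endpoints without a separate server process is FastAPI's TestClient. A smoke-test sketch, assuming `httpx` is installed; note that entering the client context runs the lifespan handler, which downloads and loads the full 8B model:

```python
# test_app.py — hypothetical smoke test, not part of this commit
from fastapi.testclient import TestClient

from app import app

def test_health():
    # Using TestClient as a context manager triggers the lifespan (model load)
    with TestClient(app) as client:
        resp = client.get("/health")
        assert resp.status_code == 200
        assert resp.json()["model_loaded"] is True
```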
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ fastapi
+ pydantic
+ torch
+ transformers
+ uvicorn