# Kirim-V1-Base / docker-compose.yml
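#
# Usage (a sketch; assumes Docker Compose v2 and the NVIDIA Container Toolkit
# are installed on the host):
#   docker compose up -d                      # start the full-precision service
#   docker compose --profile quantized up -d  # also start the 4-bit service
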
version: '3.8'  # informational only; ignored by Docker Compose v2
services:
kirim-v1-base:
build:
context: .
dockerfile: Dockerfile
image: kirim-ai/kirim-v1-base:latest
container_name: kirim-v1-inference
# GPU support
deploy:
resources:
reservations:
devices:
- driver: nvidia
count: all
capabilities: [gpu]
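    # GPU reservations require the NVIDIA Container Toolkit on the host;
    # `count: all` exposes every visible GPU to the container.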
# Environment variables
environment:
- CUDA_VISIBLE_DEVICES=0
- HF_HOME=/app/models/.cache
- TRANSFORMERS_CACHE=/app/models
- MODEL_PATH=Kirim-ai/Kirim-V1-base
- LOAD_IN_4BIT=false
- LOAD_IN_8BIT=false
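    # Notes on the environment, hedged where the entrypoint is assumed:
    # - CUDA_VISIBLE_DEVICES=0 pins inference to the first GPU even though all
    #   GPUs are reserved above; widen it (e.g. 0,1) for multi-GPU use.
    # - TRANSFORMERS_CACHE is deprecated in recent transformers releases in
    #   favor of HF_HOME; it is kept here for compatibility with older versions.
    # - LOAD_IN_4BIT / LOAD_IN_8BIT are presumably read by the image's
    #   inference.py to enable bitsandbytes quantization.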
# Volumes for persistent storage
volumes:
- ./models:/app/models
- ./outputs:/app/outputs
- ~/.cache/huggingface:/root/.cache/huggingface
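    # Bind-mounting the host's Hugging Face cache lets weights downloaded once
    # be reused across container rebuilds instead of being fetched again.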
# Port mapping (for API server)
ports:
- "8000:8000"
# Keep container running
stdin_open: true
tty: true
# Restart policy
restart: unless-stopped
# Resource limits
shm_size: '16gb'
# Health check
healthcheck:
test: ["CMD", "python3", "-c", "import torch; assert torch.cuda.is_available()"]
interval: 30s
timeout: 10s
retries: 3
start_period: 60s
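    # The container reports healthy only while PyTorch can see a CUDA device.
    # Check from the host with:
    #   docker inspect --format '{{.State.Health.Status}}' kirim-v1-inference
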
# Optional: Quantized version (4-bit)
kirim-v1-4bit:
build:
context: .
dockerfile: Dockerfile
image: kirim-ai/kirim-v1-base:4bit
container_name: kirim-v1-4bit
profiles: ["quantized"]
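    # The "quantized" profile keeps this service out of a plain `docker compose up`;
    # start it explicitly with: docker compose --profile quantized up -d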
deploy:
resources:
reservations:
devices:
- driver: nvidia
count: 1
capabilities: [gpu]
environment:
- CUDA_VISIBLE_DEVICES=0
- HF_HOME=/app/models/.cache
- LOAD_IN_4BIT=true
volumes:
- ./models:/app/models
- ./outputs:/app/outputs
ports:
- "8001:8000"
stdin_open: true
tty: true
restart: unless-stopped
shm_size: '8gb'
command: ["python3", "inference.py", "--load_in_4bit", "--chat"]
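    # With stdin_open and tty enabled, the interactive chat session can be
    # reached with `docker attach kirim-v1-4bit` (detach via Ctrl-p Ctrl-q).
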
# Note: both services persist data through the ./models and ./outputs bind
# mounts above, so no top-level named volumes are needed here.
networks:
default:
name: kirim-network
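
# Both services join kirim-network and can reach each other by service name
# (e.g. http://kirim-v1-base:8000 from inside the 4-bit container).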