version: '3.8'

services:
  kirim-v1-base:
    build:
      context: .
      dockerfile: Dockerfile
    image: kirim-ai/kirim-v1-base:latest
    container_name: kirim-v1-inference

    # GPU support
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]

    # Environment variables
    environment:
      - CUDA_VISIBLE_DEVICES=0
      - HF_HOME=/app/models/.cache
      - TRANSFORMERS_CACHE=/app/models
      - MODEL_PATH=Kirim-ai/Kirim-V1-base
      - LOAD_IN_4BIT=false
      - LOAD_IN_8BIT=false

    # Volumes for persistent storage
    volumes:
      - ./models:/app/models
      - ./outputs:/app/outputs
      - ~/.cache/huggingface:/root/.cache/huggingface

    # Port mapping (for API server)
    ports:
      - "8000:8000"

    # Keep container running
    stdin_open: true
    tty: true

    # Restart policy
    restart: unless-stopped

    # Resource limits
    shm_size: '16gb'

    # Health check
    healthcheck:
      test: ["CMD", "python3", "-c", "import torch; assert torch.cuda.is_available()"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 60s

  # Optional: Quantized version (4-bit)
  kirim-v1-4bit:
    build:
      context: .
      dockerfile: Dockerfile
    image: kirim-ai/kirim-v1-base:4bit
    container_name: kirim-v1-4bit
    profiles: ["quantized"]
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    environment:
      - CUDA_VISIBLE_DEVICES=0
      - HF_HOME=/app/models/.cache
      - LOAD_IN_4BIT=true
    volumes:
      - ./models:/app/models
      - ./outputs:/app/outputs
    ports:
      - "8001:8000"
    stdin_open: true
    tty: true
    restart: unless-stopped
    shm_size: '8gb'
    command: ["python3", "inference.py", "--load_in_4bit", "--chat"]

volumes:
  models:
  outputs:

networks:
  default:
    name: kirim-network
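
# Usage sketch (assumes Docker Compose v2 and the NVIDIA Container Toolkit on the host;
# exact flags beyond these standard ones may vary by setup):
#
#   # Start the full-precision inference service (default, no profile required):
#   docker compose up -d kirim-v1-base
#
#   # Start the optional 4-bit quantized service, which is gated behind the
#   # "quantized" profile defined above and listens on host port 8001:
#   docker compose --profile quantized up -d kirim-v1-4bit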