---
# NOTE(review): the top-level `version` attribute is obsolete under the
# Compose Specification — Docker Compose v2 ignores it and prints a warning.
# Kept for compatibility with legacy docker-compose v1.x tooling.
version: '3.8'

services:
  # Full-precision inference service. Builds the local Dockerfile and runs
  # with GPU access; model weights and HF caches persist via bind mounts.
  kirim-v1-base:
    build:
      context: .
      dockerfile: Dockerfile
    image: kirim-ai/kirim-v1-base:latest
    container_name: kirim-v1-inference

    # GPU support: reserve exactly one GPU. The previous `count: all`
    # reserved every GPU on the host, but CUDA_VISIBLE_DEVICES=0 below
    # restricts the app to device 0 anyway, so the extra reservations were
    # wasted; `count: 1` also matches the kirim-v1-4bit sibling service.
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]

    # Environment variables
    environment:
      - CUDA_VISIBLE_DEVICES=0
      - HF_HOME=/app/models/.cache
      # NOTE(review): TRANSFORMERS_CACHE is deprecated in recent transformers
      # releases in favor of HF_HOME; kept so that older library versions
      # inside the image still resolve this cache path. Note it points at
      # /app/models while HF_HOME points at /app/models/.cache — confirm the
      # split is intentional.
      - TRANSFORMERS_CACHE=/app/models
      - MODEL_PATH=Kirim-ai/Kirim-V1-base
      - LOAD_IN_4BIT=false
      - LOAD_IN_8BIT=false

    # Volumes for persistent storage (bind mounts). The host HF cache is
    # shared so previously downloaded models are reused across rebuilds.
    volumes:
      - ./models:/app/models
      - ./outputs:/app/outputs
      - ~/.cache/huggingface:/root/.cache/huggingface

    # Port mapping (for API server)
    ports:
      - "8000:8000"

    # Keep container running
    stdin_open: true
    tty: true

    # Restart policy
    restart: unless-stopped

    # Resource limits: large /dev/shm (PyTorch dataloaders/IPC need more
    # than the 64 MB Docker default)
    shm_size: '16gb'

    # Health check: the container is healthy only if torch can see CUDA
    healthcheck:
      test: ["CMD", "python3", "-c", "import torch; assert torch.cuda.is_available()"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 60s

  # Optional: Quantized version (4-bit)
  # Started only when the "quantized" profile is enabled, e.g.
  # `docker compose --profile quantized up`. Builds the same Dockerfile as
  # kirim-v1-base; 4-bit loading is selected at runtime, not at build time.
  kirim-v1-4bit:
    build:
      context: .
      dockerfile: Dockerfile
    image: kirim-ai/kirim-v1-base:4bit
    container_name: kirim-v1-4bit
    profiles: ["quantized"]

    # Reserve a single GPU for the quantized model
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]

    environment:
      - CUDA_VISIBLE_DEVICES=0
      - HF_HOME=/app/models/.cache
      # NOTE(review): redundant with the --load_in_4bit flag passed in
      # `command` below — presumably inference.py honors either; confirm.
      - LOAD_IN_4BIT=true

    # NOTE(review): unlike kirim-v1-base, this service does not mount the
    # host's ~/.cache/huggingface, so downloads are not shared with the
    # host cache — confirm whether that is intentional.
    volumes:
      - ./models:/app/models
      - ./outputs:/app/outputs

    # Host port 8001 -> container 8000, avoiding a clash with kirim-v1-base
    ports:
      - "8001:8000"

    stdin_open: true
    tty: true
    restart: unless-stopped
    shm_size: '8gb'

    # Launch directly in 4-bit chat mode instead of the image default
    command: ["python3", "inference.py", "--load_in_4bit", "--chat"]

# NOTE(review): these named volumes are declared but never referenced —
# both services mount bind paths (./models, ./outputs) instead. Either
# switch the services to these named volumes or drop this block; confirm
# which is intended before changing it.
volumes:
  models:
  outputs:

# All services join a single, explicitly named network
networks:
  default:
    name: kirim-network