Oviya committed on
Commit
f4c8c9a
·
1 Parent(s): b8af084
Files changed (2) hide show
  1. Dockerfile +20 -1
  2. requirements.txt +3 -0
Dockerfile CHANGED
@@ -1,5 +1,10 @@
1
  FROM python:3.11-slim
2
- ENV DEBIAN_FRONTEND=noninteractive
 
 
 
 
 
3
 
4
  # System deps + Microsoft key via keyring (for msodbcsql17)
5
  RUN set -eux; \
@@ -8,6 +13,10 @@ RUN set -eux; \
8
  curl ca-certificates gnupg2 apt-transport-https \
9
  unixodbc unixodbc-dev \
10
  ffmpeg \
 
 
 
 
11
  ; \
12
  mkdir -p /etc/apt/keyrings; \
13
  curl -fsSL https://packages.microsoft.com/keys/microsoft.asc \
@@ -16,12 +25,22 @@ RUN set -eux; \
16
  > /etc/apt/sources.list.d/mssql-release.list; \
17
  apt-get update; \
18
  ACCEPT_EULA=Y apt-get install -y msodbcsql17; \
 
 
19
  rm -rf /var/lib/apt/lists/*
20
 
21
  WORKDIR /app
 
 
 
 
22
  COPY requirements.txt /app/
23
  RUN pip install --no-cache-dir -r requirements.txt
 
 
24
  COPY . /app
25
 
26
  EXPOSE 7860
 
 
27
  CMD ["gunicorn","--workers","2","--threads","4","--timeout","120","-b","0.0.0.0:7860","verification:app"]
 
1
  FROM python:3.11-slim
2
+ ARG DEBIAN_FRONTEND=noninteractive
3
+ ENV PYTHONUNBUFFERED=1 \
4
+ # DEBIAN_FRONTEND above is a build-time ARG (visible to RUN, not persisted in the runtime env). TESSERACT_CMD: tesseract path for pytesseract (your code reads this)
5
+ TESSERACT_CMD=/usr/bin/tesseract \
6
+ # Default Chroma persistence (your code also reads CHROMA_DIR)
7
+ CHROMA_DIR=/data/chroma
8
 
9
  # System deps + Microsoft key via keyring (for msodbcsql17)
10
  RUN set -eux; \
 
13
  curl ca-certificates gnupg2 apt-transport-https \
14
  unixodbc unixodbc-dev \
15
  ffmpeg \
16
+ # ---- Added for pdf2image + pytesseract: poppler-utils (provides pdftoppm / pdftocairo), tesseract-ocr (OCR engine), tesseract-ocr-eng (English language data, kept explicit). Comments must be whole lines: a trailing "\ # ..." ends the line continuation and breaks the RUN ----
17
+ poppler-utils \
18
+ tesseract-ocr \
19
+ tesseract-ocr-eng \
20
  ; \
21
  mkdir -p /etc/apt/keyrings; \
22
  curl -fsSL https://packages.microsoft.com/keys/microsoft.asc \
 
25
  > /etc/apt/sources.list.d/mssql-release.list; \
26
  apt-get update; \
27
  ACCEPT_EULA=Y apt-get install -y msodbcsql17; \
28
+ # Create a writable place for Chroma persistence
29
+ mkdir -p /data/chroma; \
30
  rm -rf /var/lib/apt/lists/*
31
 
32
  WORKDIR /app
33
+
34
+ # (Optional but helpful) make sure pip is recent; --no-cache-dir keeps pip's download cache out of the image layer
35
+ RUN python -m pip install --no-cache-dir --upgrade pip
36
+
37
  COPY requirements.txt /app/
38
  RUN pip install --no-cache-dir -r requirements.txt
39
+
40
+ # Your code
41
  COPY . /app
42
 
43
  EXPOSE 7860
44
+
45
+ # Default command (exec form): gunicorn serving verification:app on 0.0.0.0:7860 (the EXPOSEd port) with 2 workers x 4 threads and a 120 s timeout
46
  CMD ["gunicorn","--workers","2","--threads","4","--timeout","120","-b","0.0.0.0:7860","verification:app"]
requirements.txt CHANGED
@@ -23,6 +23,9 @@ langchain-text-splitters==0.2.2
23
  sentence-transformers==2.2.2
24
  pypdf>=4
25
  tiktoken
 
 
 
26
 
27
 
28
 
 
23
  sentence-transformers==2.2.2
24
  pypdf>=4
25
  tiktoken
26
+ pdf2image==1.17.0
27
+ pytesseract==0.3.10
28
+ Pillow==10.4.0
29
 
30
 
31