Spaces:
Running
Running
Oviya
committed on
Commit
·
f4c8c9a
1
Parent(s):
b8af084
fix
Browse files- Dockerfile +20 -1
- requirements.txt +3 -0
Dockerfile
CHANGED
|
@@ -1,5 +1,10 @@
|
|
| 1 |
FROM python:3.11-slim
|
| 2 |
-
ENV DEBIAN_FRONTEND=noninteractive
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
|
| 4 |
# System deps + Microsoft key via keyring (for msodbcsql17)
|
| 5 |
RUN set -eux; \
|
|
@@ -8,6 +13,10 @@ RUN set -eux; \
|
|
| 8 |
curl ca-certificates gnupg2 apt-transport-https \
|
| 9 |
unixodbc unixodbc-dev \
|
| 10 |
ffmpeg \
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
; \
|
| 12 |
mkdir -p /etc/apt/keyrings; \
|
| 13 |
curl -fsSL https://packages.microsoft.com/keys/microsoft.asc \
|
|
@@ -16,12 +25,22 @@ RUN set -eux; \
|
|
| 16 |
> /etc/apt/sources.list.d/mssql-release.list; \
|
| 17 |
apt-get update; \
|
| 18 |
ACCEPT_EULA=Y apt-get install -y msodbcsql17; \
|
|
|
|
|
|
|
| 19 |
rm -rf /var/lib/apt/lists/*
|
| 20 |
|
| 21 |
WORKDIR /app
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
COPY requirements.txt /app/
|
| 23 |
RUN pip install --no-cache-dir -r requirements.txt
|
|
|
|
|
|
|
| 24 |
COPY . /app
|
| 25 |
|
| 26 |
EXPOSE 7860
|
|
|
|
|
|
|
| 27 |
CMD ["gunicorn","--workers","2","--threads","4","--timeout","120","-b","0.0.0.0:7860","verification:app"]
|
|
|
|
| 1 |
FROM python:3.11-slim
|
| 2 |
+
ENV DEBIAN_FRONTEND=noninteractive \
|
| 3 |
+
PYTHONUNBUFFERED=1 \
|
| 4 |
+
# Tesseract path for pytesseract (your code reads this)
|
| 5 |
+
TESSERACT_CMD=/usr/bin/tesseract \
|
| 6 |
+
# Default Chroma persistence (your code also reads CHROMA_DIR)
|
| 7 |
+
CHROMA_DIR=/data/chroma
|
| 8 |
|
| 9 |
# System deps + Microsoft key via keyring (for msodbcsql17)
|
| 10 |
RUN set -eux; \
|
|
|
|
| 13 |
curl ca-certificates gnupg2 apt-transport-https \
|
| 14 |
unixodbc unixodbc-dev \
|
| 15 |
ffmpeg \
|
| 16 |
+
# ---- Added for pdf2image + pytesseract ----
#   poppler-utils     : provides pdftoppm / pdftocairo
#   tesseract-ocr     : OCR engine
#   tesseract-ocr-eng : English language data (pulled with tesseract on many distros, kept explicit)
# NOTE: keep comments on their own lines. Text after a trailing "\" stops the
# backslash from acting as a line continuation and breaks this RUN instruction.
|
| 17 |
+
poppler-utils \
|
| 18 |
+
tesseract-ocr \
|
| 19 |
+
tesseract-ocr-eng \
|
| 20 |
; \
|
| 21 |
mkdir -p /etc/apt/keyrings; \
|
| 22 |
curl -fsSL https://packages.microsoft.com/keys/microsoft.asc \
|
|
|
|
| 25 |
> /etc/apt/sources.list.d/mssql-release.list; \
|
| 26 |
apt-get update; \
|
| 27 |
ACCEPT_EULA=Y apt-get install -y msodbcsql17; \
|
| 28 |
+
# Create a writable place for Chroma persistence
|
| 29 |
+
mkdir -p /data/chroma; \
|
| 30 |
rm -rf /var/lib/apt/lists/*
|
| 31 |
|
| 32 |
WORKDIR /app
|
| 33 |
+
|
| 34 |
+
# (Optional but helpful) make sure pip is recent before installing requirements.
# --no-cache-dir keeps pip's download cache out of this image layer, matching
# the flag already used for the requirements.txt install.
|
| 35 |
+
RUN python -m pip install --no-cache-dir --upgrade pip
|
| 36 |
+
|
| 37 |
COPY requirements.txt /app/
|
| 38 |
RUN pip install --no-cache-dir -r requirements.txt
|
| 39 |
+
|
| 40 |
+
# Your code
|
| 41 |
COPY . /app
|
| 42 |
|
| 43 |
EXPOSE 7860
|
| 44 |
+
|
| 45 |
+
# Gunicorn entrypoint stays the same
|
| 46 |
CMD ["gunicorn","--workers","2","--threads","4","--timeout","120","-b","0.0.0.0:7860","verification:app"]
|
requirements.txt
CHANGED
|
@@ -23,6 +23,9 @@ langchain-text-splitters==0.2.2
|
|
| 23 |
sentence-transformers==2.2.2
|
| 24 |
pypdf>=4
|
| 25 |
tiktoken
|
|
|
|
|
|
|
|
|
|
| 26 |
|
| 27 |
|
| 28 |
|
|
|
|
| 23 |
sentence-transformers==2.2.2
|
| 24 |
pypdf>=4
|
| 25 |
tiktoken
|
| 26 |
+
pdf2image==1.17.0
|
| 27 |
+
pytesseract==0.3.10
|
| 28 |
+
Pillow==10.4.0
|
| 29 |
|
| 30 |
|
| 31 |
|