Spaces:

ahmadsalahudin
/

Business_card_scanner_to_cev

Runtime error

App Files Files Community

ahmadsalahudin committed on Jan 28

Commit

7b67189

verified ·

1 Parent(s): aa23d05

Create app.py

Browse files

Files changed (1) hide show

app.py +190 -0

app.py ADDED Viewed

	@@ -0,0 +1,190 @@

+import streamlit as st
+import easyocr
+import pandas as pd
+from io import BytesIO
+from PIL import Image
+import numpy as np
+import os
+from pathlib import Path
+from gliner import GLiNER
+# Set environment variables for model storage
+os.environ['GLINER_HOME'] = str(Path.home() / '.gliner_models')
+os.environ['TRANSFORMERS_CACHE'] = str(Path.home() / '.gliner_models' / 'cache')
+# Initialize EasyOCR reader
+reader = easyocr.Reader(['en'])
+def get_model_path():
+    """Get the path to the local model directory."""
+    base_dir = Path.home() / '.gliner_models'
+    model_dir = base_dir / 'gliner_large-v2.1'
+    return model_dir
+def download_model():
+    """Download the model if it doesn't exist locally."""
+    model_dir = get_model_path()
+    if not model_dir.exists():
+        st.info("Downloading GLiNER model for the first time... This may take a few minutes.")
+        try:
+            model_dir.parent.mkdir(parents=True, exist_ok=True)
+            temp_model = GLiNER.from_pretrained("urchade/gliner_large-v2.1")
+            temp_model.save_pretrained(str(model_dir))
+            st.success("Model downloaded successfully!")
+            return temp_model
+        except Exception as e:
+            st.error(f"Error downloading model: {str(e)}")
+            raise e
+    return None
+@st.cache_resource
+def load_gliner_model():
+    """Load the GLiNER model, downloading it if necessary."""
+    model_dir = get_model_path()
+    if model_dir.exists():
+        try:
+            return GLiNER.from_pretrained(str(model_dir))
+        except Exception as e:
+            st.warning("Error loading existing model. Attempting to redownload...")
+            import shutil
+            shutil.rmtree(model_dir, ignore_errors=True)
+    model = download_model()
+    if model:
+        return model
+    return GLiNER.from_pretrained(str(model_dir))
+def extract_text_from_image(image):
+    """Extracts text from a single image using EasyOCR."""
+    image_array = np.array(image)
+    return reader.readtext(image_array, detail=0, paragraph=True)
+def process_entities(text: str, model, threshold: float, nested_ner: bool) -> dict:
+    """Process text with GLiNER model - matching app.py implementation."""
+    # Define our business card labels
+    labels = "person name, company name, job title, phone, email, address"
+    labels = [label.strip() for label in labels.split(",")]
+    # Get predictions
+    entities = model.predict_entities(
+        text,
+        labels,
+        flat_ner=not nested_ner,
+        threshold=threshold
+    )
+    # Format results matching app.py structure
+    formatted_entities = []
+    for entity in entities:
+        formatted_entities.append({
+            "entity": entity["label"],
+            "word": entity["text"],
+            "start": entity["start"],
+            "end": entity["end"]
+        })
+    # Organize results by category
+    results = {
+        "Person Name": [],
+        "Company Name": [],
+        "Job Title": [],
+        "Phone": [],
+        "Email": [],
+        "Address": []
+    }
+    for entity in formatted_entities:
+        category = entity["entity"].title()
+        if category in results:
+            results[category].append(entity["word"])
+    # Join multiple entries with semicolons
+    return {k: "; ".join(set(v)) if v else "" for k, v in results.items()}
+def main():
+    st.title("Business Card Information Extractor")
+    # Model settings in sidebar
+    st.sidebar.title("Settings")
+    threshold = st.sidebar.slider(
+        "Detection Threshold",
+        min_value=0.0,
+        max_value=1.0,
+        value=0.3,
+        step=0.05,
+        help="Lower values will detect more entities (as in app.py example)"
+    )
+    nested_ner = st.sidebar.checkbox(
+        "Enable Nested NER",
+        value=True,
+        help="Allow detection of nested entities"
+    )
+    # Upload options
+    upload_type = st.sidebar.radio("Upload Type", ("Single", "Batch"))
+    # File uploader
+    uploaded_files = st.file_uploader(
+        "Upload Business Card Image(s)",
+        type=["png", "jpg", "jpeg"],
+        accept_multiple_files=(upload_type == "Batch")
+    )
+    if uploaded_files:
+        # Load model
+        model = load_gliner_model()
+        # Process files
+        results = []
+        files_to_process = uploaded_files if isinstance(uploaded_files, list) else [uploaded_files]
+        progress_bar = st.progress(0)
+        for idx, file in enumerate(files_to_process):
+            with st.expander(f"Processing {file.name}"):
+                # Load and extract text
+                image = Image.open(file)
+                extracted_text = extract_text_from_image(image)
+                clean_text = " ".join(extracted_text)
+                # Show extracted text
+                st.text("Extracted Text:")
+                st.text(clean_text)
+                # Process with GLiNER
+                result = process_entities(clean_text, model, threshold, nested_ner)
+                result["File Name"] = file.name
+                results.append(result)
+                # Show individual results
+                st.json(result)
+            progress_bar.progress((idx + 1) / len(files_to_process))
+        # Show final results
+        if results:
+            st.success("Processing Complete!")
+            # Convert to DataFrame
+            df = pd.DataFrame(results)
+            # Reorder columns to put filename first
+            cols = ["File Name"] + [col for col in df.columns if col != "File Name"]
+            df = df[cols]
+            # Display results
+            st.dataframe(df, use_container_width=True)
+            # Provide download option
+            csv = df.to_csv(index=False)
+            st.download_button(
+                "Download Results CSV",
+                csv,
+                "business_card_results.csv",
+                "text/csv",
+                key='download-csv'
+            )
+# Start the app when this file is run as a script rather than imported.
+if __name__ == "__main__":
+    main()