Spaces:
Running
on
Zero
Running
on
Zero
Commit
·
5d256ae
1
Parent(s):
34a5af9
Enhance is_arabic_text function: refine detection to focus on headers and paragraphs, excluding lists and code blocks
Browse files
app.py
CHANGED
|
@@ -5,6 +5,7 @@ import os
|
|
| 5 |
import traceback
|
| 6 |
from io import BytesIO
|
| 7 |
from typing import Any, Dict, List, Optional, Tuple
|
|
|
|
| 8 |
|
| 9 |
import fitz # PyMuPDF
|
| 10 |
import gradio as gr
|
|
@@ -186,15 +187,43 @@ def draw_layout_on_image(image: Image.Image, layout_data: List[Dict]) -> Image.I
|
|
| 186 |
|
| 187 |
|
| 188 |
def is_arabic_text(text: str) -> bool:
|
| 189 |
-
"""Check if text contains mostly Arabic characters"""
|
| 190 |
if not text:
|
| 191 |
return False
|
| 192 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 193 |
# Arabic Unicode ranges
|
| 194 |
arabic_chars = 0
|
| 195 |
total_chars = 0
|
| 196 |
|
| 197 |
-
for char in text:
|
| 198 |
if char.isalpha():
|
| 199 |
total_chars += 1
|
| 200 |
# Arabic script ranges
|
|
|
|
| 5 |
import traceback
|
| 6 |
from io import BytesIO
|
| 7 |
from typing import Any, Dict, List, Optional, Tuple
|
| 8 |
+
import re
|
| 9 |
|
| 10 |
import fitz # PyMuPDF
|
| 11 |
import gradio as gr
|
|
|
|
| 187 |
|
| 188 |
|
| 189 |
def is_arabic_text(text: str) -> bool:
|
| 190 |
+
"""Check if text in headers and paragraphs contains mostly Arabic characters"""
|
| 191 |
if not text:
|
| 192 |
return False
|
| 193 |
|
| 194 |
+
# Extract text from headers and paragraphs only
|
| 195 |
+
# Match markdown headers (# ## ###) and regular paragraph text
|
| 196 |
+
header_pattern = r'^#{1,6}\s+(.+)$'
|
| 197 |
+
paragraph_pattern = r'^(?!#{1,6}\s|!\[|```|\||\s*[-*+]\s|\s*\d+\.\s)(.+)$'
|
| 198 |
+
|
| 199 |
+
content_text = []
|
| 200 |
+
|
| 201 |
+
for line in text.split('\n'):
|
| 202 |
+
line = line.strip()
|
| 203 |
+
if not line:
|
| 204 |
+
continue
|
| 205 |
+
|
| 206 |
+
# Check for headers
|
| 207 |
+
header_match = re.match(header_pattern, line, re.MULTILINE)
|
| 208 |
+
if header_match:
|
| 209 |
+
content_text.append(header_match.group(1))
|
| 210 |
+
continue
|
| 211 |
+
|
| 212 |
+
# Check for paragraph text (exclude lists, tables, code blocks, images)
|
| 213 |
+
if re.match(paragraph_pattern, line, re.MULTILINE):
|
| 214 |
+
content_text.append(line)
|
| 215 |
+
|
| 216 |
+
if not content_text:
|
| 217 |
+
return False
|
| 218 |
+
|
| 219 |
+
# Join all content text and check for Arabic characters
|
| 220 |
+
combined_text = ' '.join(content_text)
|
| 221 |
+
|
| 222 |
# Arabic Unicode ranges
|
| 223 |
arabic_chars = 0
|
| 224 |
total_chars = 0
|
| 225 |
|
| 226 |
+
for char in combined_text:
|
| 227 |
if char.isalpha():
|
| 228 |
total_chars += 1
|
| 229 |
# Arabic script ranges
|