Spaces:

BookingCare
/

ner-annotation

Running

App Files Community

nam pham committed on May 29

Commit

a33a001

1 Parent(s): 9faf7cc

feat: improve ui/ux

Browse files

Files changed (21) hide show

README.md +82 -0
app.py +403 -907
data/annotated_data.json +0 -0
pyproject.toml +22 -8
src/ner_annotation/__init__.py +3 -0
src/ner_annotation/__main__.py +6 -0
src/ner_annotation/__pycache__/__init__.cpython-310.pyc +0 -0
src/ner_annotation/core/__init__.py +6 -0
src/ner_annotation/core/__pycache__/__init__.cpython-310.pyc +0 -0
src/ner_annotation/core/__pycache__/annotator.cpython-310.pyc +0 -0
src/ner_annotation/core/__pycache__/dataset.cpython-310.pyc +0 -0
src/ner_annotation/core/annotator.py +192 -0
src/ner_annotation/core/dataset.py +162 -0
src/ner_annotation/utils/__init__.py +31 -0
src/ner_annotation/utils/__pycache__/__init__.cpython-310.pyc +0 -0
src/ner_annotation/utils/__pycache__/file_processing.cpython-310.pyc +0 -0
src/ner_annotation/utils/__pycache__/huggingface.cpython-310.pyc +0 -0
src/ner_annotation/utils/__pycache__/text_processing.cpython-310.pyc +0 -0
src/ner_annotation/utils/file_processing.py +215 -0
src/ner_annotation/utils/huggingface.py +137 -0
src/ner_annotation/utils/text_processing.py +124 -0

README.md CHANGED Viewed

@@ -11,3 +11,85 @@ short_description: the ui for annotation ner for healthcare
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+# NER Annotation Tool
+A powerful tool for annotating text with named entities using GLiNER models. This tool provides both automatic annotation using pre-trained models and a manual annotation interface for reviewing and correcting the results.
+## Features
+- Automatic NER annotation using GLiNER models
+- Support for multiple pre-trained models
+- Interactive dataset viewer and editor
+- Export/import functionality for annotated data
+- Integration with Hugging Face Hub for dataset sharing
+- Support for various file formats (JSON, CoNLL, TXT)
+## Installation
+1. Clone the repository:
+```bash
+git clone https://github.com/yourusername/ner-annotation.git
+cd ner-annotation
+```
+2. Create and activate a virtual environment:
+```bash
+python -m venv .venv
+source .venv/bin/activate  # On Windows: .venv\Scripts\activate
+```
+3. Install the package:
+```bash
+pip install -e .
+```
+## Usage
+1. Start the application:
+```bash
+python -m ner_annotation
+```
+2. The application will open in your default web browser with two main tabs:
+   - **Auto Annotation**: Upload text files and automatically annotate them using GLiNER models
+   - **Dataset Viewer**: Review, edit, and validate annotated data
+### Auto Annotation
+1. Upload a text file (one sentence per line)
+2. Select a GLiNER model
+3. Enter the entity labels to detect (comma-separated)
+4. Adjust the confidence threshold
+5. Optionally add a prompt
+6. Click "Annotate Data"
+### Dataset Viewer
+1. Load a dataset (local or from Hugging Face)
+2. Navigate through examples using the slider or buttons
+3. Edit annotations as needed
+4. Validate examples
+5. Save the dataset locally or to Hugging Face Hub
+## Configuration
+Create a `.env` file in the project root with your Hugging Face token:
+```
+HUGGINGFACE_ACCESS_TOKEN=your_token_here
+```
+## Available Models
+- `BookingCare/gliner-multi-healthcare`
+- `knowledgator/gliner-multitask-large-v0.5`
+- `knowledgator/gliner-multitask-base-v0.5`
+## Contributing
+Contributions are welcome! Please feel free to submit a Pull Request.
+## License
+This project is licensed under the MIT License - see the LICENSE file for details.

app.py CHANGED Viewed

@@ -1,18 +1,19 @@
-import gradio as gr
-from huggingface_hub import HfApi, create_repo
 import os
-import re
 import json
-import torch
-import random
 from typing import List, Dict, Union, Tuple
-from gliner import GLiNER
-from datasets import load_dataset
-from dotenv import load_dotenv
-# Load environment variables from .env
-load_dotenv()
-HF_TOKEN = os.getenv("HUGGINGFACE_ACCESS_TOKEN")
 # Available models for annotation
 AVAILABLE_MODELS = [
@@ -21,132 +22,15 @@ AVAILABLE_MODELS = [
     "knowledgator/gliner-multitask-base-v0.5"
 ]
-# Dataset Viewer Classes and Functions
-class DynamicDataset:
-    def __init__(
-            self, data: List[Dict[str, Union[List[Union[int, str]], bool]]]
-                 ) -> None:
-        self.data = data
-        self.data_len = len(self.data)
-        self.current = -1
-        for example in self.data:
-            if not "validated" in example.keys():
-                example["validated"] = False
-    def next_example(self):
-        self.current += 1
-        if self.current > self.data_len-1:
-          self.current = self.data_len -1
-        elif self.current < 0:
-          self.current = 0
-    def previous_example(self):
-        self.current -= 1
-        if self.current > self.data_len-1:
-          self.current = self.data_len -1
-        elif self.current < 0:
-          self.current = 0
-    def example_by_id(self, id):
-        self.current = id
-        if self.current > self.data_len-1:
-          self.current = self.data_len -1
-        elif self.current < 0:
-          self.current = 0
-    def validate(self):
-        self.data[self.current]["validated"] = True
-    def load_current_example(self):
-        return self.data[self.current]
-def tokenize_text(text):
-    """Tokenize the input text into a list of tokens."""
-    return re.findall(r'\w+(?:[-_]\w+)*|\S', text)
-def join_tokens(tokens):
-    # Joining tokens with space, but handling special characters correctly
-    text = ""
-    for token in tokens:
-        if token in {",", ".", "!", "?", ":", ";", "..."}:
-            text = text.rstrip() + token
-        else:
-            text += " " + token
-    return text.strip()
-def prepare_for_highlight(data):
-    tokens = data["tokenized_text"]
-    ner = data["ner"]
-    highlighted_text = []
-    current_entity = None
-    entity_tokens = []
-    normal_tokens = []
-    for idx, token in enumerate(tokens):
-        # Check if the current token is the start of a new entity
-        if current_entity is None or idx > current_entity[1]:
-            if entity_tokens:
-                highlighted_text.append((" ".join(entity_tokens), current_entity[2]))
-                entity_tokens = []
-            current_entity = next((entity for entity in ner if entity[0] == idx), None)
-        # If current token is part of an entity
-        if current_entity and current_entity[0] <= idx <= current_entity[1]:
-            if normal_tokens:
-                highlighted_text.append((" ".join(normal_tokens), None))
-                normal_tokens = []
-            entity_tokens.append(token + " ")
-        else:
-            if entity_tokens:
-                highlighted_text.append((" ".join(entity_tokens), current_entity[2]))
-                entity_tokens = []
-            normal_tokens.append(token + " ")
-    # Append any remaining tokens
-    if entity_tokens:
-        highlighted_text.append((" ".join(entity_tokens), current_entity[2]))
-    if normal_tokens:
-        highlighted_text.append((" ".join(normal_tokens), None))
-    # Clean up spaces before punctuation
-    cleaned_highlighted_text = []
-    for text, label in highlighted_text:
-        cleaned_text = re.sub(r'\s(?=[,\.!?…:;])', '', text)
-        cleaned_highlighted_text.append((cleaned_text, label))
-    return cleaned_highlighted_text
-def extract_tokens_and_labels(data: List[Dict[str, Union[str, None]]]) -> Dict[str, Union[List[str], List[Tuple[int, int, str]]]]:
-    tokens = []
-    ner = []
-    token_start_idx = 0
-    for entry in data:
-        char = entry['token']
-        label = entry['class_or_confidence']
-        # Tokenize the current text chunk
-        token_list = tokenize_text(char)
-        # Append tokens to the main tokens list
-        tokens.extend(token_list)
-        if label:
-            token_end_idx = token_start_idx + len(token_list) - 1
-            ner.append((token_start_idx, token_end_idx, label))
-        token_start_idx += len(token_list)
-    return tokens, ner
-# Global variables for dataset viewer
 dynamic_dataset = None
 def load_dataset():
     global dynamic_dataset
     try:
-        print('load_dataset')
         with open("data/annotated_data.json", 'rt') as dataset:
             ANNOTATED_DATA = json.load(dataset)
         dynamic_dataset = DynamicDataset(ANNOTATED_DATA)
@@ -156,11 +40,12 @@ def load_dataset():
         return [("Error loading dataset: " + str(e), None)], gr.update(value=0, maximum=1)
 def example_by_id(id):
     global dynamic_dataset
     if dynamic_dataset is None:
         return [("Please load a dataset first", None)], gr.update(value=0, maximum=1)
     try:
-        id = int(id)  # Ensure id is an integer
         dynamic_dataset.example_by_id(id)
         current = dynamic_dataset.current
         max_value = len(dynamic_dataset.data) - 1
@@ -169,6 +54,7 @@ def example_by_id(id):
         return [("Error navigating to example: " + str(e), None)], gr.update(value=0, maximum=1)
 def next_example():
     global dynamic_dataset
     if dynamic_dataset is None:
         return [("Please load a dataset first", None)], gr.update(value=0, maximum=1)
@@ -181,6 +67,7 @@ def next_example():
         return [("Error navigating to next example: " + str(e), None)], gr.update(value=0, maximum=1)
 def previous_example():
     global dynamic_dataset
     if dynamic_dataset is None:
         return [("Please load a dataset first", None)], gr.update(value=0, maximum=1)
@@ -193,6 +80,7 @@ def previous_example():
         return [("Error navigating to previous example: " + str(e), None)], gr.update(value=0, maximum=1)
 def update_example(data):
     global dynamic_dataset
     if dynamic_dataset is None:
         return [("Please load a dataset first", None)]
@@ -202,6 +90,7 @@ def update_example(data):
     return prepare_for_highlight(dynamic_dataset.load_current_example())
 def validate_example():
     global dynamic_dataset
     if dynamic_dataset is None:
         return [("Please load a dataset first", None)]
@@ -209,6 +98,7 @@ def validate_example():
     return [("The example was validated!", None)]
 def save_dataset(inp):
     global dynamic_dataset
     if dynamic_dataset is None:
         return [("Please load a dataset first", None)]
@@ -216,831 +106,437 @@ def save_dataset(inp):
         json.dump(dynamic_dataset.data, file)
     return [("The validated dataset was saved as data/annotated_data.json", None)]
-# Original annotation functions
-def transform_data(data):
-    tokens = tokenize_text(data['text'])
-    spans = []
-    for entity in data['entities']:
-        entity_tokens = tokenize_text(entity['word'])
-        entity_length = len(entity_tokens)
-        # Find the start and end indices of each entity in the tokenized text
-        for i in range(len(tokens) - entity_length + 1):
-            if tokens[i:i + entity_length] == entity_tokens:
-                spans.append([i, i + entity_length - 1, entity['entity']])
-                break
-    return {"tokenized_text": tokens, "ner": spans, "validated": False}
-def merge_entities(entities):
-    if not entities:
-        return []
-    merged = []
-    current = entities[0]
-    for next_entity in entities[1:]:
-        if next_entity['entity'] == current['entity'] and (next_entity['start'] == current['end'] + 1 or next_entity['start'] == current['end']):
-            current['word'] += ' ' + next_entity['word']
-            current['end'] = next_entity['end']
-        else:
-            merged.append(current)
-            current = next_entity
-    merged.append(current)
-    return merged
-def annotate_text(
-    model, text, labels: List[str], threshold: float, nested_ner: bool
-) -> Dict:
-    labels = [label.strip() for label in labels]
-    r = {
-        "text": text,
-        "entities": [
-            {
-                "entity": entity["label"],
-                "word": entity["text"],
-                "start": entity["start"],
-                "end": entity["end"],
-                "score": 0,
-            }
-            for entity in model.predict_entities(
-                text, labels, flat_ner=not nested_ner, threshold=threshold
-            )
-        ],
-    }
-    r["entities"] = merge_entities(r["entities"])
-    return transform_data(r)
-def batch_annotate_text(model: GLiNER, texts: List[str], labels: List[str], threshold: float, nested_ner: bool) -> List[Dict]:
-    """Annotate multiple texts in batch"""
-    labels = [label.strip() for label in labels]
-    batch_entities = model.batch_predict_entities(texts, labels, flat_ner=not nested_ner, threshold=threshold)
-    results = []
-    for text, entities in zip(texts, batch_entities):
-        r = {
-            "text": text,
-            "entities": [
-                {
-                    "entity": entity["label"],
-                    "word": entity["text"],
-                    "start": entity["start"],
-                    "end": entity["end"],
-                    "score": 0,
-                }
-                for entity in entities
-            ],
-        }
-        r["entities"] = merge_entities(r["entities"])
-        results.append(transform_data(r))
-    return results
-class AutoAnnotator:
-    def __init__(
-        self, model: str = "BookingCare/gliner-multi-healthcare",
-        # device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
-        device = torch.device('cpu')
-        ) -> None:
-        # Set PyTorch memory management settings
-        if torch.cuda.is_available():
-            torch.cuda.empty_cache()
-            os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
-        self.model = GLiNER.from_pretrained(model).to(device)
-        self.annotated_data = []
-        self.stat = {
-            "total": None,
-            "current": -1
-        }
-    def auto_annotate(
-            self, data: List[str], labels: List[str],
-            prompt: Union[str, List[str]] = None, threshold: float = 0.5, nested_ner: bool = False
-            ) -> List[Dict]:
-        self.stat["total"] = len(data)
-        self.stat["current"] = -1  # Reset current progress
-        # Process texts in batches
-        processed_data = []
-        batch_size = 8  # Reduced batch size to prevent OOM errors
-        for i in range(0, len(data), batch_size):
-            batch_texts = data[i:i + batch_size]
-            batch_with_prompts = []
-            # Add prompts to batch texts
-            for text in batch_texts:
-                if isinstance(prompt, list):
-                    prompt_text = random.choice(prompt)
-                else:
-                    prompt_text = prompt
-                text_with_prompt = f"{prompt_text}\n{text}" if prompt_text else text
-                batch_with_prompts.append(text_with_prompt)
-            # Process batch
-            batch_results = batch_annotate_text(self.model, batch_with_prompts, labels, threshold, nested_ner)
-            processed_data.extend(batch_results)
-            # Clear CUDA cache after each batch
-            if torch.cuda.is_available():
-                torch.cuda.empty_cache()
-            # Update progress
-            self.stat["current"] = min(i + batch_size, len(data))
-        self.annotated_data = processed_data
-        return self.annotated_data
-# Global variables
-annotator = None
-sentences = []
-def process_text_for_gliner(text: str, max_tokens: int = 256, overlap: int = 32) -> List[str]:
-    """
-    Process text for GLiNER by splitting long texts into overlapping chunks.
-    Preserves sentence boundaries and context when possible.
-    Args:
-        text: The input text to process
-        max_tokens: Maximum number of tokens per chunk
-        overlap: Number of tokens to overlap between chunks
-    Returns:
-        List of text chunks suitable for GLiNER
-    """
-    # First split into sentences to preserve natural boundaries
-    sentences = re.split(r'(?<=[.!?])\s+', text)
-    chunks = []
-    current_chunk = []
-    current_length = 0
-    for sentence in sentences:
-        # Tokenize the sentence
-        sentence_tokens = tokenize_text(sentence)
-        sentence_length = len(sentence_tokens)
-        # If a single sentence is too long, split it
-        if sentence_length > max_tokens:
-            # If we have accumulated tokens, add them as a chunk
-            if current_chunk:
-                chunks.append(" ".join(current_chunk))
-                current_chunk = []
-                current_length = 0
-            # Split the long sentence into smaller chunks
-            start = 0
-            while start < sentence_length:
-                end = min(start + max_tokens, sentence_length)
-                chunk_tokens = sentence_tokens[start:end]
-                chunks.append(" ".join(chunk_tokens))
-                start = end - overlap if end < sentence_length else end
-        # If adding this sentence would exceed max_tokens, start a new chunk
-        elif current_length + sentence_length > max_tokens:
-            chunks.append(" ".join(current_chunk))
-            current_chunk = sentence_tokens
-            current_length = sentence_length
-        else:
-            current_chunk.extend(sentence_tokens)
-            current_length += sentence_length
-    # Add any remaining tokens as the final chunk
-    if current_chunk:
-        chunks.append(" ".join(current_chunk))
-    return chunks
-def process_uploaded_file(file_obj):
-    if file_obj is None:
-        return "Please upload a file first!"
-    try:
-        # Read the uploaded file
-        global sentences
-        if file_obj.name.endswith('.csv'):
-            import pandas as pd
-            df = pd.read_csv(file_obj.name)
-            sentences = df['Nội dung'].dropna().tolist()
-            # Process each sentence and flatten the list
-            processed_sentences = []
-            for sentence in sentences:
-                processed_sentences.extend(process_text_for_gliner(sentence))
-            sentences = processed_sentences
-        else:
-            # Read the file content directly from the file object
-            content = file_obj.read().decode('utf-8')
-            raw_sentences = [line.strip() for line in content.splitlines() if line.strip()]
-            # Process each sentence and flatten the list
-            processed_sentences = []
-            for sentence in raw_sentences:
-                processed_sentences.extend(process_text_for_gliner(sentence))
-            sentences = processed_sentences
-        return f"Successfully loaded {len(sentences)} sentences from file!"
-    except Exception as e:
-        return f"Error reading file: {str(e)}"
-def is_valid_repo_name(repo_name):
-    # Hugging Face repo names must not contain slashes or spaces
-    return bool(re.match(r'^[A-Za-z0-9_./-]+$', repo_name))
-def create_hf_repo(repo_name: str, repo_type: str = "dataset", private: bool = False):
-    """Create a new repository on Hugging Face Hub"""
-    if not is_valid_repo_name(repo_name):
-        raise Exception("Invalid repo name: must not contain slashes, spaces, or special characters except '-', '_', '.'")
-    try:
-        api = HfApi(token=HF_TOKEN)
-        # user = api.whoami()['name']
-        # repo_id = f"{user}/{repo_name}"
-        create_repo(
-            repo_id=repo_name,
-            repo_type=repo_type,
-            private=private,
-            exist_ok=True,
-            token=HF_TOKEN
-        )
-        return repo_name
-    except Exception as e:
-        raise Exception(f"Error creating repository: {str(e)}")
 def annotate(model, labels, threshold, prompt, save_to_hub, repo_name, repo_type, is_private):
-    global annotator
     try:
         if not sentences:
             return "Please upload a file with text first!"
         if save_to_hub and not is_valid_repo_name(repo_name):
             return "Error: Invalid repo name. Only use letters, numbers, '-', '_', or '.' (no slashes or spaces)."
         labels = [label.strip() for label in labels.split(",")]
         annotator = AutoAnnotator(model)
         annotated_data = annotator.auto_annotate(sentences, labels, prompt, threshold)
         # Save annotated data locally
         os.makedirs("data", exist_ok=True)
         local_path = "data/annotated_data.json"
         with open(local_path, "wt") as file:
             json.dump(annotated_data, file, ensure_ascii=False)
         status_messages = [f"Successfully annotated and saved locally to {local_path}"]
         # Upload to Hugging Face Hub if requested
         if save_to_hub:
             try:
-                repo_id = create_hf_repo(repo_name, repo_type, is_private)
-                api = HfApi(token=HF_TOKEN)
-                api.upload_file(
-                    path_or_fileobj=local_path,
-                    path_in_repo="annotated_data.json",
-                    repo_id=repo_id,
-                    repo_type=repo_type,
-                    token=HF_TOKEN
-                )
                 status_messages.append(f"Successfully uploaded to Hugging Face Hub repository: {repo_id}")
             except Exception as e:
                 status_messages.append(f"Error with Hugging Face Hub: {str(e)}")
         return "\n".join(status_messages)
     except Exception as e:
         return f"Error during annotation: {str(e)}"
-def convert_hf_dataset_to_ner_format(dataset):
-    """Convert Hugging Face dataset to NER format"""
-    converted_data = []
-    for item in dataset:
-        # Assuming the dataset has 'tokens' and 'ner_tags' fields
-        # Adjust the field names based on your dataset structure
-        if 'tokens' in item and 'ner_tags' in item:
-            ner_spans = []
-            current_span = None
-            for i, (token, tag) in enumerate(zip(item['tokens'], item['ner_tags'])):
-                if tag != 'O':  # Not Outside
-                    if current_span is None:
-                        current_span = [i, i, tag]
-                    elif tag == current_span[2]:
-                        current_span[1] = i
-                    else:
-                        ner_spans.append(current_span)
-                        current_span = [i, i, tag]
-                elif current_span is not None:
-                    ner_spans.append(current_span)
-                    current_span = None
-            if current_span is not None:
-                ner_spans.append(current_span)
-            converted_data.append({
-                "tokenized_text": item['tokens'],
-                "ner": ner_spans,
-                "validated": False
-            })
-    return converted_data
-def load_from_huggingface(dataset_name: str):
-    """Load dataset from Hugging Face Hub"""
     try:
-        # Download the JSON file from Hugging Face
-        import requests
-        import json
-        # Construct the raw URL for the JSON file
-        raw_url = f"https://huggingface.co/datasets/{dataset_name}/raw/main/annotated_data.json"
-        # Download the file
-        response = requests.get(raw_url)
-        if response.status_code == 200:
-            print('response status', response.status_code)
-            print('response', response.text)
-            dataset = json.loads(response.text)
-            converted_data = dataset  # Data is already in the correct format
-        else:
-            raise Exception(f"Failed to download dataset: {response.status_code}")
-        # Save the converted data
-        os.makedirs("data", exist_ok=True)
-        with open("data/annotated_data.json", "wt") as file:
-            json.dump(converted_data, file, ensure_ascii=False)
-        return f"Successfully loaded and converted dataset: {dataset_name}"
     except Exception as e:
-        error_msg = f"Error loading dataset: {str(e)}"
-        return error_msg
-def load_from_local_file(file_path: str, file_format: str = "json"):
-    """Load and convert data from local file in various formats"""
-    try:
-        if file_format == "json":
-            with open(file_path, 'r', encoding='utf-8') as f:
-                data = json.load(f)
-                if isinstance(data, list):
-                    # If data is already in the correct format
-                    if all("tokenized_text" in item and "ner" in item for item in data):
-                        return data
-                    # Convert from other JSON formats
-                    converted_data = []
-                    for item in data:
-                        if "tokens" in item and "ner_tags" in item:
-                            ner_spans = []
-                            current_span = None
-                            for i, (token, tag) in enumerate(zip(item["tokens"], item["ner_tags"])):
-                                if tag != "O":
-                                    if current_span is None:
-                                        current_span = [i, i, tag]
-                                    elif tag == current_span[2]:
-                                        current_span[1] = i
-                                    else:
-                                        ner_spans.append(current_span)
-                                        current_span = [i, i, tag]
-                                elif current_span is not None:
-                                    ner_spans.append(current_span)
-                                    current_span = None
-                            if current_span is not None:
-                                ner_spans.append(current_span)
-                            converted_data.append({
-                                "tokenized_text": item["tokens"],
-                                "ner": ner_spans,
-                                "validated": False
-                            })
-                    return converted_data
-                else:
-                    raise ValueError("JSON file must contain a list of examples")
-        elif file_format == "conll":
-            converted_data = []
-            current_example = {"tokens": [], "ner_tags": []}
-            with open(file_path, 'r', encoding='utf-8') as f:
-                for line in f:
-                    line = line.strip()
-                    if line:
-                        if line.startswith("#"):
-                            continue
-                        parts = line.split()
-                        if len(parts) >= 2:
-                            token, tag = parts[0], parts[-1]
-                            current_example["tokens"].append(token)
-                            current_example["ner_tags"].append(tag)
-                    elif current_example["tokens"]:
-                        # Convert current example
-                        ner_spans = []
-                        current_span = None
-                        for i, (token, tag) in enumerate(zip(current_example["tokens"], current_example["ner_tags"])):
-                            if tag != "O":
-                                if current_span is None:
-                                    current_span = [i, i, tag]
-                                elif tag == current_span[2]:
-                                    current_span[1] = i
-                                else:
-                                    ner_spans.append(current_span)
-                                    current_span = [i, i, tag]
-                            elif current_span is not None:
-                                ner_spans.append(current_span)
-                                current_span = None
-                        if current_span is not None:
-                            ner_spans.append(current_span)
-                        converted_data.append({
-                            "tokenized_text": current_example["tokens"],
-                            "ner": ner_spans,
-                            "validated": False
-                        })
-                        current_example = {"tokens": [], "ner_tags": []}
-                # Handle last example if exists
-                if current_example["tokens"]:
-                    ner_spans = []
-                    current_span = None
-                    for i, (token, tag) in enumerate(zip(current_example["tokens"], current_example["ner_tags"])):
-                        if tag != "O":
-                            if current_span is None:
-                                current_span = [i, i, tag]
-                            elif tag == current_span[2]:
-                                current_span[1] = i
-                            else:
-                                ner_spans.append(current_span)
-                                current_span = [i, i, tag]
-                        elif current_span is not None:
-                            ner_spans.append(current_span)
-                            current_span = None
-                    if current_span is not None:
-                        ner_spans.append(current_span)
-                    converted_data.append({
-                        "tokenized_text": current_example["tokens"],
-                        "ner": ner_spans,
-                        "validated": False
-                    })
-            return converted_data
-        elif file_format == "txt":
-            # Simple text file with one sentence per line
-            converted_data = []
-            with open(file_path, 'r', encoding='utf-8') as f:
-                for line in f:
-                    line = line.strip()
-                    if line:
-                        tokens = tokenize_text(line)
-                        converted_data.append({
-                            "tokenized_text": tokens,
-                            "ner": [],
-                            "validated": False
-                        })
-            return converted_data
-        else:
-            raise ValueError(f"Unsupported file format: {file_format}")
-    except Exception as e:
-        raise Exception(f"Error loading file: {str(e)}")
-def process_local_file(file_obj, file_format):
-    """Process uploaded local file"""
-    if file_obj is None:
-        return "Please upload a file first!"
     try:
-        # Load and convert the data
-        data = load_from_local_file(file_obj.name, file_format)
-        # Save the converted data
         os.makedirs("data", exist_ok=True)
-        with open("data/annotated_data.json", "wt") as file:
-            json.dump(data, file, ensure_ascii=False)
-        return f"Successfully loaded and converted {len(data)} examples from {file_format} file!"
     except Exception as e:
-        return f"Error processing file: {str(e)}"
-# Add a function to download the annotated data
-def download_annotated_data():
-    file_path = "data/annotated_data.json"
-    if os.path.exists(file_path):
-        return file_path
-    else:
-        return None
-def download_to_folder():
-    """Download annotated data to a local folder"""
-    try:
-        source_path = "data/annotated_data.json"
-        if not os.path.exists(source_path):
-            return "No annotated data found!"
-        # Create downloads directory if it doesn't exist
-        download_dir = os.path.expanduser("~/Downloads")
-        os.makedirs(download_dir, exist_ok=True)
-        # Copy file to downloads folder
-        import shutil
-        dest_path = os.path.join(download_dir, "annotated_data.json")
-        shutil.copy2(source_path, dest_path)
-        return f"Successfully downloaded to {dest_path}"
-    except Exception as e:
-        return f"Error downloading file: {str(e)}"
-def update_hf_dataset(repo_name: str, repo_type: str = "dataset", is_private: bool = False):
-    """Update or create a Hugging Face dataset with the current annotated data"""
     try:
-        if not dynamic_dataset or not dynamic_dataset.data:
-            return "No data to upload! Please load or annotate data first."
-        # Save current data to local file
         os.makedirs("data", exist_ok=True)
-        local_path = "data/annotated_data.json"
-        with open(local_path, "wt") as file:
-            json.dump(dynamic_dataset.data, file, ensure_ascii=False)
-        # Create or update repository
-        try:
-            repo_id = create_hf_repo(repo_name, repo_type, is_private)
-            api = HfApi(token=HF_TOKEN)
-            api.upload_file(
-                path_or_fileobj=local_path,
-                path_in_repo="annotated_data.json",
-                repo_id=repo_id,
-                repo_type=repo_type,
-                token=HF_TOKEN
-            )
-            return f"Successfully uploaded to Hugging Face Hub repository: {repo_id}"
-        except Exception as e:
-            if "already exists" in str(e):
-                # If repo exists, just update the file
-                user = api.whoami()['name']
-                repo_id = f"{user}/{repo_name}"
-                api.upload_file(
-                    path_or_fileobj=local_path,
-                    path_in_repo="annotated_data.json",
-                    repo_id=repo_id,
-                    repo_type=repo_type,
-                    token=HF_TOKEN
-                )
-                return f"Successfully updated existing repository: {repo_id}"
-            else:
-                raise e
     except Exception as e:
-        return f"Error updating Hugging Face dataset: {str(e)}"
-# Create the main interface with tabs
-with gr.Blocks() as demo:
-    gr.Markdown("# NER Annotation Tool")
-    with gr.Tabs():
-        with gr.TabItem("Auto Annotation"):
-            with gr.Row():
-                with gr.Column():
-                    file_uploader = gr.File(label="Upload text file (one sentence per line)")
-                    upload_status = gr.Textbox(label="Upload Status")
-                    file_uploader.change(fn=process_uploaded_file, inputs=[file_uploader], outputs=[upload_status])
-                with gr.Column():
-                    model = gr.Dropdown(
-                        label="Choose the model for annotation",
-                        choices=AVAILABLE_MODELS,
-                        value=AVAILABLE_MODELS[0]
-                    )
-                    labels = gr.Textbox(
-                        label="Labels",
-                        placeholder="Enter comma-separated labels (e.g., PERSON,ORG,LOC)",
-                        scale=2
-                    )
-                    threshold = gr.Slider(
-                        0, 1,
-                        value=0.3,
-                        step=0.01,
-                        label="Threshold",
-                        info="Lower threshold increases entity predictions"
-                    )
-                    prompt = gr.Textbox(
-                        label="Prompt",
-                        placeholder="Enter your annotation prompt (optional)",
-                        scale=2
-                    )
-                    with gr.Group():
-                        gr.Markdown("### Save Options")
-                        save_to_hub = gr.Checkbox(
-                            label="Save to Hugging Face Hub",
-                            value=False
                         )
-                        with gr.Group(visible=False) as hub_settings:
-                            gr.Markdown("#### Hugging Face Hub Settings")
-                            repo_name = gr.Textbox(
-                                label="Repository Name",
-                                placeholder="Enter repository name (e.g., my-ner-dataset)",
-                                scale=2
-                            )
-                            repo_type = gr.Dropdown(
-                                choices=["dataset", "model", "space"],
-                                value="dataset",
-                                label="Repository Type"
-                            )
-                            is_private = gr.Checkbox(
-                                label="Private Repository",
                                 value=False
                             )
-                    annotate_btn = gr.Button("Annotate Data")
-                    output_info = gr.Textbox(label="Processing Status")
-                    # Add download buttons for annotated data
-                    with gr.Row():
-                        download_btn_annot = gr.Button("Download Annotated Data", visible=False)
-                    download_file_annot = gr.File(label="Download", interactive=False, visible=False)
-                    download_status = gr.Textbox(label="Download Status", visible=False)
-                    def toggle_hub_settings(save_to_hub):
-                        return {
-                            hub_settings: gr.update(visible=save_to_hub)
-                        }
-                    save_to_hub.change(
-                        fn=toggle_hub_settings,
-                        inputs=[save_to_hub],
-                        outputs=[hub_settings]
-                    )
-                    def show_download_buttons(status):
-                        # Show download buttons only if annotation was successful
-                        if status and status.startswith("Successfully annotated and saved locally"):
-                            return gr.update(visible=True), gr.update(visible=True)
-                        return gr.update(visible=False), gr.update(visible=False)
-                    annotate_btn.click(
-                        fn=annotate,
-                        inputs=[
-                            model, labels, threshold, prompt,
-                            save_to_hub, repo_name, repo_type, is_private
-                        ],
-                        outputs=[output_info]
-                    )
-                    output_info.change(
-                        fn=show_download_buttons,
-                        inputs=[output_info],
-                        outputs=[download_btn_annot, download_status]
-                    )
-                    def handle_download_annot():
-                        file_path = download_annotated_data()
-                        if file_path:
-                            return gr.update(value=file_path, visible=True)
-                        else:
-                            return gr.update(visible=False)
-                    download_btn_annot.click(fn=handle_download_annot, inputs=None, outputs=[download_file_annot])
-        with gr.TabItem("Dataset Viewer"):
-            with gr.Row():
-                with gr.Column():
-                    with gr.Row():
-                        load_local_btn = gr.Button("Load Local Dataset")
-                        load_hf_btn = gr.Button("Load from Hugging Face")
-                    local_file = gr.File(label="Upload Local Dataset", visible=False)
-                    file_format = gr.Dropdown(
-                        choices=["json", "conll", "txt"],
-                        value="json",
-                        label="File Format",
-                        visible=False
-                    )
-                    local_status = gr.Textbox(label="Local File Status", visible=False)
-                    with gr.Group(visible=False) as hf_inputs:
                         with gr.Row():
-                            dataset_name = gr.Textbox(
-                                label="Hugging Face Dataset Name",
-                                placeholder="Enter dataset name (e.g., conll2003)",
-                                scale=3
                             )
-                            load_dataset_btn = gr.Button("Load Dataset", scale=1)
-                    bar = gr.Slider(
-                        minimum=0,
-                        maximum=1,
-                        step=1,
-                        label="Progress",
-                        interactive=True,
-                        info="Use slider to navigate through examples"
-                    )
-                    with gr.Row():
-                        previous_btn = gr.Button("Previous example")
-                        apply_btn = gr.Button("Apply changes")
-                        next_btn = gr.Button("Next example")
-                    validate_btn = gr.Button("Validate")
-                    save_btn = gr.Button("Save validated dataset")
-                    # Add Hugging Face upload section
-                    with gr.Group(visible=False) as hf_upload_group:
-                        gr.Markdown("### Upload to Hugging Face")
-                        hf_repo_name = gr.Textbox(
-                            label="Repository Name",
-                            placeholder="Enter repository name (e.g., my-ner-dataset)",
-                            scale=2
                         )
-                        hf_repo_type = gr.Dropdown(
-                            choices=["dataset", "model", "space"],
-                            value="dataset",
-                            label="Repository Type"
                         )
-                        hf_is_private = gr.Checkbox(
-                            label="Private Repository",
-                            value=False
                         )
-                        upload_to_hf_btn = gr.Button("Upload to Hugging Face")
-                        hf_upload_status = gr.Textbox(label="Upload Status")
-                    with gr.Row():
-                        show_hf_upload_btn = gr.Button("Show Upload Options")
-                        hide_hf_upload_btn = gr.Button("Hide Upload Options", visible=False)
-                    def toggle_hf_upload(show: bool):
-                        return {
-                            hf_upload_group: gr.update(visible=show),
-                            show_hf_upload_btn: gr.update(visible=not show),
-                            hide_hf_upload_btn: gr.update(visible=show)
-                        }
-                    show_hf_upload_btn.click(
-                        fn=lambda: toggle_hf_upload(True),
-                        inputs=None,
-                        outputs=[hf_upload_group, show_hf_upload_btn, hide_hf_upload_btn]
-                    )
-                    hide_hf_upload_btn.click(
-                        fn=lambda: toggle_hf_upload(False),
-                        inputs=None,
-                        outputs=[hf_upload_group, show_hf_upload_btn, hide_hf_upload_btn]
-                    )
-                    inp_box = gr.HighlightedText(value=None, interactive=True)
-                    def toggle_local_inputs():
-                        return {
-                            local_file: gr.update(visible=True),
-                            file_format: gr.update(visible=True),
-                            local_status: gr.update(visible=True),
-                            hf_inputs: gr.update(visible=False)
-                        }
-                    def toggle_hf_inputs():
-                        return {
-                            local_file: gr.update(visible=False),
-                            file_format: gr.update(visible=False),
-                            local_status: gr.update(visible=False),
-                            hf_inputs: gr.update(visible=True)
-                        }
-                    load_local_btn.click(
-                        fn=toggle_local_inputs,
-                        inputs=None,
-                        outputs=[local_file, file_format, local_status, hf_inputs]
-                    )
-                    load_hf_btn.click(
-                        fn=toggle_hf_inputs,
-                        inputs=None,
-                        outputs=[local_file, file_format, local_status, hf_inputs]
-                    )
-                    def process_and_load_local(file_obj, format):
-                        status = process_local_file(file_obj, format)
-                        if "Successfully" in status:
-                            return load_dataset()
-                        return [status], 0, 0
-                    local_file.change(
-                        fn=process_and_load_local,
-                        inputs=[local_file, file_format],
-                        outputs=[inp_box, bar]
-                    )
-                    def load_hf_dataset(name):
-                        status = load_from_huggingface(name)
-                        print('status', status)
-                        if "Successfully" in status:
-                            return load_dataset()
-                        return [("Error loading dataset: " + status, None)], gr.update(value=0, maximum=1)
-                    load_dataset_btn.click(
-                        fn=load_hf_dataset,
-                        inputs=[dataset_name],
-                        outputs=[inp_box, bar]
-                    )
-                    apply_btn.click(fn=update_example, inputs=inp_box, outputs=inp_box)
-                    save_btn.click(fn=save_dataset, inputs=inp_box, outputs=inp_box)
-                    validate_btn.click(fn=validate_example, inputs=None, outputs=inp_box)
-                    next_btn.click(fn=next_example, inputs=None, outputs=[inp_box, bar])
-                    previous_btn.click(fn=previous_example, inputs=None, outputs=[inp_box, bar])
-                    bar.change(
-                        fn=example_by_id,
-                        inputs=[bar],
-                        outputs=[inp_box, bar],
-                        api_name="example_by_id"
-                    )
-                    # Add Hugging Face upload functionality
-                    upload_to_hf_btn.click(
-                        fn=update_hf_dataset,
-                        inputs=[hf_repo_name, hf_repo_type, hf_is_private],
-                        outputs=[hf_upload_status]
-                    )
-demo.launch()

+"""Main application module for NER annotation tool."""
 import os
 import json
+import gradio as gr
 from typing import List, Dict, Union, Tuple
+from src.ner_annotation.core.dataset import DynamicDataset, prepare_for_highlight
+from src.ner_annotation.core.annotator import AutoAnnotator
+from src.ner_annotation.utils.text_processing import extract_tokens_and_labels
+from src.ner_annotation.utils.file_processing import process_uploaded_file, load_from_local_file
+from src.ner_annotation.utils.huggingface import (
+    is_valid_repo_name,
+    upload_to_hf,
+    download_from_hf
+)
 # Available models for annotation
 AVAILABLE_MODELS = [
     "knowledgator/gliner-multitask-base-v0.5"
 ]
+# Module-level state shared by the Gradio callbacks below
 dynamic_dataset = None
+annotator = None
+sentences = []
 def load_dataset():
+    """Load the dataset and return the first example."""
     global dynamic_dataset
     try:
         with open("data/annotated_data.json", 'rt') as dataset:
             ANNOTATED_DATA = json.load(dataset)
         dynamic_dataset = DynamicDataset(ANNOTATED_DATA)
         return [("Error loading dataset: " + str(e), None)], gr.update(value=0, maximum=1)
 def example_by_id(id):
+    """Navigate to a specific example by ID."""
     global dynamic_dataset
     if dynamic_dataset is None:
         return [("Please load a dataset first", None)], gr.update(value=0, maximum=1)
     try:
+        id = int(id)
         dynamic_dataset.example_by_id(id)
         current = dynamic_dataset.current
         max_value = len(dynamic_dataset.data) - 1
         return [("Error navigating to example: " + str(e), None)], gr.update(value=0, maximum=1)
 def next_example():
+    """Move to the next example."""
     global dynamic_dataset
     if dynamic_dataset is None:
         return [("Please load a dataset first", None)], gr.update(value=0, maximum=1)
         return [("Error navigating to next example: " + str(e), None)], gr.update(value=0, maximum=1)
 def previous_example():
+    """Move to the previous example."""
     global dynamic_dataset
     if dynamic_dataset is None:
         return [("Please load a dataset first", None)], gr.update(value=0, maximum=1)
         return [("Error navigating to previous example: " + str(e), None)], gr.update(value=0, maximum=1)
 def update_example(data):
+    """Update the current example with new annotations."""
     global dynamic_dataset
     if dynamic_dataset is None:
         return [("Please load a dataset first", None)]
     return prepare_for_highlight(dynamic_dataset.load_current_example())
 def validate_example():
+    """Mark the current example as validated."""
     global dynamic_dataset
     if dynamic_dataset is None:
         return [("Please load a dataset first", None)]
     return [("The example was validated!", None)]
 def save_dataset(inp):
+    """Save the dataset to a file."""
     global dynamic_dataset
     if dynamic_dataset is None:
         return [("Please load a dataset first", None)]
         json.dump(dynamic_dataset.data, file)
     return [("The validated dataset was saved as data/annotated_data.json", None)]
 def annotate(model, labels, threshold, prompt, save_to_hub, repo_name, repo_type, is_private):
+    """Annotate the uploaded text using the selected model."""
+    global annotator, sentences
     try:
         if not sentences:
             return "Please upload a file with text first!"
         if save_to_hub and not is_valid_repo_name(repo_name):
             return "Error: Invalid repo name. Only use letters, numbers, '-', '_', or '.' (no slashes or spaces)."
         labels = [label.strip() for label in labels.split(",")]
         annotator = AutoAnnotator(model)
         annotated_data = annotator.auto_annotate(sentences, labels, prompt, threshold)
         # Save annotated data locally
         os.makedirs("data", exist_ok=True)
         local_path = "data/annotated_data.json"
         with open(local_path, "wt") as file:
             json.dump(annotated_data, file, ensure_ascii=False)
         status_messages = [f"Successfully annotated and saved locally to {local_path}"]
         # Upload to Hugging Face Hub if requested
         if save_to_hub:
             try:
+                repo_id = upload_to_hf(local_path, repo_name, repo_type, is_private)
                 status_messages.append(f"Successfully uploaded to Hugging Face Hub repository: {repo_id}")
             except Exception as e:
                 status_messages.append(f"Error with Hugging Face Hub: {str(e)}")
         return "\n".join(status_messages)
     except Exception as e:
         return f"Error during annotation: {str(e)}"
+def load_from_huggingface(name):
+    """Load a dataset from Hugging Face Hub."""
+    global dynamic_dataset
     try:
+        # Download dataset from Hugging Face Hub
+        local_path = download_from_hf(name, "annotated_data.json")
+        # Load the downloaded dataset
+        with open(local_path, 'rt') as dataset:
+            data = json.load(dataset)
+        # Initialize the dataset
+        dynamic_dataset = DynamicDataset(data)
+        return "Successfully loaded dataset from Hugging Face Hub"
     except Exception as e:
+        return f"Error loading dataset from Hugging Face Hub: {str(e)}"
+def update_hf_dataset(repo_name, repo_type, is_private):
+    """Upload the current dataset to Hugging Face Hub."""
+    global dynamic_dataset
+    if dynamic_dataset is None:
+        return "Please load a dataset first"
     try:
+        if not is_valid_repo_name(repo_name):
+            return "Error: Invalid repo name. Only use letters, numbers, '-', '_', or '.' (no slashes or spaces)."
+        # Save dataset locally first
         os.makedirs("data", exist_ok=True)
+        local_path = "data/annotated_data.json"
+        with open(local_path, "wt") as file:
+            json.dump(dynamic_dataset.data, file, ensure_ascii=False)
+        # Upload to Hugging Face Hub
+        repo_id = upload_to_hf(local_path, repo_name, repo_type, is_private)
+        return f"Successfully uploaded to Hugging Face Hub repository: {repo_id}"
     except Exception as e:
+        return f"Error uploading to Hugging Face Hub: {str(e)}"
+def process_conll(content):
+    """Convert CoNLL format to JSON."""
+    sentences = []
+    current_sentence = {"text": "", "tokenized_text": [], "ner": []}
+    for line in content.split('\n'):
+        if not line.strip():
+            if current_sentence["text"]:
+                sentences.append(current_sentence)
+                current_sentence = {"text": "", "tokenized_text": [], "ner": []}
+            continue
+        parts = line.split()
+        if len(parts) >= 2:
+            token, label = parts[0], parts[-1]
+            current_sentence["tokenized_text"].append(token)
+            current_sentence["ner"].append(label)
+            current_sentence["text"] += token + " "
+    if current_sentence["text"]:
+        sentences.append(current_sentence)
+    return sentences
+def process_txt(content):
+    """Convert plain text to JSON format."""
+    sentences = []
+    for line in content.split('\n'):
+        if line.strip():
+            sentences.append({
+                "text": line.strip(),
+                "tokenized_text": line.strip().split(),
+                "ner": ["O"] * len(line.strip().split())
+            })
+    return sentences
+def process_local_file(file_obj, format):
+    """Process a local file and save it as JSON."""
     try:
+        if file_obj is None:
+            return "No file uploaded"
+        # Get the file content from the Gradio file object
+        content = file_obj.name
+        with open(content, 'r', encoding='utf-8') as f:
+            content = f.read()
+        if format == "json":
+            data = json.loads(content)
+        elif format == "conll":
+            data = process_conll(content)
+        elif format == "txt":
+            data = process_txt(content)
+        else:
+            return "Unsupported file format"
         os.makedirs("data", exist_ok=True)
+        with open("data/annotated_data.json", "wt") as f:
+            json.dump(data, f, ensure_ascii=False)
+        return "Successfully processed and saved file"
     except Exception as e:
+        return f"Error processing file: {str(e)}"
+def create_interface():
+    """Create and return the Gradio interface."""
+    with gr.Blocks() as demo:
+        gr.Markdown("# NER Annotation Tool")
+        with gr.Tabs():
+            with gr.TabItem("Auto Annotation"):
+                with gr.Row():
+                    with gr.Column():
+                        file_uploader = gr.File(label="Upload text file (one sentence per line)")
+                        upload_status = gr.Textbox(label="Upload Status")
+                        file_uploader.change(fn=process_uploaded_file, inputs=[file_uploader], outputs=[upload_status])
+                    with gr.Column():
+                        model = gr.Dropdown(
+                            label="Choose the model for annotation",
+                            choices=AVAILABLE_MODELS,
+                            value=AVAILABLE_MODELS[0]
+                        )
+                        labels = gr.Textbox(
+                            label="Labels",
+                            placeholder="Enter comma-separated labels (e.g., PERSON,ORG,LOC)",
+                            scale=2
+                        )
+                        threshold = gr.Slider(
+                            0, 1,
+                            value=0.3,
+                            step=0.01,
+                            label="Threshold",
+                            info="Lower threshold increases entity predictions"
+                        )
+                        prompt = gr.Textbox(
+                            label="Prompt",
+                            placeholder="Enter your annotation prompt (optional)",
+                            scale=2
                         )
+                        with gr.Group():
+                            gr.Markdown("### Save Options")
+                            save_to_hub = gr.Checkbox(
+                                label="Save to Hugging Face Hub",
                                 value=False
                             )
+                            with gr.Group(visible=False) as hub_settings:
+                                gr.Markdown("#### Hugging Face Hub Settings")
+                                repo_name = gr.Textbox(
+                                    label="Repository Name",
+                                    placeholder="Enter repository name (e.g., my-ner-dataset)",
+                                    scale=2
+                                )
+                                repo_type = gr.Dropdown(
+                                    choices=["dataset", "model", "space"],
+                                    value="dataset",
+                                    label="Repository Type"
+                                )
+                                is_private = gr.Checkbox(
+                                    label="Private Repository",
+                                    value=False
+                                )
+                        annotate_btn = gr.Button("Annotate Data")
+                        output_info = gr.Textbox(label="Processing Status")
+                        # Add download buttons for annotated data
                         with gr.Row():
+                            download_btn_annot = gr.Button("Download Annotated Data", visible=False)
+                        download_file_annot = gr.File(label="Download", interactive=False, visible=False)
+                        download_status = gr.Textbox(label="Download Status", visible=False)
+                        def toggle_hub_settings(save_to_hub):
+                            return {
+                                hub_settings: gr.update(visible=save_to_hub)
+                            }
+                        save_to_hub.change(
+                            fn=toggle_hub_settings,
+                            inputs=[save_to_hub],
+                            outputs=[hub_settings]
+                        )
+                        def show_download_buttons(status):
+                            if status and status.startswith("Successfully annotated and saved locally"):
+                                return gr.update(visible=True), gr.update(visible=True)
+                            return gr.update(visible=False), gr.update(visible=False)
+                        annotate_btn.click(
+                            fn=annotate,
+                            inputs=[
+                                model, labels, threshold, prompt,
+                                save_to_hub, repo_name, repo_type, is_private
+                            ],
+                            outputs=[output_info]
+                        )
+                        output_info.change(
+                            fn=show_download_buttons,
+                            inputs=[output_info],
+                            outputs=[download_btn_annot, download_status]
+                        )
+                        def handle_download_annot():
+                            file_path = "data/annotated_data.json"
+                            if os.path.exists(file_path):
+                                return gr.update(value=file_path, visible=True)
+                            return gr.update(visible=False)
+                        download_btn_annot.click(
+                            fn=handle_download_annot,
+                            inputs=None,
+                            outputs=[download_file_annot]
+                        )
+            with gr.TabItem("Dataset Viewer"):
+                with gr.Row():
+                    with gr.Column(scale=1):
+                        gr.Markdown("### Dataset Controls")
+                        with gr.Group():
+                            with gr.Row():
+                                load_local_btn = gr.Button("Load Local Dataset", variant="primary")
+                                load_hf_btn = gr.Button("Load from Hugging Face", variant="secondary")
+                            with gr.Group() as local_inputs:
+                                local_file = gr.File(label="Upload Local Dataset")
+                                file_format = gr.Dropdown(
+                                    choices=["json", "conll", "txt"],
+                                    value="json",
+                                    label="File Format"
+                                )
+                                local_status = gr.Textbox(label="Status", interactive=False)
+                            with gr.Group(visible=False) as hf_inputs:
+                                with gr.Row():
+                                    dataset_name = gr.Textbox(
+                                        label="Dataset Name",
+                                        placeholder="Enter dataset name (e.g., conll2003)",
+                                        scale=4
+                                    )
+                                with gr.Row():
+                                    gr.Column(scale=1)
+                                    load_dataset_btn = gr.Button("📥 Load Dataset", variant="primary")
+                                    gr.Column(scale=1)
+                                with gr.Row():
+                                    gr.Markdown(
+                                        "💡 Tip: Enter a valid Hugging Face dataset name",
+                                        elem_classes=["text-sm", "text-gray-500"]
+                                    )
+                        gr.Markdown("### Navigation")
+                        with gr.Group():
+                            bar = gr.Slider(
+                                minimum=0,
+                                maximum=1,
+                                step=1,
+                                label="Progress",
+                                interactive=True,
+                                info="Use slider to navigate through examples"
                             )
+                            with gr.Row():
+                                previous_btn = gr.Button("← Previous", variant="secondary")
+                                next_btn = gr.Button("Next →", variant="secondary")
+                        gr.Markdown("### Actions")
+                        with gr.Group():
+                            with gr.Row():
+                                apply_btn = gr.Button("Apply Changes", variant="primary")
+                                validate_btn = gr.Button("Validate", variant="secondary")
+                            save_btn = gr.Button("Save Dataset", variant="primary")
+                        gr.Markdown("### Hugging Face Upload")
+                        with gr.Group():
+                            with gr.Row():
+                                show_hf_upload_btn = gr.Button("📤 Show Upload Options", variant="secondary", scale=1)
+                                hide_hf_upload_btn = gr.Button("📥 Hide Upload Options", visible=False, variant="secondary", scale=1)
+                            with gr.Group(visible=False) as hf_upload_group:
+                                with gr.Row():
+                                    hf_repo_name = gr.Textbox(
+                                        label="Repository Name",
+                                        placeholder="Enter repository name (e.g., my-ner-dataset)",
+                                        scale=2
+                                    )
+                                    hf_repo_type = gr.Dropdown(
+                                        choices=["dataset", "model", "space"],
+                                        value="dataset",
+                                        label="Repository Type",
+                                        scale=1
+                                    )
+                                with gr.Row():
+                                    hf_is_private = gr.Checkbox(
+                                        label="Private Repository",
+                                        value=False,
+                                        scale=1
+                                    )
+                                    upload_to_hf_btn = gr.Button("Upload to Hugging Face", variant="primary", scale=2)
+                                hf_upload_status = gr.Textbox(
+                                    label="Upload Status",
+                                    interactive=False,
+                                    show_label=True
+                                )
+                        def toggle_upload_options(show: bool):
+                            return {
+                                hf_upload_group: gr.update(visible=show),
+                                show_hf_upload_btn: gr.update(visible=not show),
+                                hide_hf_upload_btn: gr.update(visible=show)
+                            }
+                        show_hf_upload_btn.click(
+                            fn=lambda: toggle_upload_options(True),
+                            inputs=None,
+                            outputs=[hf_upload_group, show_hf_upload_btn, hide_hf_upload_btn]
                         )
+                        hide_hf_upload_btn.click(
+                            fn=lambda: toggle_upload_options(False),
+                            inputs=None,
+                            outputs=[hf_upload_group, show_hf_upload_btn, hide_hf_upload_btn]
                         )
+                    with gr.Column(scale=2):
+                        gr.Markdown("### Current Example")
+                        inp_box = gr.HighlightedText(value=None, interactive=True)
+                        def toggle_local_inputs():
+                            return {
+                                local_inputs: gr.update(visible=True),
+                                hf_inputs: gr.update(visible=False)
+                            }
+                        def toggle_hf_inputs():
+                            return {
+                                local_inputs: gr.update(visible=False),
+                                hf_inputs: gr.update(visible=True)
+                            }
+                        load_local_btn.click(
+                            fn=toggle_local_inputs,
+                            inputs=None,
+                            outputs=[local_inputs, hf_inputs]
                         )
+                        load_hf_btn.click(
+                            fn=toggle_hf_inputs,
+                            inputs=None,
+                            outputs=[local_inputs, hf_inputs]
+                        )
+                        def process_and_load_local(file_obj, format):
+                            status = process_local_file(file_obj, format)
+                            if "Successfully" in status:
+                                result = load_dataset()
+                                return result[0], result[1], status
+                            return [("Error loading dataset: " + status, None)], gr.update(value=0, maximum=1), status
+                        local_file.change(
+                            fn=process_and_load_local,
+                            inputs=[local_file, file_format],
+                            outputs=[inp_box, bar, local_status]
+                        )
+                        def load_hf_dataset(name):
+                            status = load_from_huggingface(name)
+                            if "Successfully" in status:
+                                return load_dataset()
+                            return [("Error loading dataset: " + status, None)], gr.update(value=0, maximum=1)
+                        load_dataset_btn.click(
+                            fn=load_hf_dataset,
+                            inputs=[dataset_name],
+                            outputs=[inp_box, bar]
+                        )
+                        apply_btn.click(fn=update_example, inputs=inp_box, outputs=inp_box)
+                        save_btn.click(fn=save_dataset, inputs=inp_box, outputs=inp_box)
+                        validate_btn.click(fn=validate_example, inputs=None, outputs=inp_box)
+                        next_btn.click(fn=next_example, inputs=None, outputs=[inp_box, bar])
+                        previous_btn.click(fn=previous_example, inputs=None, outputs=[inp_box, bar])
+                        bar.change(
+                            fn=example_by_id,
+                            inputs=[bar],
+                            outputs=[inp_box, bar],
+                            api_name="example_by_id"
+                        )
+                        upload_to_hf_btn.click(
+                            fn=update_hf_dataset,
+                            inputs=[hf_repo_name, hf_repo_type, hf_is_private],
+                            outputs=[hf_upload_status]
+                        )
+        return demo
+def main():
+    """Build the annotation interface and launch it."""
+    create_interface().launch()
+if __name__ == "__main__":
+    main()

data/annotated_data.json CHANGED Viewed

The diff for this file is too large to render. See raw diff

pyproject.toml CHANGED Viewed

@@ -1,13 +1,27 @@
 [project]
 name = "ner-annotation"
 version = "0.1.0"
-description = "Add your description here"
-readme = "README.md"
-requires-python = ">=3.10"
 dependencies = [
-    "datasets>=3.6.0",
-    "gliner>=0.2.20",
-    "gradio>=5.31.0",
-    "huggingface-hub>=0.32.1",
-    "python-dotenv>=1.1.0",
 ]

 [project]
 name = "ner-annotation"
 version = "0.1.0"
+description = "A tool for annotating text with named entities using GLiNER models"
+authors = [
+    {name = "Your Name", email = "your.email@example.com"}
+]
 dependencies = [
+    "gradio>=4.0.0",
+    "torch>=2.0.0",
+    "gliner>=0.1.0",
+    "huggingface-hub>=0.19.0",
+    "pandas>=2.0.0",
+    "python-dotenv>=1.0.0",
+    "requests>=2.31.0"
 ]
+requires-python = ">=3.8"
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+[tool.hatch.metadata]
+allow-direct-references = true
+[tool.hatch.build.targets.wheel]
+packages = ["src/ner_annotation"]

src/ner_annotation/__init__.py ADDED Viewed

	@@ -0,0 +1,3 @@


1	+ """NER Annotation Tool - A tool for annotating text with named entities."""
2	+
3	+ __version__ = "0.1.0"

src/ner_annotation/__main__.py ADDED Viewed

	@@ -0,0 +1,6 @@

+"""Main entry point for the NER annotation tool."""
+# NOTE(review): this relative import needs an `app` module inside the
+# `ner_annotation` package. Only a repository-root `app.py` is visible in this
+# change set -- confirm the packaged module exists, otherwise
+# `python -m ner_annotation` fails at import time.
+from .app import main
+if __name__ == "__main__":
+    main()

src/ner_annotation/__pycache__/__init__.cpython-310.pyc ADDED Viewed

Binary file (266 Bytes). View file

src/ner_annotation/core/__init__.py ADDED Viewed

	@@ -0,0 +1,6 @@

+"""Core functionality for NER annotation."""
+from .dataset import DynamicDataset, prepare_for_highlight
+from .annotator import AutoAnnotator
+__all__ = ['DynamicDataset', 'prepare_for_highlight', 'AutoAnnotator']

src/ner_annotation/core/__pycache__/__init__.cpython-310.pyc ADDED Viewed

Binary file (380 Bytes). View file

src/ner_annotation/core/__pycache__/annotator.cpython-310.pyc ADDED Viewed

Binary file (5.03 kB). View file

src/ner_annotation/core/__pycache__/dataset.cpython-310.pyc ADDED Viewed

Binary file (5.16 kB). View file

src/ner_annotation/core/annotator.py ADDED Viewed

	@@ -0,0 +1,192 @@

+"""NER annotation module using GLiNER models."""
+from typing import List, Dict, Union, Optional
+import torch
+import random
+from gliner import GLiNER
+from ..utils.text_processing import tokenize_text
+class AutoAnnotator:
+    """Automatic NER annotator backed by a GLiNER model.
+    Attributes:
+        model: The loaded GLiNER model, moved to the selected device
+        annotated_data: Examples produced by the latest auto_annotate call
+        stat: Progress counters ("total" texts and "current" texts processed)
+    """
+    def __init__(
+        self,
+        model: str = "BookingCare/gliner-multi-healthcare",
+        device: Optional[torch.device] = None
+    ) -> None:
+        """Initialize the annotator with a GLiNER model.
+        Args:
+            model: Name or path of the GLiNER model to use
+            device: Device to run the model on; defaults to cuda:0 when CUDA
+                is available and to the CPU otherwise
+        """
+        if device is None:
+            device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
+        # Set PyTorch memory management settings
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+            torch.cuda.set_per_process_memory_fraction(0.8)  # Cap this process at 80% of GPU memory
+        self.model = GLiNER.from_pretrained(model).to(device)
+        self.annotated_data = []
+        self.stat = {
+            "total": None,
+            "current": -1
+        }
+    def auto_annotate(
+        self,
+        data: List[str],
+        labels: List[str],
+        prompt: Optional[Union[str, List[str]]] = None,
+        threshold: float = 0.5,
+        nested_ner: bool = False,
+        batch_size: int = 8
+    ) -> List[Dict]:
+        """Annotate a list of texts with NER labels.
+        Args:
+            data: List of texts to annotate
+            labels: List of entity labels to detect
+            prompt: Optional prompt, or list of prompts from which one is
+                picked at random for every text; prepended to the text
+            threshold: Confidence threshold for entity detection
+            nested_ner: Whether to allow nested entities
+            batch_size: Number of texts sent to the model at once; the
+                default is kept small to prevent OOM errors
+        Returns:
+            List of annotated examples (also stored in self.annotated_data)
+        Raises:
+            ValueError: If batch_size is smaller than 1
+        """
+        if batch_size < 1:
+            raise ValueError("batch_size must be at least 1")
+        self.stat["total"] = len(data)
+        self.stat["current"] = -1
+        processed_data = []
+        for i in range(0, len(data), batch_size):
+            batch_with_prompts = []
+            for text in data[i:i + batch_size]:
+                if isinstance(prompt, list):
+                    # An empty prompt list means "no prompt"; random.choice
+                    # would raise IndexError on it.
+                    prompt_text = random.choice(prompt) if prompt else None
+                else:
+                    prompt_text = prompt
+                batch_with_prompts.append(f"{prompt_text}\n{text}" if prompt_text else text)
+            processed_data.extend(
+                self._batch_annotate_text(batch_with_prompts, labels, threshold, nested_ner)
+            )
+            # Clear CUDA cache after each batch
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
+            # Update progress
+            self.stat["current"] = min(i + batch_size, len(data))
+        self.annotated_data = processed_data
+        return self.annotated_data
+    def _batch_annotate_text(
+        self,
+        texts: List[str],
+        labels: List[str],
+        threshold: float,
+        nested_ner: bool
+    ) -> List[Dict]:
+        """Annotate multiple texts in batch.
+        Args:
+            texts: List of texts to annotate
+            labels: List of entity labels
+            threshold: Confidence threshold
+            nested_ner: Whether to allow nested entities
+        Returns:
+            List of annotated examples in the tokenized format
+        """
+        batch_entities = self.model.batch_predict_entities(
+            texts,
+            labels,
+            flat_ner=not nested_ner,
+            threshold=threshold
+        )
+        results = []
+        for text, entities in zip(texts, batch_entities):
+            raw_entities = [
+                {
+                    "entity": entity["label"],
+                    "word": entity["text"],
+                    "start": entity["start"],
+                    "end": entity["end"],
+                    # Placeholder: the model's confidence is not carried over
+                    "score": 0,
+                }
+                for entity in entities
+            ]
+            results.append(self._transform_data({
+                "text": text,
+                "entities": self._merge_entities(raw_entities),
+            }))
+        return results
+    def _merge_entities(self, entities: List[Dict]) -> List[Dict]:
+        """Merge adjacent entities of the same type.
+        Two consecutive entities are merged when they share a label and the
+        second one starts at the first one's end or one position after it.
+        The input dictionaries are left untouched; merged copies are returned.
+        Args:
+            entities: List of entity dictionaries
+        Returns:
+            List of merged entities
+        """
+        if not entities:
+            return []
+        merged = []
+        current = dict(entities[0])
+        for next_entity in entities[1:]:
+            if (next_entity['entity'] == current['entity'] and
+                    next_entity['start'] in (current['end'], current['end'] + 1)):
+                current['word'] += ' ' + next_entity['word']
+                current['end'] = next_entity['end']
+            else:
+                merged.append(current)
+                current = dict(next_entity)
+        merged.append(current)
+        return merged
+    @staticmethod
+    def _token_offsets(text: str, tokens: List[str]):
+        """Return (start, end) character offsets of each token inside text.
+        Returns None when a token cannot be found in order, i.e. when the
+        tokens are not plain consecutive substrings of the text.
+        """
+        offsets = []
+        cursor = 0
+        for token in tokens:
+            start = text.find(token, cursor)
+            if start < 0:
+                return None
+            cursor = start + len(token)
+            offsets.append((start, cursor))
+        return offsets
+    @staticmethod
+    def _locate_by_offsets(offsets, start, end):
+        """Return (first, last) indices of the tokens overlapping [start, end).
+        Returns None when offsets are unavailable or no token overlaps.
+        """
+        if offsets is None or start is None or end is None or end <= start:
+            return None
+        covered = [
+            idx for idx, (tok_start, tok_end) in enumerate(offsets)
+            if tok_start < end and tok_end > start
+        ]
+        if not covered:
+            return None
+        return covered[0], covered[-1]
+    @staticmethod
+    def _locate_by_tokens(tokens: List[str], entity_tokens: List[str]):
+        """Return (first, last) indices of the first match of entity_tokens.
+        Returns None for an empty entity or when there is no match.
+        """
+        length = len(entity_tokens)
+        if length == 0:
+            return None
+        for i in range(len(tokens) - length + 1):
+            if tokens[i:i + length] == entity_tokens:
+                return i, i + length - 1
+        return None
+    def _transform_data(self, data: Dict) -> Dict:
+        """Transform raw annotation data into tokenized format.
+        Entities are mapped to token indices through their "start"/"end"
+        offsets, so a repeated mention is anchored to its own position instead
+        of the first identical token sequence. Entities without usable offsets
+        fall back to the first matching token sequence; entities that cannot
+        be located at all are dropped.
+        Args:
+            data: Dict with "text" and "entities"; every entity has "entity"
+                and "word" and normally "start"/"end"
+        Returns:
+            Dict with "tokenized_text", "ner" ([start, end, label] token
+            spans, end inclusive) and "validated" set to False
+        """
+        text = data['text']
+        tokens = tokenize_text(text)
+        # GLiNER reports entity positions as character offsets into the text
+        # it was given, with an exclusive end.
+        offsets = self._token_offsets(text, tokens)
+        spans = []
+        for entity in data['entities']:
+            located = self._locate_by_offsets(
+                offsets, entity.get('start'), entity.get('end')
+            )
+            if located is None:
+                located = self._locate_by_tokens(tokens, tokenize_text(entity['word']))
+            if located is not None:
+                spans.append([located[0], located[1], entity['entity']])
+        return {
+            "tokenized_text": tokens,
+            "ner": spans,
+            "validated": False
+        }

src/ner_annotation/core/dataset.py ADDED Viewed

	@@ -0,0 +1,162 @@

+"""Dataset management module for NER annotation."""
+from typing import List, Dict, Union, Tuple
+import json
+import os
+import re
+class DynamicDataset:
+    """Cursor over a list of annotated examples with bounded navigation."""
+    def __init__(
+            self, data: List[Dict[str, Union[List[Union[int, str]], bool]]]
+    ) -> None:
+        """Store the examples and place the cursor before the first one.
+        Args:
+            data: List of examples, each containing tokenized text and NER
+                annotations; examples lacking a "validated" key get it set
+                to False in place
+        """
+        self.data = data
+        self.data_len = len(data)
+        self.current = -1
+        for item in data:
+            item.setdefault("validated", False)
+    def _clamp_current(self) -> None:
+        """Pull the cursor back inside the dataset bounds."""
+        last_index = self.data_len - 1
+        if self.current > last_index:
+            self.current = last_index
+        elif self.current < 0:
+            self.current = 0
+    def next_example(self) -> None:
+        """Advance the cursor by one example, staying within bounds."""
+        self.current += 1
+        self._clamp_current()
+    def previous_example(self) -> None:
+        """Move the cursor back by one example, staying within bounds."""
+        self.current -= 1
+        self._clamp_current()
+    def example_by_id(self, id: int) -> None:
+        """Jump to the example at the given index, staying within bounds.
+        Args:
+            id: The index of the example to navigate to
+        """
+        self.current = id
+        self._clamp_current()
+    def validate(self) -> None:
+        """Flag the example under the cursor as validated."""
+        self.data[self.current]["validated"] = True
+    def load_current_example(self) -> Dict:
+        """Return the example under the cursor.
+        Returns:
+            The current example data
+        """
+        return self.data[self.current]
+def tokenize_text(text: str) -> List[str]:
+    """Split text into word and symbol tokens.
+    A word may contain internal hyphens or underscores; every other
+    non-whitespace character becomes a token of its own.
+    Args:
+        text: The input text to tokenize
+    Returns:
+        List of tokens
+    """
+    token_pattern = re.compile(r'\w+(?:[-_]\w+)*|\S')
+    return token_pattern.findall(text)
+def join_tokens(tokens: List[str]) -> str:
+    """Rebuild a string from tokens, attaching punctuation to the left.
+    Args:
+        tokens: List of tokens to join
+    Returns:
+        Joined text string
+    """
+    attach_left = {",", ".", "!", "?", ":", ";", "..."}
+    joined = ""
+    for tok in tokens:
+        # Punctuation replaces any trailing whitespace; other tokens are
+        # separated from what came before by a single space.
+        joined = joined.rstrip() + tok if tok in attach_left else joined + " " + tok
+    return joined.strip()
+def prepare_for_highlight(data: Dict) -> List[Tuple[str, str]]:
+    """Prepare text for highlighting with NER annotations.
+    Walks the tokens once, grouping consecutive tokens into entity segments
+    (labelled) and plain segments (label None).
+    Args:
+        data: Dictionary containing tokenized text ("tokenized_text") and NER
+            annotations ("ner", entries indexed as [start, end, label] with an
+            inclusive end token index)
+    Returns:
+        List of tuples containing text segments and their entity labels
+        (None for segments outside any entity)
+    """
+    tokens = data["tokenized_text"]
+    ner = data["ner"]
+    highlighted_text = []
+    current_entity = None
+    entity_tokens = []
+    normal_tokens = []
+    for idx, token in enumerate(tokens):
+        # Past the end of the active entity (or none active): flush it and
+        # look for the first annotation that starts exactly at this token.
+        if current_entity is None or idx > current_entity[1]:
+            if entity_tokens:
+                highlighted_text.append((" ".join(entity_tokens), current_entity[2]))
+                entity_tokens = []
+            current_entity = next((entity for entity in ner if entity[0] == idx), None)
+        if current_entity and current_entity[0] <= idx <= current_entity[1]:
+            # Entering or continuing an entity: emit pending plain text first
+            if normal_tokens:
+                highlighted_text.append((" ".join(normal_tokens), None))
+                normal_tokens = []
+            # NOTE: each token already carries a trailing space and segments
+            # are joined with " ", so tokens end up separated by two spaces.
+            entity_tokens.append(token + " ")
+        else:
+            if entity_tokens:
+                highlighted_text.append((" ".join(entity_tokens), current_entity[2]))
+                entity_tokens = []
+            normal_tokens.append(token + " ")
+    # Flush whatever segment is still open after the last token
+    if entity_tokens:
+        highlighted_text.append((" ".join(entity_tokens), current_entity[2]))
+    if normal_tokens:
+        highlighted_text.append((" ".join(normal_tokens), None))
+    cleaned_highlighted_text = []
+    for text, label in highlighted_text:
+        # Drop a single whitespace character that directly precedes punctuation
+        cleaned_text = re.sub(r'\s(?=[,\.!?…:;])', '', text)
+        cleaned_highlighted_text.append((cleaned_text, label))
+    return cleaned_highlighted_text
+def save_dataset(data: List[Dict], filepath: str) -> None:
+    """Save the dataset to a UTF-8 JSON file.
+    Missing parent directories are created first.
+    Args:
+        data: The dataset to save
+        filepath: Path to save the dataset
+    """
+    parent_dir = os.path.dirname(filepath)
+    # A bare file name has no parent directory, and os.makedirs("") raises.
+    if parent_dir:
+        os.makedirs(parent_dir, exist_ok=True)
+    # ensure_ascii=False writes non-ASCII characters verbatim, so the encoding
+    # is pinned instead of depending on the platform default.
+    with open(filepath, "wt", encoding="utf-8") as file:
+        json.dump(data, file, ensure_ascii=False)
+def load_dataset(filepath: str) -> List[Dict]:
+    """Load a dataset from a UTF-8 JSON file.
+    Args:
+        filepath: Path to the dataset file
+    Returns:
+        The loaded dataset
+    """
+    # Read as UTF-8 explicitly; the platform default encoding may not be able
+    # to decode non-ASCII annotation text.
+    with open(filepath, "rt", encoding="utf-8") as file:
+        return json.load(file)

src/ner_annotation/utils/__init__.py ADDED Viewed

	@@ -0,0 +1,31 @@

+"""Utility functions for NER annotation."""
+from .text_processing import (
+    tokenize_text,
+    join_tokens,
+    process_text_for_gliner,
+    extract_tokens_and_labels
+)
+from .file_processing import (
+    process_uploaded_file,
+    load_from_local_file
+)
+from .huggingface import (
+    is_valid_repo_name,
+    create_hf_repo,
+    upload_to_hf,
+    download_from_hf
+)
+__all__ = [
+    'tokenize_text',
+    'join_tokens',
+    'process_text_for_gliner',
+    'extract_tokens_and_labels',
+    'process_uploaded_file',
+    'load_from_local_file',
+    'is_valid_repo_name',
+    'create_hf_repo',
+    'upload_to_hf',
+    'download_from_hf'
+]

src/ner_annotation/utils/__pycache__/__init__.cpython-310.pyc ADDED Viewed

Binary file (656 Bytes). View file

src/ner_annotation/utils/__pycache__/file_processing.cpython-310.pyc ADDED Viewed

Binary file (5.01 kB). View file

src/ner_annotation/utils/__pycache__/huggingface.cpython-310.pyc ADDED Viewed

Binary file (3.68 kB). View file

src/ner_annotation/utils/__pycache__/text_processing.cpython-310.pyc ADDED Viewed

Binary file (2.78 kB). View file

src/ner_annotation/utils/file_processing.py ADDED Viewed

	@@ -0,0 +1,215 @@

+"""File processing utilities for NER annotation."""
+import os
+import json
+import pandas as pd
+from typing import List, Dict, Union, Optional
+from .text_processing import tokenize_text, process_text_for_gliner
+def process_uploaded_file(file_obj) -> List[str]:
+    """Process an uploaded file into a list of sentences.
+    CSV files are read from the path in file_obj.name and must contain a
+    'Nội dung' column. Any other file is treated as UTF-8 text with one
+    sentence per non-empty line.
+    Args:
+        file_obj: The uploaded file object; must expose a `name` attribute
+            holding the file path
+    Returns:
+        List of processed sentences
+    Raises:
+        ValueError: If no file was provided
+        Exception: If file processing fails
+    """
+    if file_obj is None:
+        raise ValueError("Please upload a file first!")
+    try:
+        if file_obj.name.endswith('.csv'):
+            # Process CSV file
+            df = pd.read_csv(file_obj.name)
+            sentences = df['Nội dung'].dropna().tolist()
+        else:
+            # Process text file. Not every upload wrapper is file-like (some
+            # only carry the path), and read() may return str or bytes.
+            if hasattr(file_obj, 'read'):
+                content = file_obj.read()
+                if isinstance(content, bytes):
+                    content = content.decode('utf-8')
+            else:
+                with open(file_obj.name, 'r', encoding='utf-8') as f:
+                    content = f.read()
+            sentences = [line.strip() for line in content.splitlines() if line.strip()]
+        # Process each sentence and flatten the list
+        processed_sentences = []
+        for sentence in sentences:
+            processed_sentences.extend(process_text_for_gliner(sentence))
+        return processed_sentences
+    except Exception as e:
+        raise Exception(f"Error reading file: {str(e)}") from e
+def load_from_local_file(
+    file_path: str,
+    file_format: str = "json"
+) -> List[Dict]:
+    """Load a local annotation file and convert it to the standard format.
+    Args:
+        file_path: Path to the file to load
+        file_format: Format of the file (json, conll, or txt)
+    Returns:
+        List of converted examples
+    Raises:
+        Exception: If file loading fails (including an unsupported format
+            or a JSON document that is not a list)
+    """
+    try:
+        if file_format == "conll":
+            return _load_conll_file(file_path)
+        if file_format == "txt":
+            return _load_txt_file(file_path)
+        if file_format != "json":
+            raise ValueError(f"Unsupported file format: {file_format}")
+        with open(file_path, 'r', encoding='utf-8') as f:
+            examples = json.load(f)
+        if not isinstance(examples, list):
+            raise ValueError("JSON file must contain a list of examples")
+        already_standard = all(
+            "tokenized_text" in item and "ner" in item for item in examples
+        )
+        # Data already in the standard format is returned untouched;
+        # anything else goes through the JSON converter.
+        return examples if already_standard else _convert_json_format(examples)
+    except Exception as e:
+        raise Exception(f"Error loading file: {str(e)}")
+def _convert_json_format(data: List[Dict]) -> List[Dict]:
+    """Convert token/tag style JSON examples to the standard format.
+    Items without both "tokens" and "ner_tags" are skipped. Consecutive
+    tokens carrying the same tag other than "O" become one span.
+    Args:
+        data: List of examples in various JSON formats
+    Returns:
+        List of examples in the standard format
+    """
+    standard_examples = []
+    for record in data:
+        if not ("tokens" in record and "ner_tags" in record):
+            continue
+        # Pair tags with tokens so surplus tags are ignored
+        tags = [tag for _, tag in zip(record["tokens"], record["ner_tags"])]
+        spans = []
+        run_start = 0
+        while run_start < len(tags):
+            label = tags[run_start]
+            run_end = run_start
+            # Extend the run while the following tag equals the current one
+            while run_end + 1 < len(tags) and tags[run_end + 1] == label:
+                run_end += 1
+            if label != "O":
+                spans.append([run_start, run_end, label])
+            run_start = run_end + 1
+        standard_examples.append({
+            "tokenized_text": record["tokens"],
+            "ner": spans,
+            "validated": False
+        })
+    return standard_examples
+def _conll_spans_from_tags(tags: List[str]) -> List[List[Union[int, str]]]:
+    """Collapse runs of identical non-"O" tags into [start, end, tag] spans."""
+    spans = []
+    current_span = None
+    for i, tag in enumerate(tags):
+        if tag == "O":
+            if current_span is not None:
+                spans.append(current_span)
+                current_span = None
+        elif current_span is not None and tag == current_span[2]:
+            current_span[1] = i
+        else:
+            if current_span is not None:
+                spans.append(current_span)
+            current_span = [i, i, tag]
+    if current_span is not None:
+        spans.append(current_span)
+    return spans
+def _conll_example(tokens: List[str], tags: List[str]) -> Dict:
+    """Build one standard-format example from parallel token and tag lists."""
+    return {
+        "tokenized_text": tokens,
+        "ner": _conll_spans_from_tags(tags),
+        "validated": False
+    }
+def _load_conll_file(file_path: str) -> List[Dict]:
+    """Load and convert data from a CoNLL format file.
+    Sentences are separated by blank lines. Lines starting with "#" and
+    lines with fewer than two whitespace-separated columns are ignored. The
+    first column is the token and the last column is its tag.
+    Args:
+        file_path: Path to the CoNLL file
+    Returns:
+        List of converted examples
+    """
+    converted_data = []
+    tokens, tags = [], []
+    with open(file_path, 'r', encoding='utf-8') as f:
+        for line in f:
+            line = line.strip()
+            if not line:
+                # A blank line closes the sentence collected so far
+                if tokens:
+                    converted_data.append(_conll_example(tokens, tags))
+                    tokens, tags = [], []
+                continue
+            if line.startswith("#"):
+                continue
+            parts = line.split()
+            if len(parts) >= 2:
+                tokens.append(parts[0])
+                tags.append(parts[-1])
+    # Handle last example if the file does not end with a blank line
+    if tokens:
+        converted_data.append(_conll_example(tokens, tags))
+    return converted_data
+def _load_txt_file(file_path: str) -> List[Dict]:
+    """Turn each non-empty line of a text file into an unannotated example.
+    Args:
+        file_path: Path to the text file
+    Returns:
+        List of converted examples
+    """
+    with open(file_path, 'r', encoding='utf-8') as f:
+        stripped_lines = (raw_line.strip() for raw_line in f)
+        return [
+            {
+                "tokenized_text": tokenize_text(text),
+                "ner": [],
+                "validated": False
+            }
+            for text in stripped_lines
+            if text
+        ]

src/ner_annotation/utils/huggingface.py ADDED Viewed

	@@ -0,0 +1,137 @@

+"""Hugging Face Hub integration utilities."""
+import re
+import os
+from typing import Optional
+from huggingface_hub import HfApi, create_repo
+from dotenv import load_dotenv
+# Load environment variables
+load_dotenv()
+HF_TOKEN = os.getenv("HUGGINGFACE_ACCESS_TOKEN")
+def is_valid_repo_name(repo_name: str) -> bool:
+    """Check if a repository name is valid for Hugging Face Hub.
+    A valid name is non-empty and consists only of ASCII letters, digits,
+    '_', '.', '/' and '-'.
+    Args:
+        repo_name: The repository name to validate
+    Returns:
+        True if the name is valid, False otherwise
+    """
+    # fullmatch is used because "$" with re.match also accepts a name that
+    # ends in a newline.
+    return bool(re.fullmatch(r'[A-Za-z0-9_./-]+', repo_name))
+def create_hf_repo(
+    repo_name: str,
+    repo_type: str = "dataset",
+    private: bool = False
+) -> str:
+    """Create a new repository on Hugging Face Hub.
+    An already existing repository is not treated as an error
+    (exist_ok=True).
+    Args:
+        repo_name: Name of the repository to create
+        repo_type: Type of repository (dataset, model, or space)
+        private: Whether the repository should be private
+    Returns:
+        The repository ID
+    Raises:
+        Exception: If the repository name is invalid or creation fails
+    """
+    if not is_valid_repo_name(repo_name):
+        # The message lists what the validator really accepts; it used to
+        # claim slashes were rejected although they are allowed.
+        raise Exception(
+            "Invalid repo name: only letters, digits, '-', '_', '.' and '/' "
+            "are allowed"
+        )
+    try:
+        create_repo(
+            repo_id=repo_name,
+            repo_type=repo_type,
+            private=private,
+            exist_ok=True,
+            token=HF_TOKEN
+        )
+        return repo_name
+    except Exception as e:
+        raise Exception(f"Error creating repository: {str(e)}") from e
+def upload_to_hf(
+    file_path: str,
+    repo_name: str,
+    repo_type: str = "dataset",
+    private: bool = False
+) -> str:
+    """Upload a single file to a Hugging Face Hub repository.
+    The file is stored in the repository under its base name.
+    Args:
+        file_path: Path to the file to upload
+        repo_name: Name of the repository to upload to
+        repo_type: Type of repository
+        private: Whether the repository should be private
+    Returns:
+        The repository ID
+    Raises:
+        Exception: If the upload fails
+    """
+    try:
+        # Make sure the target repository exists before uploading
+        target_repo = create_hf_repo(repo_name, repo_type, private)
+        HfApi(token=HF_TOKEN).upload_file(
+            path_or_fileobj=file_path,
+            path_in_repo=os.path.basename(file_path),
+            repo_id=target_repo,
+            repo_type=repo_type,
+            token=HF_TOKEN
+        )
+    except Exception as e:
+        raise Exception(f"Error uploading to Hugging Face Hub: {str(e)}")
+    return target_repo
+def download_from_hf(
+    repo_name: str,
+    file_name: str,
+    local_path: Optional[str] = None
+) -> str:
+    """Download a file from a Hugging Face Hub dataset repository.
+    The file is fetched from the repository's `main` branch through its raw
+    URL, without authentication.
+    Args:
+        repo_name: Name of the repository to download from
+        file_name: Name of the file to download
+        local_path: Optional local path to save the file to; defaults to
+            data/<file_name>
+    Returns:
+        Path to the downloaded file
+    Raises:
+        Exception: If the download fails
+    """
+    try:
+        import requests
+        # Construct the raw URL for the file
+        raw_url = f"https://huggingface.co/datasets/{repo_name}/raw/main/{file_name}"
+        # Bound the wait: without a timeout a stalled connection blocks forever
+        response = requests.get(raw_url, timeout=30)
+        if response.status_code != 200:
+            raise Exception(f"Failed to download file: {response.status_code}")
+        # Save the file
+        if local_path is None:
+            local_path = os.path.join("data", file_name)
+        parent_dir = os.path.dirname(local_path)
+        # A bare file name has no parent directory, and os.makedirs("") raises.
+        if parent_dir:
+            os.makedirs(parent_dir, exist_ok=True)
+        with open(local_path, "wb") as f:
+            f.write(response.content)
+        return local_path
+    except Exception as e:
+        raise Exception(f"Error downloading from Hugging Face Hub: {str(e)}") from e

src/ner_annotation/utils/text_processing.py ADDED Viewed

	@@ -0,0 +1,124 @@

+"""Text processing utilities for NER annotation."""
+import re
+from typing import List, Dict, Union, Tuple
+def tokenize_text(text: str) -> List[str]:
+    """Break text into tokens: words and individual symbols.
+    Runs of word characters, optionally chained by single hyphens or
+    underscores, form one token; any other non-whitespace character is a
+    token by itself.
+    Args:
+        text: The input text to tokenize
+    Returns:
+        List of tokens
+    """
+    word_or_symbol = re.compile(r'\w+(?:[-_]\w+)*|\S')
+    return word_or_symbol.findall(text)
+def join_tokens(tokens: List[str]) -> str:
+    """Rebuild a text string from a list of tokens.
+    Punctuation tokens are attached directly to the text before them (any
+    trailing whitespace is removed first); every other token is preceded by
+    a single space.
+    Args:
+        tokens: List of tokens to join
+    Returns:
+        Joined text string with leading and trailing whitespace stripped
+    """
+    attach_left = frozenset((",", ".", "!", "?", ":", ";", "..."))
+    joined = ""
+    for tok in tokens:
+        joined = (joined.rstrip() + tok) if tok in attach_left else (joined + " " + tok)
+    return joined.strip()
+def process_text_for_gliner(
+    text: str,
+    max_tokens: int = 256,
+    overlap: int = 32
+) -> List[str]:
+    """Process text for GLiNER by splitting long texts into chunks.
+    The text is split into sentences after ".", "!" or "?" followed by
+    whitespace, and whole sentences are packed into chunks of at most
+    max_tokens tokens. A single sentence longer than max_tokens is cut into
+    windows of max_tokens tokens; overlap applies only between those windows,
+    not between chunks built from whole sentences. Every chunk is its tokens
+    joined by single spaces.
+    Args:
+        text: The input text to process
+        max_tokens: Maximum number of tokens per chunk (must be at least 1)
+        overlap: Number of tokens shared by consecutive windows of an
+            over-long sentence; clamped to the range 0..max_tokens - 1
+    Returns:
+        List of text chunks suitable for GLiNER
+    Raises:
+        ValueError: If max_tokens is smaller than 1
+    """
+    if max_tokens < 1:
+        raise ValueError("max_tokens must be at least 1")
+    # An overlap >= max_tokens would stop the window below from advancing
+    # (endless loop) and a negative one would skip tokens, so clamp it.
+    overlap = max(0, min(overlap, max_tokens - 1))
+    # First split into sentences to preserve natural boundaries
+    sentences = re.split(r'(?<=[.!?])\s+', text)
+    chunks = []
+    current_chunk = []
+    current_length = 0
+    for sentence in sentences:
+        # Tokenize the sentence
+        sentence_tokens = tokenize_text(sentence)
+        sentence_length = len(sentence_tokens)
+        # If a single sentence is too long, split it
+        if sentence_length > max_tokens:
+            # If we have accumulated tokens, add them as a chunk
+            if current_chunk:
+                chunks.append(" ".join(current_chunk))
+                current_chunk = []
+                current_length = 0
+            # Split the long sentence into overlapping windows
+            start = 0
+            while start < sentence_length:
+                end = min(start + max_tokens, sentence_length)
+                chunks.append(" ".join(sentence_tokens[start:end]))
+                # Step back by the overlap unless this was the last window
+                start = end - overlap if end < sentence_length else end
+        # If adding this sentence would exceed max_tokens, start a new chunk
+        elif current_length + sentence_length > max_tokens:
+            chunks.append(" ".join(current_chunk))
+            current_chunk = sentence_tokens
+            current_length = sentence_length
+        else:
+            current_chunk.extend(sentence_tokens)
+            current_length += sentence_length
+    # Add any remaining tokens as the final chunk
+    if current_chunk:
+        chunks.append(" ".join(current_chunk))
+    return chunks
+def extract_tokens_and_labels(
+    data: List[Dict[str, Union[str, None]]]
+) -> Tuple[List[str], List[Tuple[int, int, str]]]:
+    """Extract tokens and NER labels from annotation data.
+    Each entry's text is tokenized and appended to one flat token list. An
+    entry with a truthy label yields a span covering the tokens it
+    contributed, with an inclusive end index.
+    Args:
+        data: List of entries, each with a 'token' key holding a text
+            fragment and a 'class_or_confidence' key holding its label
+            (falsy when the fragment is unlabelled)
+    Returns:
+        Tuple of (tokens, ner_spans), where every span is
+        (start_index, end_index, label) into tokens
+    """
+    tokens = []
+    ner = []
+    for entry in data:
+        fragment = entry['token']
+        label = entry['class_or_confidence']
+        # Tokenize the current text fragment
+        token_list = tokenize_text(fragment)
+        # A labelled fragment that yields no tokens (empty or whitespace-only
+        # text) would give a span whose end precedes its start, so skip it.
+        if label and token_list:
+            span_start = len(tokens)
+            ner.append((span_start, span_start + len(token_list) - 1, label))
+        # Append tokens to the main tokens list
+        tokens.extend(token_list)
+    return tokens, ner