nam pham committed
Commit a33a001 · 1 Parent(s): 9faf7cc

feat: improve ui/ux

README.md CHANGED
@@ -11,3 +11,85 @@ short_description: the ui for annotation ner for healthcare
 ---
 
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+
+ # NER Annotation Tool
+
+ A powerful tool for annotating text with named entities using GLiNER models. This tool provides both automatic annotation using pre-trained models and a manual annotation interface for reviewing and correcting the results.
+
+ ## Features
+
+ - Automatic NER annotation using GLiNER models
+ - Support for multiple pre-trained models
+ - Interactive dataset viewer and editor
+ - Export/import functionality for annotated data
+ - Integration with Hugging Face Hub for dataset sharing
+ - Support for various file formats (JSON, CoNLL, TXT)
+
+ ## Installation
+
+ 1. Clone the repository:
+ ```bash
+ git clone https://github.com/yourusername/ner-annotation.git
+ cd ner-annotation
+ ```
+
+ 2. Create and activate a virtual environment:
+ ```bash
+ python -m venv .venv
+ source .venv/bin/activate  # On Windows: .venv\Scripts\activate
+ ```
+
+ 3. Install the package:
+ ```bash
+ pip install -e .
+ ```
+
+ ## Usage
+
+ 1. Start the application:
+ ```bash
+ python -m ner_annotation.app
+ ```
+
+ 2. The application will open in your default web browser with two main tabs:
+
+ - **Auto Annotation**: Upload text files and automatically annotate them using GLiNER models
+ - **Dataset Viewer**: Review, edit, and validate annotated data
+
+ ### Auto Annotation
+
+ 1. Upload a text file (one sentence per line)
+ 2. Select a GLiNER model
+ 3. Enter the entity labels to detect (comma-separated)
+ 4. Adjust the confidence threshold
+ 5. Optionally add a prompt
+ 6. Click "Annotate Data"
+
+ ### Dataset Viewer
+
+ 1. Load a dataset (local or from Hugging Face)
+ 2. Navigate through examples using the slider or buttons
+ 3. Edit annotations as needed
+ 4. Validate examples
+ 5. Save the dataset locally or to Hugging Face Hub
+
+ ## Configuration
+
+ Create a `.env` file in the project root with your Hugging Face token:
+ ```
+ HUGGINGFACE_ACCESS_TOKEN=your_token_here
+ ```
+
+ ## Available Models
+
+ - `BookingCare/gliner-multi-healthcare`
+ - `knowledgator/gliner-multitask-large-v0.5`
+ - `knowledgator/gliner-multitask-base-v0.5`
+
+ ## Contributing
+
+ Contributions are welcome! Please feel free to submit a Pull Request.
+
+ ## License
+
+ This project is licensed under the MIT License - see the LICENSE file for details.
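
For reference, here is a minimal sketch of the record format the annotator writes to `data/annotated_data.json` and the Dataset Viewer reads back, as implied by the `transform_data()`/`DynamicDataset` code in the `app.py` diff below; the sentence, tokens, and labels are invented for illustration:

```python
# Illustrative only: the structure data/annotated_data.json is expected to hold.
import json
import os

example_records = [
    {
        # tokens produced by tokenize_text() on the input sentence
        "tokenized_text": ["Aspirin", "relieves", "mild", "headaches", "."],
        # entity spans as [start_token, end_token, label]; the end index is inclusive
        "ner": [[0, 0, "drug"], [3, 3, "symptom"]],
        # flipped to True once a human validates the example in the Dataset Viewer
        "validated": False,
    }
]

os.makedirs("data", exist_ok=True)
with open("data/annotated_data.json", "wt", encoding="utf-8") as f:
    json.dump(example_records, f, ensure_ascii=False)
```

The slider, Previous/Next buttons, and the Validate action in the viewer all operate on a list of such records.
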
app.py CHANGED
@@ -1,18 +1,19 @@
1
- import gradio as gr
2
- from huggingface_hub import HfApi, create_repo
3
  import os
4
- import re
5
  import json
6
- import torch
7
- import random
8
  from typing import List, Dict, Union, Tuple
9
- from gliner import GLiNER
10
- from datasets import load_dataset
11
- from dotenv import load_dotenv
12
 
13
- # Load environment variables from .env
14
- load_dotenv()
15
- HF_TOKEN = os.getenv("HUGGINGFACE_ACCESS_TOKEN")
16
 
17
  # Available models for annotation
18
  AVAILABLE_MODELS = [
@@ -21,132 +22,15 @@ AVAILABLE_MODELS = [
21
  "knowledgator/gliner-multitask-base-v0.5"
22
  ]
23
 
24
- # Dataset Viewer Classes and Functions
25
- class DynamicDataset:
26
- def __init__(
27
- self, data: List[Dict[str, Union[List[Union[int, str]], bool]]]
28
- ) -> None:
29
- self.data = data
30
- self.data_len = len(self.data)
31
- self.current = -1
32
- for example in self.data:
33
- if not "validated" in example.keys():
34
- example["validated"] = False
35
-
36
- def next_example(self):
37
- self.current += 1
38
- if self.current > self.data_len-1:
39
- self.current = self.data_len -1
40
- elif self.current < 0:
41
- self.current = 0
42
-
43
- def previous_example(self):
44
- self.current -= 1
45
- if self.current > self.data_len-1:
46
- self.current = self.data_len -1
47
- elif self.current < 0:
48
- self.current = 0
49
-
50
- def example_by_id(self, id):
51
- self.current = id
52
- if self.current > self.data_len-1:
53
- self.current = self.data_len -1
54
- elif self.current < 0:
55
- self.current = 0
56
-
57
- def validate(self):
58
- self.data[self.current]["validated"] = True
59
-
60
- def load_current_example(self):
61
- return self.data[self.current]
62
-
63
- def tokenize_text(text):
64
- """Tokenize the input text into a list of tokens."""
65
- return re.findall(r'\w+(?:[-_]\w+)*|\S', text)
66
-
67
- def join_tokens(tokens):
68
- # Joining tokens with space, but handling special characters correctly
69
- text = ""
70
- for token in tokens:
71
- if token in {",", ".", "!", "?", ":", ";", "..."}:
72
- text = text.rstrip() + token
73
- else:
74
- text += " " + token
75
- return text.strip()
76
-
77
- def prepare_for_highlight(data):
78
- tokens = data["tokenized_text"]
79
- ner = data["ner"]
80
-
81
- highlighted_text = []
82
- current_entity = None
83
- entity_tokens = []
84
- normal_tokens = []
85
-
86
- for idx, token in enumerate(tokens):
87
- # Check if the current token is the start of a new entity
88
- if current_entity is None or idx > current_entity[1]:
89
- if entity_tokens:
90
- highlighted_text.append((" ".join(entity_tokens), current_entity[2]))
91
- entity_tokens = []
92
- current_entity = next((entity for entity in ner if entity[0] == idx), None)
93
-
94
- # If current token is part of an entity
95
- if current_entity and current_entity[0] <= idx <= current_entity[1]:
96
- if normal_tokens:
97
- highlighted_text.append((" ".join(normal_tokens), None))
98
- normal_tokens = []
99
- entity_tokens.append(token + " ")
100
- else:
101
- if entity_tokens:
102
- highlighted_text.append((" ".join(entity_tokens), current_entity[2]))
103
- entity_tokens = []
104
- normal_tokens.append(token + " ")
105
-
106
- # Append any remaining tokens
107
- if entity_tokens:
108
- highlighted_text.append((" ".join(entity_tokens), current_entity[2]))
109
- if normal_tokens:
110
- highlighted_text.append((" ".join(normal_tokens), None))
111
- # Clean up spaces before punctuation
112
- cleaned_highlighted_text = []
113
- for text, label in highlighted_text:
114
- cleaned_text = re.sub(r'\s(?=[,\.!?…:;])', '', text)
115
- cleaned_highlighted_text.append((cleaned_text, label))
116
-
117
- return cleaned_highlighted_text
118
-
119
- def extract_tokens_and_labels(data: List[Dict[str, Union[str, None]]]) -> Dict[str, Union[List[str], List[Tuple[int, int, str]]]]:
120
- tokens = []
121
- ner = []
122
-
123
- token_start_idx = 0
124
-
125
- for entry in data:
126
- char = entry['token']
127
- label = entry['class_or_confidence']
128
-
129
- # Tokenize the current text chunk
130
- token_list = tokenize_text(char)
131
-
132
- # Append tokens to the main tokens list
133
- tokens.extend(token_list)
134
-
135
- if label:
136
- token_end_idx = token_start_idx + len(token_list) - 1
137
- ner.append((token_start_idx, token_end_idx, label))
138
-
139
- token_start_idx += len(token_list)
140
-
141
- return tokens, ner
142
-
143
- # Global variables for dataset viewer
144
  dynamic_dataset = None
 
 
145
 
146
  def load_dataset():
 
147
  global dynamic_dataset
148
  try:
149
- print('load_dataset')
150
  with open("data/annotated_data.json", 'rt') as dataset:
151
  ANNOTATED_DATA = json.load(dataset)
152
  dynamic_dataset = DynamicDataset(ANNOTATED_DATA)
@@ -156,11 +40,12 @@ def load_dataset():
156
  return [("Error loading dataset: " + str(e), None)], gr.update(value=0, maximum=1)
157
 
158
  def example_by_id(id):
 
159
  global dynamic_dataset
160
  if dynamic_dataset is None:
161
  return [("Please load a dataset first", None)], gr.update(value=0, maximum=1)
162
  try:
163
- id = int(id) # Ensure id is an integer
164
  dynamic_dataset.example_by_id(id)
165
  current = dynamic_dataset.current
166
  max_value = len(dynamic_dataset.data) - 1
@@ -169,6 +54,7 @@ def example_by_id(id):
169
  return [("Error navigating to example: " + str(e), None)], gr.update(value=0, maximum=1)
170
 
171
  def next_example():
 
172
  global dynamic_dataset
173
  if dynamic_dataset is None:
174
  return [("Please load a dataset first", None)], gr.update(value=0, maximum=1)
@@ -181,6 +67,7 @@ def next_example():
181
  return [("Error navigating to next example: " + str(e), None)], gr.update(value=0, maximum=1)
182
 
183
  def previous_example():
 
184
  global dynamic_dataset
185
  if dynamic_dataset is None:
186
  return [("Please load a dataset first", None)], gr.update(value=0, maximum=1)
@@ -193,6 +80,7 @@ def previous_example():
193
  return [("Error navigating to previous example: " + str(e), None)], gr.update(value=0, maximum=1)
194
 
195
  def update_example(data):
 
196
  global dynamic_dataset
197
  if dynamic_dataset is None:
198
  return [("Please load a dataset first", None)]
@@ -202,6 +90,7 @@ def update_example(data):
202
  return prepare_for_highlight(dynamic_dataset.load_current_example())
203
 
204
  def validate_example():
 
205
  global dynamic_dataset
206
  if dynamic_dataset is None:
207
  return [("Please load a dataset first", None)]
@@ -209,6 +98,7 @@ def validate_example():
209
  return [("The example was validated!", None)]
210
 
211
  def save_dataset(inp):
 
212
  global dynamic_dataset
213
  if dynamic_dataset is None:
214
  return [("Please load a dataset first", None)]
@@ -216,831 +106,437 @@ def save_dataset(inp):
216
  json.dump(dynamic_dataset.data, file)
217
  return [("The validated dataset was saved as data/annotated_data.json", None)]
218
 
219
- # Original annotation functions
220
- def transform_data(data):
221
- tokens = tokenize_text(data['text'])
222
- spans = []
223
-
224
- for entity in data['entities']:
225
- entity_tokens = tokenize_text(entity['word'])
226
- entity_length = len(entity_tokens)
227
-
228
- # Find the start and end indices of each entity in the tokenized text
229
- for i in range(len(tokens) - entity_length + 1):
230
- if tokens[i:i + entity_length] == entity_tokens:
231
- spans.append([i, i + entity_length - 1, entity['entity']])
232
- break
233
-
234
- return {"tokenized_text": tokens, "ner": spans, "validated": False}
235
-
236
- def merge_entities(entities):
237
- if not entities:
238
- return []
239
- merged = []
240
- current = entities[0]
241
- for next_entity in entities[1:]:
242
- if next_entity['entity'] == current['entity'] and (next_entity['start'] == current['end'] + 1 or next_entity['start'] == current['end']):
243
- current['word'] += ' ' + next_entity['word']
244
- current['end'] = next_entity['end']
245
- else:
246
- merged.append(current)
247
- current = next_entity
248
- merged.append(current)
249
- return merged
250
-
251
- def annotate_text(
252
- model, text, labels: List[str], threshold: float, nested_ner: bool
253
- ) -> Dict:
254
- labels = [label.strip() for label in labels]
255
- r = {
256
- "text": text,
257
- "entities": [
258
- {
259
- "entity": entity["label"],
260
- "word": entity["text"],
261
- "start": entity["start"],
262
- "end": entity["end"],
263
- "score": 0,
264
- }
265
- for entity in model.predict_entities(
266
- text, labels, flat_ner=not nested_ner, threshold=threshold
267
- )
268
- ],
269
- }
270
- r["entities"] = merge_entities(r["entities"])
271
- return transform_data(r)
272
-
273
- def batch_annotate_text(model: GLiNER, texts: List[str], labels: List[str], threshold: float, nested_ner: bool) -> List[Dict]:
274
- """Annotate multiple texts in batch"""
275
- labels = [label.strip() for label in labels]
276
- batch_entities = model.batch_predict_entities(texts, labels, flat_ner=not nested_ner, threshold=threshold)
277
-
278
- results = []
279
- for text, entities in zip(texts, batch_entities):
280
- r = {
281
- "text": text,
282
- "entities": [
283
- {
284
- "entity": entity["label"],
285
- "word": entity["text"],
286
- "start": entity["start"],
287
- "end": entity["end"],
288
- "score": 0,
289
- }
290
- for entity in entities
291
- ],
292
- }
293
- r["entities"] = merge_entities(r["entities"])
294
- results.append(transform_data(r))
295
- return results
296
-
297
- class AutoAnnotator:
298
- def __init__(
299
- self, model: str = "BookingCare/gliner-multi-healthcare",
300
- # device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
301
- device = torch.device('cpu')
302
- ) -> None:
303
-
304
- # Set PyTorch memory management settings
305
- if torch.cuda.is_available():
306
- torch.cuda.empty_cache()
307
- os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
308
-
309
- self.model = GLiNER.from_pretrained(model).to(device)
310
- self.annotated_data = []
311
- self.stat = {
312
- "total": None,
313
- "current": -1
314
- }
315
-
316
- def auto_annotate(
317
- self, data: List[str], labels: List[str],
318
- prompt: Union[str, List[str]] = None, threshold: float = 0.5, nested_ner: bool = False
319
- ) -> List[Dict]:
320
- self.stat["total"] = len(data)
321
- self.stat["current"] = -1 # Reset current progress
322
-
323
- # Process texts in batches
324
- processed_data = []
325
- batch_size = 8 # Reduced batch size to prevent OOM errors
326
-
327
- for i in range(0, len(data), batch_size):
328
- batch_texts = data[i:i + batch_size]
329
- batch_with_prompts = []
330
-
331
- # Add prompts to batch texts
332
- for text in batch_texts:
333
- if isinstance(prompt, list):
334
- prompt_text = random.choice(prompt)
335
- else:
336
- prompt_text = prompt
337
- text_with_prompt = f"{prompt_text}\n{text}" if prompt_text else text
338
- batch_with_prompts.append(text_with_prompt)
339
-
340
- # Process batch
341
- batch_results = batch_annotate_text(self.model, batch_with_prompts, labels, threshold, nested_ner)
342
- processed_data.extend(batch_results)
343
-
344
- # Clear CUDA cache after each batch
345
- if torch.cuda.is_available():
346
- torch.cuda.empty_cache()
347
-
348
- # Update progress
349
- self.stat["current"] = min(i + batch_size, len(data))
350
-
351
- self.annotated_data = processed_data
352
- return self.annotated_data
353
-
354
- # Global variables
355
- annotator = None
356
- sentences = []
357
-
358
- def process_text_for_gliner(text: str, max_tokens: int = 256, overlap: int = 32) -> List[str]:
359
- """
360
- Process text for GLiNER by splitting long texts into overlapping chunks.
361
- Preserves sentence boundaries and context when possible.
362
-
363
- Args:
364
- text: The input text to process
365
- max_tokens: Maximum number of tokens per chunk
366
- overlap: Number of tokens to overlap between chunks
367
-
368
- Returns:
369
- List of text chunks suitable for GLiNER
370
- """
371
- # First split into sentences to preserve natural boundaries
372
- sentences = re.split(r'(?<=[.!?])\s+', text)
373
- chunks = []
374
- current_chunk = []
375
- current_length = 0
376
-
377
- for sentence in sentences:
378
- # Tokenize the sentence
379
- sentence_tokens = tokenize_text(sentence)
380
- sentence_length = len(sentence_tokens)
381
-
382
- # If a single sentence is too long, split it
383
- if sentence_length > max_tokens:
384
- # If we have accumulated tokens, add them as a chunk
385
- if current_chunk:
386
- chunks.append(" ".join(current_chunk))
387
- current_chunk = []
388
- current_length = 0
389
-
390
- # Split the long sentence into smaller chunks
391
- start = 0
392
- while start < sentence_length:
393
- end = min(start + max_tokens, sentence_length)
394
- chunk_tokens = sentence_tokens[start:end]
395
- chunks.append(" ".join(chunk_tokens))
396
- start = end - overlap if end < sentence_length else end
397
-
398
- # If adding this sentence would exceed max_tokens, start a new chunk
399
- elif current_length + sentence_length > max_tokens:
400
- chunks.append(" ".join(current_chunk))
401
- current_chunk = sentence_tokens
402
- current_length = sentence_length
403
- else:
404
- current_chunk.extend(sentence_tokens)
405
- current_length += sentence_length
406
-
407
- # Add any remaining tokens as the final chunk
408
- if current_chunk:
409
- chunks.append(" ".join(current_chunk))
410
-
411
- return chunks
412
-
413
- def process_uploaded_file(file_obj):
414
- if file_obj is None:
415
- return "Please upload a file first!"
416
-
417
- try:
418
- # Read the uploaded file
419
- global sentences
420
- if file_obj.name.endswith('.csv'):
421
- import pandas as pd
422
- df = pd.read_csv(file_obj.name)
423
- sentences = df['Nội dung'].dropna().tolist()
424
- # Process each sentence and flatten the list
425
- processed_sentences = []
426
- for sentence in sentences:
427
- processed_sentences.extend(process_text_for_gliner(sentence))
428
- sentences = processed_sentences
429
- else:
430
- # Read the file content directly from the file object
431
- content = file_obj.read().decode('utf-8')
432
- raw_sentences = [line.strip() for line in content.splitlines() if line.strip()]
433
- # Process each sentence and flatten the list
434
- processed_sentences = []
435
- for sentence in raw_sentences:
436
- processed_sentences.extend(process_text_for_gliner(sentence))
437
- sentences = processed_sentences
438
- return f"Successfully loaded {len(sentences)} sentences from file!"
439
- except Exception as e:
440
- return f"Error reading file: {str(e)}"
441
-
442
- def is_valid_repo_name(repo_name):
443
- # Hugging Face repo names must not contain slashes or spaces
444
- return bool(re.match(r'^[A-Za-z0-9_./-]+$', repo_name))
445
-
446
- def create_hf_repo(repo_name: str, repo_type: str = "dataset", private: bool = False):
447
- """Create a new repository on Hugging Face Hub"""
448
- if not is_valid_repo_name(repo_name):
449
- raise Exception("Invalid repo name: must not contain slashes, spaces, or special characters except '-', '_', '.'")
450
- try:
451
- api = HfApi(token=HF_TOKEN)
452
- # user = api.whoami()['name']
453
- # repo_id = f"{user}/{repo_name}"
454
- create_repo(
455
- repo_id=repo_name,
456
- repo_type=repo_type,
457
- private=private,
458
- exist_ok=True,
459
- token=HF_TOKEN
460
- )
461
- return repo_name
462
- except Exception as e:
463
- raise Exception(f"Error creating repository: {str(e)}")
464
-
465
  def annotate(model, labels, threshold, prompt, save_to_hub, repo_name, repo_type, is_private):
466
- global annotator
 
467
  try:
468
  if not sentences:
469
  return "Please upload a file with text first!"
470
  if save_to_hub and not is_valid_repo_name(repo_name):
471
  return "Error: Invalid repo name. Only use letters, numbers, '-', '_', or '.' (no slashes or spaces)."
 
472
  labels = [label.strip() for label in labels.split(",")]
473
  annotator = AutoAnnotator(model)
474
  annotated_data = annotator.auto_annotate(sentences, labels, prompt, threshold)
 
475
  # Save annotated data locally
476
  os.makedirs("data", exist_ok=True)
477
  local_path = "data/annotated_data.json"
478
  with open(local_path, "wt") as file:
479
  json.dump(annotated_data, file, ensure_ascii=False)
 
480
  status_messages = [f"Successfully annotated and saved locally to {local_path}"]
 
481
  # Upload to Hugging Face Hub if requested
482
  if save_to_hub:
483
  try:
484
- repo_id = create_hf_repo(repo_name, repo_type, is_private)
485
- api = HfApi(token=HF_TOKEN)
486
- api.upload_file(
487
- path_or_fileobj=local_path,
488
- path_in_repo="annotated_data.json",
489
- repo_id=repo_id,
490
- repo_type=repo_type,
491
- token=HF_TOKEN
492
- )
493
  status_messages.append(f"Successfully uploaded to Hugging Face Hub repository: {repo_id}")
494
  except Exception as e:
495
  status_messages.append(f"Error with Hugging Face Hub: {str(e)}")
 
496
  return "\n".join(status_messages)
497
  except Exception as e:
498
  return f"Error during annotation: {str(e)}"
499
 
500
- def convert_hf_dataset_to_ner_format(dataset):
501
- """Convert Hugging Face dataset to NER format"""
502
- converted_data = []
503
- for item in dataset:
504
- # Assuming the dataset has 'tokens' and 'ner_tags' fields
505
- # Adjust the field names based on your dataset structure
506
- if 'tokens' in item and 'ner_tags' in item:
507
- ner_spans = []
508
- current_span = None
509
-
510
- for i, (token, tag) in enumerate(zip(item['tokens'], item['ner_tags'])):
511
- if tag != 'O': # Not Outside
512
- if current_span is None:
513
- current_span = [i, i, tag]
514
- elif tag == current_span[2]:
515
- current_span[1] = i
516
- else:
517
- ner_spans.append(current_span)
518
- current_span = [i, i, tag]
519
- elif current_span is not None:
520
- ner_spans.append(current_span)
521
- current_span = None
522
-
523
- if current_span is not None:
524
- ner_spans.append(current_span)
525
-
526
- converted_data.append({
527
- "tokenized_text": item['tokens'],
528
- "ner": ner_spans,
529
- "validated": False
530
- })
531
-
532
- return converted_data
533
-
534
- def load_from_huggingface(dataset_name: str):
535
- """Load dataset from Hugging Face Hub"""
536
  try:
537
- # Download the JSON file from Hugging Face
538
- import requests
539
- import json
540
-
541
- # Construct the raw URL for the JSON file
542
- raw_url = f"https://huggingface.co/datasets/{dataset_name}/raw/main/annotated_data.json"
543
 
544
- # Download the file
545
- response = requests.get(raw_url)
546
- if response.status_code == 200:
547
- print('response status', response.status_code)
548
- print('response', response.text)
549
- dataset = json.loads(response.text)
550
- converted_data = dataset # Data is already in the correct format
551
- else:
552
- raise Exception(f"Failed to download dataset: {response.status_code}")
553
 
554
- # Save the converted data
555
- os.makedirs("data", exist_ok=True)
556
- with open("data/annotated_data.json", "wt") as file:
557
- json.dump(converted_data, file, ensure_ascii=False)
558
-
559
- return f"Successfully loaded and converted dataset: {dataset_name}"
560
  except Exception as e:
561
- error_msg = f"Error loading dataset: {str(e)}"
562
- return error_msg
563
 
564
- def load_from_local_file(file_path: str, file_format: str = "json"):
565
- """Load and convert data from local file in various formats"""
566
- try:
567
- if file_format == "json":
568
- with open(file_path, 'r', encoding='utf-8') as f:
569
- data = json.load(f)
570
- if isinstance(data, list):
571
- # If data is already in the correct format
572
- if all("tokenized_text" in item and "ner" in item for item in data):
573
- return data
574
- # Convert from other JSON formats
575
- converted_data = []
576
- for item in data:
577
- if "tokens" in item and "ner_tags" in item:
578
- ner_spans = []
579
- current_span = None
580
- for i, (token, tag) in enumerate(zip(item["tokens"], item["ner_tags"])):
581
- if tag != "O":
582
- if current_span is None:
583
- current_span = [i, i, tag]
584
- elif tag == current_span[2]:
585
- current_span[1] = i
586
- else:
587
- ner_spans.append(current_span)
588
- current_span = [i, i, tag]
589
- elif current_span is not None:
590
- ner_spans.append(current_span)
591
- current_span = None
592
- if current_span is not None:
593
- ner_spans.append(current_span)
594
- converted_data.append({
595
- "tokenized_text": item["tokens"],
596
- "ner": ner_spans,
597
- "validated": False
598
- })
599
- return converted_data
600
- else:
601
- raise ValueError("JSON file must contain a list of examples")
602
-
603
- elif file_format == "conll":
604
- converted_data = []
605
- current_example = {"tokens": [], "ner_tags": []}
606
-
607
- with open(file_path, 'r', encoding='utf-8') as f:
608
- for line in f:
609
- line = line.strip()
610
- if line:
611
- if line.startswith("#"):
612
- continue
613
- parts = line.split()
614
- if len(parts) >= 2:
615
- token, tag = parts[0], parts[-1]
616
- current_example["tokens"].append(token)
617
- current_example["ner_tags"].append(tag)
618
- elif current_example["tokens"]:
619
- # Convert current example
620
- ner_spans = []
621
- current_span = None
622
- for i, (token, tag) in enumerate(zip(current_example["tokens"], current_example["ner_tags"])):
623
- if tag != "O":
624
- if current_span is None:
625
- current_span = [i, i, tag]
626
- elif tag == current_span[2]:
627
- current_span[1] = i
628
- else:
629
- ner_spans.append(current_span)
630
- current_span = [i, i, tag]
631
- elif current_span is not None:
632
- ner_spans.append(current_span)
633
- current_span = None
634
- if current_span is not None:
635
- ner_spans.append(current_span)
636
-
637
- converted_data.append({
638
- "tokenized_text": current_example["tokens"],
639
- "ner": ner_spans,
640
- "validated": False
641
- })
642
- current_example = {"tokens": [], "ner_tags": []}
643
-
644
- # Handle last example if exists
645
- if current_example["tokens"]:
646
- ner_spans = []
647
- current_span = None
648
- for i, (token, tag) in enumerate(zip(current_example["tokens"], current_example["ner_tags"])):
649
- if tag != "O":
650
- if current_span is None:
651
- current_span = [i, i, tag]
652
- elif tag == current_span[2]:
653
- current_span[1] = i
654
- else:
655
- ner_spans.append(current_span)
656
- current_span = [i, i, tag]
657
- elif current_span is not None:
658
- ner_spans.append(current_span)
659
- current_span = None
660
- if current_span is not None:
661
- ner_spans.append(current_span)
662
-
663
- converted_data.append({
664
- "tokenized_text": current_example["tokens"],
665
- "ner": ner_spans,
666
- "validated": False
667
- })
668
-
669
- return converted_data
670
-
671
- elif file_format == "txt":
672
- # Simple text file with one sentence per line
673
- converted_data = []
674
- with open(file_path, 'r', encoding='utf-8') as f:
675
- for line in f:
676
- line = line.strip()
677
- if line:
678
- tokens = tokenize_text(line)
679
- converted_data.append({
680
- "tokenized_text": tokens,
681
- "ner": [],
682
- "validated": False
683
- })
684
- return converted_data
685
-
686
- else:
687
- raise ValueError(f"Unsupported file format: {file_format}")
688
-
689
- except Exception as e:
690
- raise Exception(f"Error loading file: {str(e)}")
691
-
692
- def process_local_file(file_obj, file_format):
693
- """Process uploaded local file"""
694
- if file_obj is None:
695
- return "Please upload a file first!"
696
-
697
  try:
698
- # Load and convert the data
699
- data = load_from_local_file(file_obj.name, file_format)
700
 
701
- # Save the converted data
702
  os.makedirs("data", exist_ok=True)
703
- with open("data/annotated_data.json", "wt") as file:
704
- json.dump(data, file, ensure_ascii=False)
 
705
 
706
- return f"Successfully loaded and converted {len(data)} examples from {file_format} file!"
 
 
707
  except Exception as e:
708
- return f"Error processing file: {str(e)}"
709
 
710
- # Add a function to download the annotated data
711
-
712
- def download_annotated_data():
713
- file_path = "data/annotated_data.json"
714
- if os.path.exists(file_path):
715
- return file_path
716
- else:
717
- return None
718
-
719
- def download_to_folder():
720
- """Download annotated data to a local folder"""
721
- try:
722
- source_path = "data/annotated_data.json"
723
- if not os.path.exists(source_path):
724
- return "No annotated data found!"
725
-
726
- # Create downloads directory if it doesn't exist
727
- download_dir = os.path.expanduser("~/Downloads")
728
- os.makedirs(download_dir, exist_ok=True)
729
 
730
- # Copy file to downloads folder
731
- import shutil
732
- dest_path = os.path.join(download_dir, "annotated_data.json")
733
- shutil.copy2(source_path, dest_path)
734
- return f"Successfully downloaded to {dest_path}"
735
- except Exception as e:
736
- return f"Error downloading file: {str(e)}"
737
 
738
- def update_hf_dataset(repo_name: str, repo_type: str = "dataset", is_private: bool = False):
739
- """Update or create a Hugging Face dataset with the current annotated data"""
740
  try:
741
- if not dynamic_dataset or not dynamic_dataset.data:
742
743
 
744
- # Save current data to local file
745
  os.makedirs("data", exist_ok=True)
746
- local_path = "data/annotated_data.json"
747
- with open(local_path, "wt") as file:
748
- json.dump(dynamic_dataset.data, file, ensure_ascii=False)
749
-
750
- # Create or update repository
751
- try:
752
- repo_id = create_hf_repo(repo_name, repo_type, is_private)
753
- api = HfApi(token=HF_TOKEN)
754
- api.upload_file(
755
- path_or_fileobj=local_path,
756
- path_in_repo="annotated_data.json",
757
- repo_id=repo_id,
758
- repo_type=repo_type,
759
- token=HF_TOKEN
760
- )
761
- return f"Successfully uploaded to Hugging Face Hub repository: {repo_id}"
762
- except Exception as e:
763
- if "already exists" in str(e):
764
- # If repo exists, just update the file
765
- user = api.whoami()['name']
766
- repo_id = f"{user}/{repo_name}"
767
- api.upload_file(
768
- path_or_fileobj=local_path,
769
- path_in_repo="annotated_data.json",
770
- repo_id=repo_id,
771
- repo_type=repo_type,
772
- token=HF_TOKEN
773
- )
774
- return f"Successfully updated existing repository: {repo_id}"
775
- else:
776
- raise e
777
  except Exception as e:
778
- return f"Error updating Hugging Face dataset: {str(e)}"
779
 
780
- # Create the main interface with tabs
781
- with gr.Blocks() as demo:
782
- gr.Markdown("# NER Annotation Tool")
783
-
784
- with gr.Tabs():
785
- with gr.TabItem("Auto Annotation"):
786
- with gr.Row():
787
- with gr.Column():
788
- file_uploader = gr.File(label="Upload text file (one sentence per line)")
789
- upload_status = gr.Textbox(label="Upload Status")
790
- file_uploader.change(fn=process_uploaded_file, inputs=[file_uploader], outputs=[upload_status])
791
-
792
- with gr.Column():
793
- model = gr.Dropdown(
794
- label="Choose the model for annotation",
795
- choices=AVAILABLE_MODELS,
796
- value=AVAILABLE_MODELS[0]
797
- )
798
- labels = gr.Textbox(
799
- label="Labels",
800
- placeholder="Enter comma-separated labels (e.g., PERSON,ORG,LOC)",
801
- scale=2
802
- )
803
- threshold = gr.Slider(
804
- 0, 1,
805
- value=0.3,
806
- step=0.01,
807
- label="Threshold",
808
- info="Lower threshold increases entity predictions"
809
- )
810
- prompt = gr.Textbox(
811
- label="Prompt",
812
- placeholder="Enter your annotation prompt (optional)",
813
- scale=2
814
- )
815
 
816
- with gr.Group():
817
- gr.Markdown("### Save Options")
818
- save_to_hub = gr.Checkbox(
819
- label="Save to Hugging Face Hub",
820
- value=False
 
821
  )
822
 
823
- with gr.Group(visible=False) as hub_settings:
824
- gr.Markdown("#### Hugging Face Hub Settings")
825
- repo_name = gr.Textbox(
826
- label="Repository Name",
827
- placeholder="Enter repository name (e.g., my-ner-dataset)",
828
- scale=2
829
- )
830
- repo_type = gr.Dropdown(
831
- choices=["dataset", "model", "space"],
832
- value="dataset",
833
- label="Repository Type"
834
- )
835
- is_private = gr.Checkbox(
836
- label="Private Repository",
837
  value=False
838
  )
839
-
840
- annotate_btn = gr.Button("Annotate Data")
841
- output_info = gr.Textbox(label="Processing Status")
842
-
843
- # Add download buttons for annotated data
844
- with gr.Row():
845
- download_btn_annot = gr.Button("Download Annotated Data", visible=False)
846
- download_file_annot = gr.File(label="Download", interactive=False, visible=False)
847
- download_status = gr.Textbox(label="Download Status", visible=False)
848
-
849
- def toggle_hub_settings(save_to_hub):
850
- return {
851
- hub_settings: gr.update(visible=save_to_hub)
852
- }
853
-
854
- save_to_hub.change(
855
- fn=toggle_hub_settings,
856
- inputs=[save_to_hub],
857
- outputs=[hub_settings]
858
- )
859
-
860
- def show_download_buttons(status):
861
- # Show download buttons only if annotation was successful
862
- if status and status.startswith("Successfully annotated and saved locally"):
863
- return gr.update(visible=True), gr.update(visible=True)
864
- return gr.update(visible=False), gr.update(visible=False)
865
-
866
- annotate_btn.click(
867
- fn=annotate,
868
- inputs=[
869
- model, labels, threshold, prompt,
870
- save_to_hub, repo_name, repo_type, is_private
871
- ],
872
- outputs=[output_info]
873
- )
874
- output_info.change(
875
- fn=show_download_buttons,
876
- inputs=[output_info],
877
- outputs=[download_btn_annot, download_status]
878
- )
879
- def handle_download_annot():
880
- file_path = download_annotated_data()
881
- if file_path:
882
- return gr.update(value=file_path, visible=True)
883
- else:
884
- return gr.update(visible=False)
885
- download_btn_annot.click(fn=handle_download_annot, inputs=None, outputs=[download_file_annot])
886
-
887
- with gr.TabItem("Dataset Viewer"):
888
- with gr.Row():
889
- with gr.Column():
890
- with gr.Row():
891
- load_local_btn = gr.Button("Load Local Dataset")
892
- load_hf_btn = gr.Button("Load from Hugging Face")
893
-
894
- local_file = gr.File(label="Upload Local Dataset", visible=False)
895
- file_format = gr.Dropdown(
896
- choices=["json", "conll", "txt"],
897
- value="json",
898
- label="File Format",
899
- visible=False
900
- )
901
- local_status = gr.Textbox(label="Local File Status", visible=False)
902
-
903
- with gr.Group(visible=False) as hf_inputs:
904
  with gr.Row():
905
- dataset_name = gr.Textbox(
906
- label="Hugging Face Dataset Name",
907
- placeholder="Enter dataset name (e.g., conll2003)",
908
- scale=3
909
  )
910
- load_dataset_btn = gr.Button("Load Dataset", scale=1)
911
-
912
- bar = gr.Slider(
913
- minimum=0,
914
- maximum=1,
915
- step=1,
916
- label="Progress",
917
- interactive=True,
918
- info="Use slider to navigate through examples"
919
- )
920
-
921
- with gr.Row():
922
- previous_btn = gr.Button("Previous example")
923
- apply_btn = gr.Button("Apply changes")
924
- next_btn = gr.Button("Next example")
925
-
926
- validate_btn = gr.Button("Validate")
927
- save_btn = gr.Button("Save validated dataset")
928
-
929
- # Add Hugging Face upload section
930
- with gr.Group(visible=False) as hf_upload_group:
931
- gr.Markdown("### Upload to Hugging Face")
932
- hf_repo_name = gr.Textbox(
933
- label="Repository Name",
934
- placeholder="Enter repository name (e.g., my-ner-dataset)",
935
- scale=2
936
  )
937
- hf_repo_type = gr.Dropdown(
938
- choices=["dataset", "model", "space"],
939
- value="dataset",
940
- label="Repository Type"
 
941
  )
942
- hf_is_private = gr.Checkbox(
943
- label="Private Repository",
944
- value=False
945
  )
946
- upload_to_hf_btn = gr.Button("Upload to Hugging Face")
947
- hf_upload_status = gr.Textbox(label="Upload Status")
948
-
949
- with gr.Row():
950
- show_hf_upload_btn = gr.Button("Show Upload Options")
951
- hide_hf_upload_btn = gr.Button("Hide Upload Options", visible=False)
952
-
953
- def toggle_hf_upload(show: bool):
954
- return {
955
- hf_upload_group: gr.update(visible=show),
956
- show_hf_upload_btn: gr.update(visible=not show),
957
- hide_hf_upload_btn: gr.update(visible=show)
958
- }
959
-
960
- show_hf_upload_btn.click(
961
- fn=lambda: toggle_hf_upload(True),
962
- inputs=None,
963
- outputs=[hf_upload_group, show_hf_upload_btn, hide_hf_upload_btn]
964
- )
965
-
966
- hide_hf_upload_btn.click(
967
- fn=lambda: toggle_hf_upload(False),
968
- inputs=None,
969
- outputs=[hf_upload_group, show_hf_upload_btn, hide_hf_upload_btn]
970
- )
971
-
972
- inp_box = gr.HighlightedText(value=None, interactive=True)
973
-
974
- def toggle_local_inputs():
975
- return {
976
- local_file: gr.update(visible=True),
977
- file_format: gr.update(visible=True),
978
- local_status: gr.update(visible=True),
979
- hf_inputs: gr.update(visible=False)
980
- }
981
-
982
- def toggle_hf_inputs():
983
- return {
984
- local_file: gr.update(visible=False),
985
- file_format: gr.update(visible=False),
986
- local_status: gr.update(visible=False),
987
- hf_inputs: gr.update(visible=True)
988
- }
989
-
990
- load_local_btn.click(
991
- fn=toggle_local_inputs,
992
- inputs=None,
993
- outputs=[local_file, file_format, local_status, hf_inputs]
994
- )
995
-
996
- load_hf_btn.click(
997
- fn=toggle_hf_inputs,
998
- inputs=None,
999
- outputs=[local_file, file_format, local_status, hf_inputs]
1000
- )
1001
-
1002
- def process_and_load_local(file_obj, format):
1003
- status = process_local_file(file_obj, format)
1004
- if "Successfully" in status:
1005
- return load_dataset()
1006
- return [status], 0, 0
1007
-
1008
- local_file.change(
1009
- fn=process_and_load_local,
1010
- inputs=[local_file, file_format],
1011
- outputs=[inp_box, bar]
1012
- )
1013
-
1014
- def load_hf_dataset(name):
1015
- status = load_from_huggingface(name)
1016
- print('status', status)
1017
- if "Successfully" in status:
1018
- return load_dataset()
1019
- return [("Error loading dataset: " + status, None)], gr.update(value=0, maximum=1)
1020
-
1021
- load_dataset_btn.click(
1022
- fn=load_hf_dataset,
1023
- inputs=[dataset_name],
1024
- outputs=[inp_box, bar]
1025
- )
1026
-
1027
- apply_btn.click(fn=update_example, inputs=inp_box, outputs=inp_box)
1028
- save_btn.click(fn=save_dataset, inputs=inp_box, outputs=inp_box)
1029
- validate_btn.click(fn=validate_example, inputs=None, outputs=inp_box)
1030
- next_btn.click(fn=next_example, inputs=None, outputs=[inp_box, bar])
1031
- previous_btn.click(fn=previous_example, inputs=None, outputs=[inp_box, bar])
1032
- bar.change(
1033
- fn=example_by_id,
1034
- inputs=[bar],
1035
- outputs=[inp_box, bar],
1036
- api_name="example_by_id"
1037
- )
1038
-
1039
- # Add Hugging Face upload functionality
1040
- upload_to_hf_btn.click(
1041
- fn=update_hf_dataset,
1042
- inputs=[hf_repo_name, hf_repo_type, hf_is_private],
1043
- outputs=[hf_upload_status]
1044
- )
1045
 
1046
- demo.launch()
 
 
1
+ """Main application module for NER annotation tool."""
2
+
3
  import os
 
4
  import json
5
+ import gradio as gr
 
6
  from typing import List, Dict, Union, Tuple
 
 
 
7
 
8
+ from src.ner_annotation.core.dataset import DynamicDataset, prepare_for_highlight
9
+ from src.ner_annotation.core.annotator import AutoAnnotator
10
+ from src.ner_annotation.utils.text_processing import extract_tokens_and_labels
11
+ from src.ner_annotation.utils.file_processing import process_uploaded_file, load_from_local_file
12
+ from src.ner_annotation.utils.huggingface import (
13
+ is_valid_repo_name,
14
+ upload_to_hf,
15
+ download_from_hf
16
+ )
17
 
18
  # Available models for annotation
19
  AVAILABLE_MODELS = [
 
22
  "knowledgator/gliner-multitask-base-v0.5"
23
  ]
24
 
25
+ # Global variables
26
  dynamic_dataset = None
27
+ annotator = None
28
+ sentences = []
29
 
30
  def load_dataset():
31
+ """Load the dataset and return the first example."""
32
  global dynamic_dataset
33
  try:
 
34
  with open("data/annotated_data.json", 'rt') as dataset:
35
  ANNOTATED_DATA = json.load(dataset)
36
  dynamic_dataset = DynamicDataset(ANNOTATED_DATA)
 
40
  return [("Error loading dataset: " + str(e), None)], gr.update(value=0, maximum=1)
41
 
42
  def example_by_id(id):
43
+ """Navigate to a specific example by ID."""
44
  global dynamic_dataset
45
  if dynamic_dataset is None:
46
  return [("Please load a dataset first", None)], gr.update(value=0, maximum=1)
47
  try:
48
+ id = int(id)
49
  dynamic_dataset.example_by_id(id)
50
  current = dynamic_dataset.current
51
  max_value = len(dynamic_dataset.data) - 1
 
54
  return [("Error navigating to example: " + str(e), None)], gr.update(value=0, maximum=1)
55
 
56
  def next_example():
57
+ """Move to the next example."""
58
  global dynamic_dataset
59
  if dynamic_dataset is None:
60
  return [("Please load a dataset first", None)], gr.update(value=0, maximum=1)
 
67
  return [("Error navigating to next example: " + str(e), None)], gr.update(value=0, maximum=1)
68
 
69
  def previous_example():
70
+ """Move to the previous example."""
71
  global dynamic_dataset
72
  if dynamic_dataset is None:
73
  return [("Please load a dataset first", None)], gr.update(value=0, maximum=1)
 
80
  return [("Error navigating to previous example: " + str(e), None)], gr.update(value=0, maximum=1)
81
 
82
  def update_example(data):
83
+ """Update the current example with new annotations."""
84
  global dynamic_dataset
85
  if dynamic_dataset is None:
86
  return [("Please load a dataset first", None)]
 
90
  return prepare_for_highlight(dynamic_dataset.load_current_example())
91
 
92
  def validate_example():
93
+ """Mark the current example as validated."""
94
  global dynamic_dataset
95
  if dynamic_dataset is None:
96
  return [("Please load a dataset first", None)]
 
98
  return [("The example was validated!", None)]
99
 
100
  def save_dataset(inp):
101
+ """Save the dataset to a file."""
102
  global dynamic_dataset
103
  if dynamic_dataset is None:
104
  return [("Please load a dataset first", None)]
 
106
  json.dump(dynamic_dataset.data, file)
107
  return [("The validated dataset was saved as data/annotated_data.json", None)]
108
 
109
  def annotate(model, labels, threshold, prompt, save_to_hub, repo_name, repo_type, is_private):
110
+ """Annotate the uploaded text using the selected model."""
111
+ global annotator, sentences
112
  try:
113
  if not sentences:
114
  return "Please upload a file with text first!"
115
  if save_to_hub and not is_valid_repo_name(repo_name):
116
  return "Error: Invalid repo name. Only use letters, numbers, '-', '_', or '.' (no slashes or spaces)."
117
+
118
  labels = [label.strip() for label in labels.split(",")]
119
  annotator = AutoAnnotator(model)
120
  annotated_data = annotator.auto_annotate(sentences, labels, prompt, threshold)
121
+
122
  # Save annotated data locally
123
  os.makedirs("data", exist_ok=True)
124
  local_path = "data/annotated_data.json"
125
  with open(local_path, "wt") as file:
126
  json.dump(annotated_data, file, ensure_ascii=False)
127
+
128
  status_messages = [f"Successfully annotated and saved locally to {local_path}"]
129
+
130
  # Upload to Hugging Face Hub if requested
131
  if save_to_hub:
132
  try:
133
+ repo_id = upload_to_hf(local_path, repo_name, repo_type, is_private)
134
  status_messages.append(f"Successfully uploaded to Hugging Face Hub repository: {repo_id}")
135
  except Exception as e:
136
  status_messages.append(f"Error with Hugging Face Hub: {str(e)}")
137
+
138
  return "\n".join(status_messages)
139
  except Exception as e:
140
  return f"Error during annotation: {str(e)}"
141
 
142
+ def load_from_huggingface(name):
143
+ """Load a dataset from Hugging Face Hub."""
144
+ global dynamic_dataset
145
  try:
146
+ # Download dataset from Hugging Face Hub
147
+ local_path = download_from_hf(name, "annotated_data.json")
148
 
149
+ # Load the downloaded dataset
150
+ with open(local_path, 'rt') as dataset:
151
+ data = json.load(dataset)
152
 
153
+ # Initialize the dataset
154
+ dynamic_dataset = DynamicDataset(data)
155
+ return "Successfully loaded dataset from Hugging Face Hub"
156
  except Exception as e:
157
+ return f"Error loading dataset from Hugging Face Hub: {str(e)}"
 
158
 
159
+ def update_hf_dataset(repo_name, repo_type, is_private):
160
+ """Upload the current dataset to Hugging Face Hub."""
161
+ global dynamic_dataset
162
+ if dynamic_dataset is None:
163
+ return "Please load a dataset first"
164
  try:
165
+ if not is_valid_repo_name(repo_name):
166
+ return "Error: Invalid repo name. Only use letters, numbers, '-', '_', or '.' (no slashes or spaces)."
167
 
168
+ # Save dataset locally first
169
  os.makedirs("data", exist_ok=True)
170
+ local_path = "data/annotated_data.json"
171
+ with open(local_path, "wt") as file:
172
+ json.dump(dynamic_dataset.data, file, ensure_ascii=False)
173
 
174
+ # Upload to Hugging Face Hub
175
+ repo_id = upload_to_hf(local_path, repo_name, repo_type, is_private)
176
+ return f"Successfully uploaded to Hugging Face Hub repository: {repo_id}"
177
  except Exception as e:
178
+ return f"Error uploading to Hugging Face Hub: {str(e)}"
179
 
180
+ def process_conll(content):
181
+ """Convert CoNLL format to JSON."""
182
+ sentences = []
183
+ current_sentence = {"text": "", "tokenized_text": [], "ner": []}
184
+
185
+ for line in content.split('\n'):
186
+ if not line.strip():
187
+ if current_sentence["text"]:
188
+ sentences.append(current_sentence)
189
+ current_sentence = {"text": "", "tokenized_text": [], "ner": []}
190
+ continue
191
 
192
+ parts = line.split()
193
+ if len(parts) >= 2:
194
+ token, label = parts[0], parts[-1]
195
+ current_sentence["tokenized_text"].append(token)
196
+ current_sentence["ner"].append(label)
197
+ current_sentence["text"] += token + " "
198
+
199
+ if current_sentence["text"]:
200
+ sentences.append(current_sentence)
201
+
202
+ return sentences
203
+
204
+ def process_txt(content):
205
+ """Convert plain text to JSON format."""
206
+ sentences = []
207
+ for line in content.split('\n'):
208
+ if line.strip():
209
+ sentences.append({
210
+ "text": line.strip(),
211
+ "tokenized_text": line.strip().split(),
212
+ "ner": ["O"] * len(line.strip().split())
213
+ })
214
+ return sentences
215
 
216
+ def process_local_file(file_obj, format):
217
+ """Process a local file and save it as JSON."""
218
  try:
219
+ if file_obj is None:
220
+ return "No file uploaded"
221
+
222
+ # Get the file content from the Gradio file object
223
+ content = file_obj.name
224
+ with open(content, 'r', encoding='utf-8') as f:
225
+ content = f.read()
226
+
227
+ if format == "json":
228
+ data = json.loads(content)
229
+ elif format == "conll":
230
+ data = process_conll(content)
231
+ elif format == "txt":
232
+ data = process_txt(content)
233
+ else:
234
+ return "Unsupported file format"
235
 
 
236
  os.makedirs("data", exist_ok=True)
237
+ with open("data/annotated_data.json", "wt") as f:
238
+ json.dump(data, f, ensure_ascii=False)
239
+ return "Successfully processed and saved file"
 
240
  except Exception as e:
241
+ return f"Error processing file: {str(e)}"
242
 
243
+ def create_interface():
244
+ """Create and return the Gradio interface."""
245
+ with gr.Blocks() as demo:
246
+ gr.Markdown("# NER Annotation Tool")
247
+
248
+ with gr.Tabs():
249
+ with gr.TabItem("Auto Annotation"):
250
+ with gr.Row():
251
+ with gr.Column():
252
+ file_uploader = gr.File(label="Upload text file (one sentence per line)")
253
+ upload_status = gr.Textbox(label="Upload Status")
254
+ file_uploader.change(fn=process_uploaded_file, inputs=[file_uploader], outputs=[upload_status])
255
 
256
+ with gr.Column():
257
+ model = gr.Dropdown(
258
+ label="Choose the model for annotation",
259
+ choices=AVAILABLE_MODELS,
260
+ value=AVAILABLE_MODELS[0]
261
+ )
262
+ labels = gr.Textbox(
263
+ label="Labels",
264
+ placeholder="Enter comma-separated labels (e.g., PERSON,ORG,LOC)",
265
+ scale=2
266
+ )
267
+ threshold = gr.Slider(
268
+ 0, 1,
269
+ value=0.3,
270
+ step=0.01,
271
+ label="Threshold",
272
+ info="Lower threshold increases entity predictions"
273
+ )
274
+ prompt = gr.Textbox(
275
+ label="Prompt",
276
+ placeholder="Enter your annotation prompt (optional)",
277
+ scale=2
278
  )
279
 
280
+ with gr.Group():
281
+ gr.Markdown("### Save Options")
282
+ save_to_hub = gr.Checkbox(
283
+ label="Save to Hugging Face Hub",
284
  value=False
285
  )
286
+
287
+ with gr.Group(visible=False) as hub_settings:
288
+ gr.Markdown("#### Hugging Face Hub Settings")
289
+ repo_name = gr.Textbox(
290
+ label="Repository Name",
291
+ placeholder="Enter repository name (e.g., my-ner-dataset)",
292
+ scale=2
293
+ )
294
+ repo_type = gr.Dropdown(
295
+ choices=["dataset", "model", "space"],
296
+ value="dataset",
297
+ label="Repository Type"
298
+ )
299
+ is_private = gr.Checkbox(
300
+ label="Private Repository",
301
+ value=False
302
+ )
303
+
304
+ annotate_btn = gr.Button("Annotate Data")
305
+ output_info = gr.Textbox(label="Processing Status")
306
+
307
+ # Add download buttons for annotated data
308
  with gr.Row():
309
+ download_btn_annot = gr.Button("Download Annotated Data", visible=False)
310
+ download_file_annot = gr.File(label="Download", interactive=False, visible=False)
311
+ download_status = gr.Textbox(label="Download Status", visible=False)
312
+
313
+ def toggle_hub_settings(save_to_hub):
314
+ return {
315
+ hub_settings: gr.update(visible=save_to_hub)
316
+ }
317
+
318
+ save_to_hub.change(
319
+ fn=toggle_hub_settings,
320
+ inputs=[save_to_hub],
321
+ outputs=[hub_settings]
322
+ )
323
+
324
+ def show_download_buttons(status):
325
+ if status and status.startswith("Successfully annotated and saved locally"):
326
+ return gr.update(visible=True), gr.update(visible=True)
327
+ return gr.update(visible=False), gr.update(visible=False)
328
+
329
+ annotate_btn.click(
330
+ fn=annotate,
331
+ inputs=[
332
+ model, labels, threshold, prompt,
333
+ save_to_hub, repo_name, repo_type, is_private
334
+ ],
335
+ outputs=[output_info]
336
+ )
337
+ output_info.change(
338
+ fn=show_download_buttons,
339
+ inputs=[output_info],
340
+ outputs=[download_btn_annot, download_status]
341
+ )
342
+
343
+ def handle_download_annot():
344
+ file_path = "data/annotated_data.json"
345
+ if os.path.exists(file_path):
346
+ return gr.update(value=file_path, visible=True)
347
+ return gr.update(visible=False)
348
+
349
+ download_btn_annot.click(
350
+ fn=handle_download_annot,
351
+ inputs=None,
352
+ outputs=[download_file_annot]
353
+ )
354
+
355
+ with gr.TabItem("Dataset Viewer"):
356
+ with gr.Row():
357
+ with gr.Column(scale=1):
358
+ gr.Markdown("### Dataset Controls")
359
+ with gr.Group():
360
+ with gr.Row():
361
+ load_local_btn = gr.Button("Load Local Dataset", variant="primary")
362
+ load_hf_btn = gr.Button("Load from Hugging Face", variant="secondary")
363
+
364
+ with gr.Group() as local_inputs:
365
+ local_file = gr.File(label="Upload Local Dataset")
366
+ file_format = gr.Dropdown(
367
+ choices=["json", "conll", "txt"],
368
+ value="json",
369
+ label="File Format"
370
+ )
371
+ local_status = gr.Textbox(label="Status", interactive=False)
372
+
373
+ with gr.Group(visible=False) as hf_inputs:
374
+ with gr.Row():
375
+ dataset_name = gr.Textbox(
376
+ label="Dataset Name",
377
+ placeholder="Enter dataset name (e.g., conll2003)",
378
+ scale=4
379
+ )
380
+ with gr.Row():
381
+ gr.Column(scale=1)
382
+ load_dataset_btn = gr.Button("📥 Load Dataset", variant="primary")
383
+ gr.Column(scale=1)
384
+ with gr.Row():
385
+ gr.Markdown(
386
+ "💡 Tip: Enter a valid Hugging Face dataset name",
387
+ elem_classes=["text-sm", "text-gray-500"]
388
+ )
389
+
390
+ gr.Markdown("### Navigation")
391
+ with gr.Group():
392
+ bar = gr.Slider(
393
+ minimum=0,
394
+ maximum=1,
395
+ step=1,
396
+ label="Progress",
397
+ interactive=True,
398
+ info="Use slider to navigate through examples"
399
  )
400
+
401
+ with gr.Row():
402
+ previous_btn = gr.Button("← Previous", variant="secondary")
403
+ next_btn = gr.Button("Next →", variant="secondary")
404
+
405
+ gr.Markdown("### Actions")
406
+ with gr.Group():
407
+ with gr.Row():
408
+ apply_btn = gr.Button("Apply Changes", variant="primary")
409
+ validate_btn = gr.Button("Validate", variant="secondary")
410
+ save_btn = gr.Button("Save Dataset", variant="primary")
411
+
412
+ gr.Markdown("### Hugging Face Upload")
413
+ with gr.Group():
414
+ with gr.Row():
415
+ show_hf_upload_btn = gr.Button("📤 Show Upload Options", variant="secondary", scale=1)
416
+ hide_hf_upload_btn = gr.Button("📥 Hide Upload Options", visible=False, variant="secondary", scale=1)
417
+
418
+ with gr.Group(visible=False) as hf_upload_group:
419
+ with gr.Row():
420
+ hf_repo_name = gr.Textbox(
421
+ label="Repository Name",
422
+ placeholder="Enter repository name (e.g., my-ner-dataset)",
423
+ scale=2
424
+ )
425
+ hf_repo_type = gr.Dropdown(
426
+ choices=["dataset", "model", "space"],
427
+ value="dataset",
428
+ label="Repository Type",
429
+ scale=1
430
+ )
431
+ with gr.Row():
432
+ hf_is_private = gr.Checkbox(
433
+ label="Private Repository",
434
+ value=False,
435
+ scale=1
436
+ )
437
+ upload_to_hf_btn = gr.Button("Upload to Hugging Face", variant="primary", scale=2)
438
+ hf_upload_status = gr.Textbox(
439
+ label="Upload Status",
440
+ interactive=False,
441
+ show_label=True
442
+ )
443
+
444
+ def toggle_upload_options(show: bool):
445
+ return {
446
+ hf_upload_group: gr.update(visible=show),
447
+ show_hf_upload_btn: gr.update(visible=not show),
448
+ hide_hf_upload_btn: gr.update(visible=show)
449
+ }
450
+
451
+ show_hf_upload_btn.click(
452
+ fn=lambda: toggle_upload_options(True),
453
+ inputs=None,
454
+ outputs=[hf_upload_group, show_hf_upload_btn, hide_hf_upload_btn]
455
  )
456
+
457
+ hide_hf_upload_btn.click(
458
+ fn=lambda: toggle_upload_options(False),
459
+ inputs=None,
460
+ outputs=[hf_upload_group, show_hf_upload_btn, hide_hf_upload_btn]
461
  )
462
+
463
+ with gr.Column(scale=2):
464
+ gr.Markdown("### Current Example")
465
+ inp_box = gr.HighlightedText(value=None, interactive=True)
466
+
467
+ def toggle_local_inputs():
468
+ return {
469
+ local_inputs: gr.update(visible=True),
470
+ hf_inputs: gr.update(visible=False)
471
+ }
472
+
473
+ def toggle_hf_inputs():
474
+ return {
475
+ local_inputs: gr.update(visible=False),
476
+ hf_inputs: gr.update(visible=True)
477
+ }
478
+
479
+ load_local_btn.click(
480
+ fn=toggle_local_inputs,
481
+ inputs=None,
482
+ outputs=[local_inputs, hf_inputs]
483
  )
484
+
485
+ load_hf_btn.click(
486
+ fn=toggle_hf_inputs,
487
+ inputs=None,
488
+ outputs=[local_inputs, hf_inputs]
489
+ )
490
+
491
+ def process_and_load_local(file_obj, format):
492
+ status = process_local_file(file_obj, format)
493
+ if "Successfully" in status:
494
+ result = load_dataset()
495
+ return result[0], result[1], status
496
+ return [("Error loading dataset: " + status, None)], gr.update(value=0, maximum=1), status
497
+
498
+ local_file.change(
499
+ fn=process_and_load_local,
500
+ inputs=[local_file, file_format],
501
+ outputs=[inp_box, bar, local_status]
502
+ )
503
+
504
+ def load_hf_dataset(name):
505
+ status = load_from_huggingface(name)
506
+ if "Successfully" in status:
507
+ return load_dataset()
508
+ return [("Error loading dataset: " + status, None)], gr.update(value=0, maximum=1)
509
+
510
+ load_dataset_btn.click(
511
+ fn=load_hf_dataset,
512
+ inputs=[dataset_name],
513
+ outputs=[inp_box, bar]
514
+ )
515
+
516
+ apply_btn.click(fn=update_example, inputs=inp_box, outputs=inp_box)
517
+ save_btn.click(fn=save_dataset, inputs=inp_box, outputs=inp_box)
518
+ validate_btn.click(fn=validate_example, inputs=None, outputs=inp_box)
519
+ next_btn.click(fn=next_example, inputs=None, outputs=[inp_box, bar])
520
+ previous_btn.click(fn=previous_example, inputs=None, outputs=[inp_box, bar])
521
+ bar.change(
522
+ fn=example_by_id,
523
+ inputs=[bar],
524
+ outputs=[inp_box, bar],
525
+ api_name="example_by_id"
526
+ )
527
+
528
+ upload_to_hf_btn.click(
529
+ fn=update_hf_dataset,
530
+ inputs=[hf_repo_name, hf_repo_type, hf_is_private],
531
+ outputs=[hf_upload_status]
532
+ )
533
+
534
+ return demo
535
+
536
+ def main():
537
+ """Run the application."""
538
+ demo = create_interface()
539
+ demo.launch()
540
 
541
+ if __name__ == "__main__":
542
+ main()
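The upload panel above is shown and hidden by returning `gr.update(visible=...)` objects for each affected component. As a minimal, standalone sketch of that pattern (illustrative component names, not the app's actual ones):

```python
import gradio as gr

with gr.Blocks() as demo:
    show_btn = gr.Button("Show upload options")
    hide_btn = gr.Button("Hide upload options", visible=False)
    with gr.Group(visible=False) as upload_group:
        gr.Textbox(label="Repository Name")

    def toggle(show: bool):
        # One update per output component, in the same order as `outputs=`.
        return (
            gr.update(visible=show),       # upload_group
            gr.update(visible=not show),   # show_btn
            gr.update(visible=show),       # hide_btn
        )

    show_btn.click(fn=lambda: toggle(True), outputs=[upload_group, show_btn, hide_btn])
    hide_btn.click(fn=lambda: toggle(False), outputs=[upload_group, show_btn, hide_btn])

if __name__ == "__main__":
    demo.launch()
```

The version in `app.py` returns a dict keyed by components rather than a tuple; Gradio accepts either form.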
data/annotated_data.json CHANGED
The diff for this file is too large to render. See raw diff
 
pyproject.toml CHANGED
@@ -1,13 +1,27 @@
1
  [project]
2
  name = "ner-annotation"
3
  version = "0.1.0"
4
- description = "Add your description here"
5
- readme = "README.md"
6
- requires-python = ">=3.10"
 
7
  dependencies = [
8
- "datasets>=3.6.0",
9
- "gliner>=0.2.20",
10
- "gradio>=5.31.0",
11
- "huggingface-hub>=0.32.1",
12
- "python-dotenv>=1.1.0",
 
 
13
  ]
 
1
  [project]
2
  name = "ner-annotation"
3
  version = "0.1.0"
4
+ description = "A tool for annotating text with named entities using GLiNER models"
5
+ authors = [
6
+ {name = "Your Name", email = "your.[email protected]"}
7
+ ]
8
  dependencies = [
9
+ "gradio>=4.0.0",
10
+ "torch>=2.0.0",
11
+ "gliner>=0.1.0",
12
+ "huggingface-hub>=0.19.0",
13
+ "pandas>=2.0.0",
14
+ "python-dotenv>=1.0.0",
15
+ "requests>=2.31.0"
16
  ]
17
+ requires-python = ">=3.8"
18
+
19
+ [build-system]
20
+ requires = ["hatchling"]
21
+ build-backend = "hatchling.build"
22
+
23
+ [tool.hatch.metadata]
24
+ allow-direct-references = true
25
+
26
+ [tool.hatch.build.targets.wheel]
27
+ packages = ["src/ner_annotation"]
src/ner_annotation/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ """NER Annotation Tool - A tool for annotating text with named entities."""
2
+
3
+ __version__ = "0.1.0"
src/ner_annotation/__main__.py ADDED
@@ -0,0 +1,6 @@
1
+ """Main entry point for the NER annotation tool."""
2
+
3
+ from .app import main
4
+
5
+ if __name__ == "__main__":
6
+ main()
src/ner_annotation/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (266 Bytes). View file
 
src/ner_annotation/core/__init__.py ADDED
@@ -0,0 +1,6 @@
1
+ """Core functionality for NER annotation."""
2
+
3
+ from .dataset import DynamicDataset, prepare_for_highlight
4
+ from .annotator import AutoAnnotator
5
+
6
+ __all__ = ['DynamicDataset', 'prepare_for_highlight', 'AutoAnnotator']
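Because of these re-exports, downstream code can import the core API from the subpackage directly, for example:

```python
from ner_annotation.core import AutoAnnotator, DynamicDataset, prepare_for_highlight
```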
src/ner_annotation/core/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (380 Bytes). View file
 
src/ner_annotation/core/__pycache__/annotator.cpython-310.pyc ADDED
Binary file (5.03 kB). View file
 
src/ner_annotation/core/__pycache__/dataset.cpython-310.pyc ADDED
Binary file (5.16 kB). View file
 
src/ner_annotation/core/annotator.py ADDED
@@ -0,0 +1,192 @@
1
+ """NER annotation module using GLiNER models."""
2
+
3
+ from typing import List, Dict, Union, Optional
4
+ import torch
5
+ import random
6
+ from gliner import GLiNER
7
+ from ..utils.text_processing import tokenize_text
8
+
9
+ class AutoAnnotator:
10
+ """A class for automatic NER annotation using GLiNER models."""
11
+
12
+ def __init__(
13
+ self,
14
+ model: str = "BookingCare/gliner-multi-healthcare",
15
+ device: Optional[torch.device] = None
16
+ ) -> None:
17
+ """Initialize the annotator with a GLiNER model.
18
+
19
+ Args:
20
+ model: Name or path of the GLiNER model to use
21
+ device: Device to run the model on (CPU/GPU)
22
+ """
23
+ if device is None:
24
+ device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
25
+
26
+ # Set PyTorch memory management settings
27
+ if torch.cuda.is_available():
28
+ torch.cuda.empty_cache()
29
+ torch.cuda.set_per_process_memory_fraction(0.8) # Use 80% of available GPU memory
30
+
31
+ self.model = GLiNER.from_pretrained(model).to(device)
32
+ self.annotated_data = []
33
+ self.stat = {
34
+ "total": None,
35
+ "current": -1
36
+ }
37
+
38
+ def auto_annotate(
39
+ self,
40
+ data: List[str],
41
+ labels: List[str],
42
+ prompt: Optional[Union[str, List[str]]] = None,
43
+ threshold: float = 0.5,
44
+ nested_ner: bool = False
45
+ ) -> List[Dict]:
46
+ """Annotate a list of texts with NER labels.
47
+
48
+ Args:
49
+ data: List of texts to annotate
50
+ labels: List of entity labels to detect
51
+ prompt: Optional prompt or list of prompts to use
52
+ threshold: Confidence threshold for entity detection
53
+ nested_ner: Whether to allow nested entities
54
+
55
+ Returns:
56
+ List of annotated examples
57
+ """
58
+ self.stat["total"] = len(data)
59
+ self.stat["current"] = -1
60
+
61
+ # Process texts in batches
62
+ processed_data = []
63
+ batch_size = 8 # Reduced batch size to prevent OOM errors
64
+
65
+ for i in range(0, len(data), batch_size):
66
+ batch_texts = data[i:i + batch_size]
67
+ batch_with_prompts = []
68
+
69
+ # Add prompts to batch texts
70
+ for text in batch_texts:
71
+ if isinstance(prompt, list):
72
+ prompt_text = random.choice(prompt)
73
+ else:
74
+ prompt_text = prompt
75
+ text_with_prompt = f"{prompt_text}\n{text}" if prompt_text else text
76
+ batch_with_prompts.append(text_with_prompt)
77
+
78
+ # Process batch
79
+ batch_results = self._batch_annotate_text(
80
+ batch_with_prompts,
81
+ labels,
82
+ threshold,
83
+ nested_ner
84
+ )
85
+ processed_data.extend(batch_results)
86
+
87
+ # Clear CUDA cache after each batch
88
+ if torch.cuda.is_available():
89
+ torch.cuda.empty_cache()
90
+
91
+ # Update progress
92
+ self.stat["current"] = min(i + batch_size, len(data))
93
+
94
+ self.annotated_data = processed_data
95
+ return self.annotated_data
96
+
97
+ def _batch_annotate_text(
98
+ self,
99
+ texts: List[str],
100
+ labels: List[str],
101
+ threshold: float,
102
+ nested_ner: bool
103
+ ) -> List[Dict]:
104
+ """Annotate multiple texts in batch.
105
+
106
+ Args:
107
+ texts: List of texts to annotate
108
+ labels: List of entity labels
109
+ threshold: Confidence threshold
110
+ nested_ner: Whether to allow nested entities
111
+
112
+ Returns:
113
+ List of annotated examples
114
+ """
115
+ batch_entities = self.model.batch_predict_entities(
116
+ texts,
117
+ labels,
118
+ flat_ner=not nested_ner,
119
+ threshold=threshold
120
+ )
121
+
122
+ results = []
123
+ for text, entities in zip(texts, batch_entities):
124
+ r = {
125
+ "text": text,
126
+ "entities": [
127
+ {
128
+ "entity": entity["label"],
129
+ "word": entity["text"],
130
+ "start": entity["start"],
131
+ "end": entity["end"],
132
+ "score": 0,
133
+ }
134
+ for entity in entities
135
+ ],
136
+ }
137
+ r["entities"] = self._merge_entities(r["entities"])
138
+ results.append(self._transform_data(r))
139
+ return results
140
+
141
+ def _merge_entities(self, entities: List[Dict]) -> List[Dict]:
142
+ """Merge adjacent entities of the same type.
143
+
144
+ Args:
145
+ entities: List of entity dictionaries
146
+
147
+ Returns:
148
+ List of merged entities
149
+ """
150
+ if not entities:
151
+ return []
152
+ merged = []
153
+ current = entities[0]
154
+ for next_entity in entities[1:]:
155
+ if (next_entity['entity'] == current['entity'] and
156
+ (next_entity['start'] == current['end'] + 1 or
157
+ next_entity['start'] == current['end'])):
158
+ current['word'] += ' ' + next_entity['word']
159
+ current['end'] = next_entity['end']
160
+ else:
161
+ merged.append(current)
162
+ current = next_entity
163
+ merged.append(current)
164
+ return merged
165
+
166
+ def _transform_data(self, data: Dict) -> Dict:
167
+ """Transform raw annotation data into tokenized format.
168
+
169
+ Args:
170
+ data: Raw annotation data
171
+
172
+ Returns:
173
+ Transformed data with tokenized text and NER spans
174
+ """
175
+ tokens = tokenize_text(data['text'])
176
+ spans = []
177
+
178
+ for entity in data['entities']:
179
+ entity_tokens = tokenize_text(entity['word'])
180
+ entity_length = len(entity_tokens)
181
+
182
+ # Find the start and end indices of each entity in the tokenized text
183
+ for i in range(len(tokens) - entity_length + 1):
184
+ if tokens[i:i + entity_length] == entity_tokens:
185
+ spans.append([i, i + entity_length - 1, entity['entity']])
186
+ break
187
+
188
+ return {
189
+ "tokenized_text": tokens,
190
+ "ner": spans,
191
+ "validated": False
192
+ }
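A minimal usage sketch of the annotator added above, assuming the GLiNER weights can be downloaded; the input sentence and label names are illustrative only:

```python
from ner_annotation.core.annotator import AutoAnnotator

annotator = AutoAnnotator(model="BookingCare/gliner-multi-healthcare")

annotated = annotator.auto_annotate(
    data=["Paracetamol is commonly used to treat fever and mild pain."],
    labels=["drug", "disease", "symptom"],  # illustrative label set
    threshold=0.5,
)

# Each result carries "tokenized_text", "ner" spans of the form [start, end, label],
# and a "validated" flag that the Dataset Viewer flips once a human confirms the example.
print(annotated[0]["tokenized_text"])
print(annotated[0]["ner"])
```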
src/ner_annotation/core/dataset.py ADDED
@@ -0,0 +1,162 @@
1
+ """Dataset management module for NER annotation."""
2
+
3
+ from typing import List, Dict, Union, Tuple
4
+ import json
5
+ import os
6
+ import re
7
+
8
+ class DynamicDataset:
9
+ """A class to manage and navigate through annotated dataset examples."""
10
+
11
+ def __init__(
12
+ self, data: List[Dict[str, Union[List[Union[int, str]], bool]]]
13
+ ) -> None:
14
+ """Initialize the dataset with examples.
15
+
16
+ Args:
17
+ data: List of examples, each containing tokenized text and NER annotations
18
+ """
19
+ self.data = data
20
+ self.data_len = len(self.data)
21
+ self.current = -1
22
+ for example in self.data:
23
+ if "validated" not in example:
24
+ example["validated"] = False
25
+
26
+ def next_example(self) -> None:
27
+ """Move to the next example in the dataset."""
28
+ self.current += 1
29
+ if self.current > self.data_len - 1:
30
+ self.current = self.data_len - 1
31
+ elif self.current < 0:
32
+ self.current = 0
33
+
34
+ def previous_example(self) -> None:
35
+ """Move to the previous example in the dataset."""
36
+ self.current -= 1
37
+ if self.current > self.data_len - 1:
38
+ self.current = self.data_len - 1
39
+ elif self.current < 0:
40
+ self.current = 0
41
+
42
+ def example_by_id(self, id: int) -> None:
43
+ """Navigate to a specific example by its ID.
44
+
45
+ Args:
46
+ id: The index of the example to navigate to
47
+ """
48
+ self.current = id
49
+ if self.current > self.data_len - 1:
50
+ self.current = self.data_len - 1
51
+ elif self.current < 0:
52
+ self.current = 0
53
+
54
+ def validate(self) -> None:
55
+ """Mark the current example as validated."""
56
+ self.data[self.current]["validated"] = True
57
+
58
+ def load_current_example(self) -> Dict:
59
+ """Get the current example.
60
+
61
+ Returns:
62
+ The current example data
63
+ """
64
+ return self.data[self.current]
65
+
66
+ def tokenize_text(text: str) -> List[str]:
67
+ """Tokenize the input text into a list of tokens.
68
+
69
+ Args:
70
+ text: The input text to tokenize
71
+
72
+ Returns:
73
+ List of tokens
74
+ """
75
+ return re.findall(r'\w+(?:[-_]\w+)*|\S', text)
76
+
77
+ def join_tokens(tokens: List[str]) -> str:
78
+ """Join tokens with proper spacing.
79
+
80
+ Args:
81
+ tokens: List of tokens to join
82
+
83
+ Returns:
84
+ Joined text string
85
+ """
86
+ text = ""
87
+ for token in tokens:
88
+ if token in {",", ".", "!", "?", ":", ";", "..."}:
89
+ text = text.rstrip() + token
90
+ else:
91
+ text += " " + token
92
+ return text.strip()
93
+
94
+ def prepare_for_highlight(data: Dict) -> List[Tuple[str, str]]:
95
+ """Prepare text for highlighting with NER annotations.
96
+
97
+ Args:
98
+ data: Dictionary containing tokenized text and NER annotations
99
+
100
+ Returns:
101
+ List of tuples containing text segments and their entity labels
102
+ """
103
+ tokens = data["tokenized_text"]
104
+ ner = data["ner"]
105
+
106
+ highlighted_text = []
107
+ current_entity = None
108
+ entity_tokens = []
109
+ normal_tokens = []
110
+
111
+ for idx, token in enumerate(tokens):
112
+ if current_entity is None or idx > current_entity[1]:
113
+ if entity_tokens:
114
+ highlighted_text.append((" ".join(entity_tokens), current_entity[2]))
115
+ entity_tokens = []
116
+ current_entity = next((entity for entity in ner if entity[0] == idx), None)
117
+
118
+ if current_entity and current_entity[0] <= idx <= current_entity[1]:
119
+ if normal_tokens:
120
+ highlighted_text.append((" ".join(normal_tokens), None))
121
+ normal_tokens = []
122
+ entity_tokens.append(token + " ")
123
+ else:
124
+ if entity_tokens:
125
+ highlighted_text.append((" ".join(entity_tokens), current_entity[2]))
126
+ entity_tokens = []
127
+ normal_tokens.append(token + " ")
128
+
129
+ if entity_tokens:
130
+ highlighted_text.append((" ".join(entity_tokens), current_entity[2]))
131
+ if normal_tokens:
132
+ highlighted_text.append((" ".join(normal_tokens), None))
133
+
134
+ cleaned_highlighted_text = []
135
+ for text, label in highlighted_text:
136
+ cleaned_text = re.sub(r'\s(?=[,\.!?…:;])', '', text)
137
+ cleaned_highlighted_text.append((cleaned_text, label))
138
+
139
+ return cleaned_highlighted_text
140
+
141
+ def save_dataset(data: List[Dict], filepath: str) -> None:
142
+ """Save the dataset to a JSON file.
143
+
144
+ Args:
145
+ data: The dataset to save
146
+ filepath: Path to save the dataset
147
+ """
148
+ os.makedirs(os.path.dirname(filepath), exist_ok=True)
149
+ with open(filepath, "wt") as file:
150
+ json.dump(data, file, ensure_ascii=False)
151
+
152
+ def load_dataset(filepath: str) -> List[Dict]:
153
+ """Load a dataset from a JSON file.
154
+
155
+ Args:
156
+ filepath: Path to the dataset file
157
+
158
+ Returns:
159
+ The loaded dataset
160
+ """
161
+ with open(filepath, "rt") as file:
162
+ return json.load(file)
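As a quick illustration of the dataset helpers above (a hand-written example rather than real model output):

```python
from ner_annotation.core.dataset import DynamicDataset, prepare_for_highlight

examples = [
    {
        "tokenized_text": ["Aspirin", "relieves", "headache", "."],
        "ner": [[0, 0, "drug"], [2, 2, "symptom"]],
    }
]

ds = DynamicDataset(examples)   # adds "validated": False where it is missing
ds.next_example()               # move from -1 to the first example
print(prepare_for_highlight(ds.load_current_example()))
# Roughly: [('Aspirin ', 'drug'), ('relieves ', None), ('headache ', 'symptom'), ('. ', None)]
```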
src/ner_annotation/utils/__init__.py ADDED
@@ -0,0 +1,31 @@
1
+ """Utility functions for NER annotation."""
2
+
3
+ from .text_processing import (
4
+ tokenize_text,
5
+ join_tokens,
6
+ process_text_for_gliner,
7
+ extract_tokens_and_labels
8
+ )
9
+ from .file_processing import (
10
+ process_uploaded_file,
11
+ load_from_local_file
12
+ )
13
+ from .huggingface import (
14
+ is_valid_repo_name,
15
+ create_hf_repo,
16
+ upload_to_hf,
17
+ download_from_hf
18
+ )
19
+
20
+ __all__ = [
21
+ 'tokenize_text',
22
+ 'join_tokens',
23
+ 'process_text_for_gliner',
24
+ 'extract_tokens_and_labels',
25
+ 'process_uploaded_file',
26
+ 'load_from_local_file',
27
+ 'is_valid_repo_name',
28
+ 'create_hf_repo',
29
+ 'upload_to_hf',
30
+ 'download_from_hf'
31
+ ]
src/ner_annotation/utils/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (656 Bytes). View file
 
src/ner_annotation/utils/__pycache__/file_processing.cpython-310.pyc ADDED
Binary file (5.01 kB). View file
 
src/ner_annotation/utils/__pycache__/huggingface.cpython-310.pyc ADDED
Binary file (3.68 kB). View file
 
src/ner_annotation/utils/__pycache__/text_processing.cpython-310.pyc ADDED
Binary file (2.78 kB). View file
 
src/ner_annotation/utils/file_processing.py ADDED
@@ -0,0 +1,215 @@
1
+ """File processing utilities for NER annotation."""
2
+
3
+ import os
4
+ import json
5
+ import pandas as pd
6
+ from typing import List, Dict, Union, Optional
7
+ from .text_processing import tokenize_text, process_text_for_gliner
8
+
9
+ def process_uploaded_file(file_obj) -> List[str]:
10
+ """Process an uploaded file into a list of sentences.
11
+
12
+ Args:
13
+ file_obj: The uploaded file object
14
+
15
+ Returns:
16
+ List of processed sentences
17
+
18
+ Raises:
19
+ Exception: If file processing fails
20
+ """
21
+ if file_obj is None:
22
+ raise ValueError("Please upload a file first!")
23
+
24
+ try:
25
+ if file_obj.name.endswith('.csv'):
26
+ # Process CSV file (expects a 'Nội dung' / "Content" column)
27
+ df = pd.read_csv(file_obj.name)
28
+ sentences = df['Nội dung'].dropna().tolist()
29
+ else:
30
+ # Process text file
31
+ content = file_obj.read().decode('utf-8')
32
+ sentences = [line.strip() for line in content.splitlines() if line.strip()]
33
+
34
+ # Process each sentence and flatten the list
35
+ processed_sentences = []
36
+ for sentence in sentences:
37
+ processed_sentences.extend(process_text_for_gliner(sentence))
38
+
39
+ return processed_sentences
40
+ except Exception as e:
41
+ raise Exception(f"Error reading file: {str(e)}")
42
+
43
+ def load_from_local_file(
44
+ file_path: str,
45
+ file_format: str = "json"
46
+ ) -> List[Dict]:
47
+ """Load and convert data from local file in various formats.
48
+
49
+ Args:
50
+ file_path: Path to the file to load
51
+ file_format: Format of the file (json, conll, or txt)
52
+
53
+ Returns:
54
+ List of converted examples
55
+
56
+ Raises:
57
+ Exception: If file loading fails
58
+ """
59
+ try:
60
+ if file_format == "json":
61
+ with open(file_path, 'r', encoding='utf-8') as f:
62
+ data = json.load(f)
63
+ if isinstance(data, list):
64
+ # If data is already in the correct format
65
+ if all("tokenized_text" in item and "ner" in item for item in data):
66
+ return data
67
+ # Convert from other JSON formats
68
+ return _convert_json_format(data)
69
+ else:
70
+ raise ValueError("JSON file must contain a list of examples")
71
+
72
+ elif file_format == "conll":
73
+ return _load_conll_file(file_path)
74
+
75
+ elif file_format == "txt":
76
+ return _load_txt_file(file_path)
77
+
78
+ else:
79
+ raise ValueError(f"Unsupported file format: {file_format}")
80
+
81
+ except Exception as e:
82
+ raise Exception(f"Error loading file: {str(e)}")
83
+
84
+ def _convert_json_format(data: List[Dict]) -> List[Dict]:
85
+ """Convert JSON data from various formats to the standard format.
86
+
87
+ Args:
88
+ data: List of examples in various JSON formats
89
+
90
+ Returns:
91
+ List of examples in the standard format
92
+ """
93
+ converted_data = []
94
+ for item in data:
95
+ if "tokens" in item and "ner_tags" in item:
96
+ ner_spans = []
97
+ current_span = None
98
+ for i, (token, tag) in enumerate(zip(item["tokens"], item["ner_tags"])):
99
+ if tag != "O":
100
+ if current_span is None:
101
+ current_span = [i, i, tag]
102
+ elif tag == current_span[2]:
103
+ current_span[1] = i
104
+ else:
105
+ ner_spans.append(current_span)
106
+ current_span = [i, i, tag]
107
+ elif current_span is not None:
108
+ ner_spans.append(current_span)
109
+ current_span = None
110
+ if current_span is not None:
111
+ ner_spans.append(current_span)
112
+ converted_data.append({
113
+ "tokenized_text": item["tokens"],
114
+ "ner": ner_spans,
115
+ "validated": False
116
+ })
117
+ return converted_data
118
+
119
+ def _load_conll_file(file_path: str) -> List[Dict]:
120
+ """Load and convert data from a CoNLL format file.
121
+
122
+ Args:
123
+ file_path: Path to the CoNLL file
124
+
125
+ Returns:
126
+ List of converted examples
127
+ """
128
+ converted_data = []
129
+ current_example = {"tokens": [], "ner_tags": []}
130
+
131
+ with open(file_path, 'r', encoding='utf-8') as f:
132
+ for line in f:
133
+ line = line.strip()
134
+ if line:
135
+ if line.startswith("#"):
136
+ continue
137
+ parts = line.split()
138
+ if len(parts) >= 2:
139
+ token, tag = parts[0], parts[-1]
140
+ current_example["tokens"].append(token)
141
+ current_example["ner_tags"].append(tag)
142
+ elif current_example["tokens"]:
143
+ # Convert current example
144
+ ner_spans = []
145
+ current_span = None
146
+ for i, (token, tag) in enumerate(zip(current_example["tokens"], current_example["ner_tags"])):
147
+ if tag != "O":
148
+ if current_span is None:
149
+ current_span = [i, i, tag]
150
+ elif tag == current_span[2]:
151
+ current_span[1] = i
152
+ else:
153
+ ner_spans.append(current_span)
154
+ current_span = [i, i, tag]
155
+ elif current_span is not None:
156
+ ner_spans.append(current_span)
157
+ current_span = None
158
+ if current_span is not None:
159
+ ner_spans.append(current_span)
160
+
161
+ converted_data.append({
162
+ "tokenized_text": current_example["tokens"],
163
+ "ner": ner_spans,
164
+ "validated": False
165
+ })
166
+ current_example = {"tokens": [], "ner_tags": []}
167
+
168
+ # Handle last example if exists
169
+ if current_example["tokens"]:
170
+ ner_spans = []
171
+ current_span = None
172
+ for i, (token, tag) in enumerate(zip(current_example["tokens"], current_example["ner_tags"])):
173
+ if tag != "O":
174
+ if current_span is None:
175
+ current_span = [i, i, tag]
176
+ elif tag == current_span[2]:
177
+ current_span[1] = i
178
+ else:
179
+ ner_spans.append(current_span)
180
+ current_span = [i, i, tag]
181
+ elif current_span is not None:
182
+ ner_spans.append(current_span)
183
+ current_span = None
184
+ if current_span is not None:
185
+ ner_spans.append(current_span)
186
+
187
+ converted_data.append({
188
+ "tokenized_text": current_example["tokens"],
189
+ "ner": ner_spans,
190
+ "validated": False
191
+ })
192
+
193
+ return converted_data
194
+
195
+ def _load_txt_file(file_path: str) -> List[Dict]:
196
+ """Load and convert data from a text file.
197
+
198
+ Args:
199
+ file_path: Path to the text file
200
+
201
+ Returns:
202
+ List of converted examples
203
+ """
204
+ converted_data = []
205
+ with open(file_path, 'r', encoding='utf-8') as f:
206
+ for line in f:
207
+ line = line.strip()
208
+ if line:
209
+ tokens = tokenize_text(line)
210
+ converted_data.append({
211
+ "tokenized_text": tokens,
212
+ "ner": [],
213
+ "validated": False
214
+ })
215
+ return converted_data
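For reference, a small sketch of feeding a CoNLL-style file through the loader above; the file contents are invented for illustration and use plain tags rather than a BIO scheme:

```python
from ner_annotation.utils.file_processing import load_from_local_file

# token<TAB>tag per line, blank line between sentences
sample = "Aspirin\tdrug\nrelieves\tO\nheadache\tsymptom\n.\tO\n\n"
with open("sample.conll", "w", encoding="utf-8") as f:
    f.write(sample)

examples = load_from_local_file("sample.conll", file_format="conll")
print(examples[0]["tokenized_text"])  # ['Aspirin', 'relieves', 'headache', '.']
print(examples[0]["ner"])             # [[0, 0, 'drug'], [2, 2, 'symptom']]
```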
src/ner_annotation/utils/huggingface.py ADDED
@@ -0,0 +1,137 @@
1
+ """Hugging Face Hub integration utilities."""
2
+
3
+ import re
4
+ import os
5
+ from typing import Optional
6
+ from huggingface_hub import HfApi, create_repo
7
+ from dotenv import load_dotenv
8
+
9
+ # Load environment variables
10
+ load_dotenv()
11
+ HF_TOKEN = os.getenv("HUGGINGFACE_ACCESS_TOKEN")
12
+
13
+ def is_valid_repo_name(repo_name: str) -> bool:
14
+ """Check if a repository name is valid for Hugging Face Hub.
15
+
16
+ Args:
17
+ repo_name: The repository name to validate
18
+
19
+ Returns:
20
+ True if the name is valid, False otherwise
21
+ """
22
+ return bool(re.match(r'^[A-Za-z0-9_./-]+$', repo_name))
23
+
24
+ def create_hf_repo(
25
+ repo_name: str,
26
+ repo_type: str = "dataset",
27
+ private: bool = False
28
+ ) -> str:
29
+ """Create a new repository on Hugging Face Hub.
30
+
31
+ Args:
32
+ repo_name: Name of the repository to create
33
+ repo_type: Type of repository (dataset, model, or space)
34
+ private: Whether the repository should be private
35
+
36
+ Returns:
37
+ The repository ID
38
+
39
+ Raises:
40
+ Exception: If the repository name is invalid or creation fails
41
+ """
42
+ if not is_valid_repo_name(repo_name):
43
+ raise Exception(
44
+ "Invalid repo name: must not contain slashes, spaces, or special "
45
+ "characters except '-', '_', '.'"
46
+ )
47
+
48
+ try:
49
+ api = HfApi(token=HF_TOKEN)
50
+ create_repo(
51
+ repo_id=repo_name,
52
+ repo_type=repo_type,
53
+ private=private,
54
+ exist_ok=True,
55
+ token=HF_TOKEN
56
+ )
57
+ return repo_name
58
+ except Exception as e:
59
+ raise Exception(f"Error creating repository: {str(e)}")
60
+
61
+ def upload_to_hf(
62
+ file_path: str,
63
+ repo_name: str,
64
+ repo_type: str = "dataset",
65
+ private: bool = False
66
+ ) -> str:
67
+ """Upload a file to Hugging Face Hub.
68
+
69
+ Args:
70
+ file_path: Path to the file to upload
71
+ repo_name: Name of the repository to upload to
72
+ repo_type: Type of repository
73
+ private: Whether the repository should be private
74
+
75
+ Returns:
76
+ The repository ID
77
+
78
+ Raises:
79
+ Exception: If the upload fails
80
+ """
81
+ try:
82
+ # Create or get repository
83
+ repo_id = create_hf_repo(repo_name, repo_type, private)
84
+
85
+ # Upload file
86
+ api = HfApi(token=HF_TOKEN)
87
+ api.upload_file(
88
+ path_or_fileobj=file_path,
89
+ path_in_repo=os.path.basename(file_path),
90
+ repo_id=repo_id,
91
+ repo_type=repo_type,
92
+ token=HF_TOKEN
93
+ )
94
+ return repo_id
95
+ except Exception as e:
96
+ raise Exception(f"Error uploading to Hugging Face Hub: {str(e)}")
97
+
98
+ def download_from_hf(
99
+ repo_name: str,
100
+ file_name: str,
101
+ local_path: Optional[str] = None
102
+ ) -> str:
103
+ """Download a file from Hugging Face Hub.
104
+
105
+ Args:
106
+ repo_name: Name of the repository to download from
107
+ file_name: Name of the file to download
108
+ local_path: Optional local path to save the file to
109
+
110
+ Returns:
111
+ Path to the downloaded file
112
+
113
+ Raises:
114
+ Exception: If the download fails
115
+ """
116
+ try:
117
+ import requests
118
+
119
+ # Construct the raw URL for the file
120
+ raw_url = f"https://huggingface.co/datasets/{repo_name}/raw/main/{file_name}"
121
+
122
+ # Download the file
123
+ response = requests.get(raw_url)
124
+ if response.status_code != 200:
125
+ raise Exception(f"Failed to download file: {response.status_code}")
126
+
127
+ # Save the file
128
+ if local_path is None:
129
+ local_path = os.path.join("data", file_name)
130
+
131
+ os.makedirs(os.path.dirname(local_path), exist_ok=True)
132
+ with open(local_path, "wb") as f:
133
+ f.write(response.content)
134
+
135
+ return local_path
136
+ except Exception as e:
137
+ raise Exception(f"Error downloading from Hugging Face Hub: {str(e)}")
src/ner_annotation/utils/text_processing.py ADDED
@@ -0,0 +1,124 @@
1
+ """Text processing utilities for NER annotation."""
2
+
3
+ import re
4
+ from typing import List, Dict, Union, Tuple
5
+
6
+ def tokenize_text(text: str) -> List[str]:
7
+ """Tokenize the input text into a list of tokens.
8
+
9
+ Args:
10
+ text: The input text to tokenize
11
+
12
+ Returns:
13
+ List of tokens
14
+ """
15
+ return re.findall(r'\w+(?:[-_]\w+)*|\S', text)
16
+
17
+ def join_tokens(tokens: List[str]) -> str:
18
+ """Join tokens with proper spacing.
19
+
20
+ Args:
21
+ tokens: List of tokens to join
22
+
23
+ Returns:
24
+ Joined text string
25
+ """
26
+ text = ""
27
+ for token in tokens:
28
+ if token in {",", ".", "!", "?", ":", ";", "..."}:
29
+ text = text.rstrip() + token
30
+ else:
31
+ text += " " + token
32
+ return text.strip()
33
+
34
+ def process_text_for_gliner(
35
+ text: str,
36
+ max_tokens: int = 256,
37
+ overlap: int = 32
38
+ ) -> List[str]:
39
+ """Process text for GLiNER by splitting long texts into overlapping chunks.
40
+
41
+ Preserves sentence boundaries and context when possible.
42
+
43
+ Args:
44
+ text: The input text to process
45
+ max_tokens: Maximum number of tokens per chunk
46
+ overlap: Number of tokens to overlap between chunks
47
+
48
+ Returns:
49
+ List of text chunks suitable for GLiNER
50
+ """
51
+ # First split into sentences to preserve natural boundaries
52
+ sentences = re.split(r'(?<=[.!?])\s+', text)
53
+ chunks = []
54
+ current_chunk = []
55
+ current_length = 0
56
+
57
+ for sentence in sentences:
58
+ # Tokenize the sentence
59
+ sentence_tokens = tokenize_text(sentence)
60
+ sentence_length = len(sentence_tokens)
61
+
62
+ # If a single sentence is too long, split it
63
+ if sentence_length > max_tokens:
64
+ # If we have accumulated tokens, add them as a chunk
65
+ if current_chunk:
66
+ chunks.append(" ".join(current_chunk))
67
+ current_chunk = []
68
+ current_length = 0
69
+
70
+ # Split the long sentence into smaller chunks
71
+ start = 0
72
+ while start < sentence_length:
73
+ end = min(start + max_tokens, sentence_length)
74
+ chunk_tokens = sentence_tokens[start:end]
75
+ chunks.append(" ".join(chunk_tokens))
76
+ start = end - overlap if end < sentence_length else end
77
+
78
+ # If adding this sentence would exceed max_tokens, start a new chunk
79
+ elif current_length + sentence_length > max_tokens:
80
+ chunks.append(" ".join(current_chunk))
81
+ current_chunk = sentence_tokens
82
+ current_length = sentence_length
83
+ else:
84
+ current_chunk.extend(sentence_tokens)
85
+ current_length += sentence_length
86
+
87
+ # Add any remaining tokens as the final chunk
88
+ if current_chunk:
89
+ chunks.append(" ".join(current_chunk))
90
+
91
+ return chunks
92
+
93
+ def extract_tokens_and_labels(
94
+ data: List[Dict[str, Union[str, None]]]
95
+ ) -> Tuple[List[str], List[Tuple[int, int, str]]]:
96
+ """Extract tokens and NER labels from annotation data.
97
+
98
+ Args:
99
+ data: List of token-label pairs
100
+
101
+ Returns:
102
+ Tuple of (tokens, ner_spans)
103
+ """
104
+ tokens = []
105
+ ner = []
106
+ token_start_idx = 0
107
+
108
+ for entry in data:
109
+ char = entry['token']
110
+ label = entry['class_or_confidence']
111
+
112
+ # Tokenize the current text chunk
113
+ token_list = tokenize_text(char)
114
+
115
+ # Append tokens to the main tokens list
116
+ tokens.extend(token_list)
117
+
118
+ if label:
119
+ token_end_idx = token_start_idx + len(token_list) - 1
120
+ ner.append((token_start_idx, token_end_idx, label))
121
+
122
+ token_start_idx += len(token_list)
123
+
124
+ return tokens, ner
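Finally, a short illustration of the text utilities above; the chunk sizes are deliberately small so the split is visible (the defaults are 256 tokens with an overlap of 32), and the highlighted pairs are hand-built in the shape Gradio's `HighlightedText` produces:

```python
from ner_annotation.utils.text_processing import (
    process_text_for_gliner,
    extract_tokens_and_labels,
)

text = "Sentence one is short. " * 10          # 10 sentences of 5 tokens each
chunks = process_text_for_gliner(text, max_tokens=20, overlap=4)
print([len(c.split()) for c in chunks])         # e.g. [20, 20, 10]

pairs = [
    {"token": "Aspirin", "class_or_confidence": "drug"},
    {"token": " relieves ", "class_or_confidence": None},
    {"token": "headache", "class_or_confidence": "symptom"},
]
tokens, spans = extract_tokens_and_labels(pairs)
print(tokens)   # ['Aspirin', 'relieves', 'headache']
print(spans)    # [(0, 0, 'drug'), (2, 2, 'symptom')]
```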