CaPE / pii_ner.py
amartyasaran's picture
Dev Ready
d242fb9
raw
history blame contribute delete
881 Bytes
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
from typing import List, Dict, Any
import os
class PIINER:
    """Thin wrapper around a HuggingFace token-classification (NER) pipeline.

    The tokenizer, the model, and the assembled pipeline are kept on the
    instance as ``tokenizer``, ``model`` and ``ner_pipeline``.
    """

    def __init__(self, model_name: str = "dslim/bert-base-NER"):
        """
        Initialize the HuggingFace NER pipeline using the specified model.

        Args:
            model_name: Identifier handed to ``from_pretrained`` for both the
                tokenizer and the token-classification model.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForTokenClassification.from_pretrained(model_name)
        self.ner_pipeline = pipeline(
            "ner",
            model=self.model,
            tokenizer=self.tokenizer,
            aggregation_strategy="simple",
        )

    @staticmethod
    def _to_native(value: Any) -> Any:
        """Recursively replace NumPy-style scalars with built-in Python values."""
        if isinstance(value, dict):
            return {key: PIINER._to_native(item) for key, item in value.items()}
        if isinstance(value, list):
            return [PIINER._to_native(item) for item in value]
        if value is None or isinstance(value, (str, bytes, bool, int, float)):
            return value
        # NumPy scalars (and similar wrappers) expose ``.item()`` to obtain the
        # equivalent built-in number.
        as_builtin = getattr(value, "item", None)
        if callable(as_builtin):
            try:
                return as_builtin()
            except (TypeError, ValueError):
                # Not a single-element value after all; leave it untouched.
                return value
        return value

    def extract_entities(self, text: str) -> List[Dict[str, Any]]:
        """
        Perform NER on the input text and return list of detected entities.
        Each entity includes: entity_group, word, start, end, and score.

        Args:
            text: The text to scan.

        Returns:
            The entity dicts produced by ``self.ner_pipeline``, with numeric
            fields converted to built-in Python numbers so that the result can
            be serialized (e.g. with ``json.dumps``). Empty or whitespace-only
            text yields ``[]`` without invoking the model.
        """
        # Blank text cannot contain entities; skip the model forward pass.
        if isinstance(text, str) and not text.strip():
            return []
        entities = self.ner_pipeline(text)
        # The pipeline may hand back NumPy scalar types (e.g. float32 scores),
        # which the standard json encoder rejects.
        return self._to_native(entities)