language-identifier / langident.py

Simon Clematide

Fix confidence score rounding in model output for JSON compatibility

996ea64 9 months ago

970 Bytes

	from transformers import Pipeline


	class LangDetectionPipeline(Pipeline):
	    """Pipeline that reports the top predicted language and its score.

	    The wrapped ``self.model`` is called with a single string and is
	    expected to return a ``(predictions, probabilities)`` pair whose
	    ``[0][0]`` entries hold the best label and its probability.
	    """

	    def _sanitize_parameters(self, **kwargs):
	        """Split call-time kwargs into preprocess/forward/postprocess dicts.

	        Only ``text`` is forwarded, and only to the preprocess step; the
	        forward and postprocess dicts are always empty.
	        """
	        # NOTE(review): ``preprocess`` already takes ``text`` positionally;
	        # forwarding a ``text`` keyword as well presumably collides with
	        # it -- confirm against how the base Pipeline invokes preprocess.
	        preprocess_params = {
	            key: kwargs[key] for key in ("text",) if key in kwargs
	        }
	        return preprocess_params, {}, {}

	    def preprocess(self, text, **kwargs):
	        """Return ``text`` unchanged; no preprocessing is performed."""
	        return text

	    def _forward(self, text, **kwargs):
	        """Run the model on ``text`` with newlines replaced by spaces.

	        Returns the ``(predictions, probabilities)`` pair produced by
	        ``self.model``.
	        """
	        # Presumably the underlying model expects single-line input
	        # -- TODO confirm.
	        single_line = text.replace("\n", " ")
	        labels, scores = self.model(single_line)
	        return labels, scores

	    def postprocess(self, outputs, **kwargs):
	        """Convert raw model output into a JSON-compatible dictionary.

	        Returns ``{"language": <label>, "score": <float>}`` where the
	        label has every ``__label__`` marker removed and the score is
	        rounded to two decimal places.
	        """
	        labels, scores = outputs

	        # Strip the "__label__" marker from the top prediction.
	        language = labels[0][0].replace("__label__", "")

	        # Cast to a plain float so the value can be serialized as JSON.
	        score = float(scores[0][0])

	        return {"language": language, "score": round(score, 2)}