Spaces:

guymorlan
/

levanti_en_ar

Running

levanti_en_ar / tts.py

Guy Mor-Lan

add files

e35836c over 1 year ago

3.46 kB

	#%%
	import azure.cognitiveservices.speech as speechsdk
	import re
	import os
	import hashlib
	import random
	from dotenv import load_dotenv
	load_dotenv(".env")

	# Report only whether the key is configured; never echo the secret itself
	# to stdout, where it would be captured in console output and logs.
	print("SPEECH_KEY configured:", bool(os.environ.get('SPEECH_KEY')))
	print(os.environ.get('SPEECH_REGION'))
	# Shared Azure Speech configuration, built from the environment variables
	# loaded above.
	speech_config = speechsdk.SpeechConfig(subscription=os.environ.get('SPEECH_KEY'),
	                                       region=os.environ.get('SPEECH_REGION'))

	def do_cleanup(dir='wavs', num_files=100):
	    """Trim ``dir`` so that at most ``num_files`` files remain.

	    The oldest files (by modification time) are removed first. Entries that
	    are not regular files are ignored.

	    Args:
	        dir: Directory holding the files to prune.
	        num_files: Maximum number of files to keep.

	    Raises:
	        OSError: If ``dir`` cannot be listed or a file cannot be removed.
	    """
	    # os.listdir() returns entries in arbitrary order, so sort by modification
	    # time to make sure the surplus being removed is the oldest files rather
	    # than a random subset (which could include a file that was just written).
	    paths = [os.path.join(dir, name) for name in os.listdir(dir)]
	    files = sorted((p for p in paths if os.path.isfile(p)), key=os.path.getmtime)
	    surplus = len(files) - num_files
	    if surplus > 0:
	        for path in files[:surplus]:
	            os.remove(path)

	def add_sukun(text):
	    """Append a sukun to the final Arabic letter of every word in ``text``.

	    The text is split on whitespace. For each word, a sukun is added after
	    the last character when that character is an Arabic letter or a shadda;
	    if the word ends with a single punctuation mark, the sukun is inserted
	    just before that mark instead. Words ending in anything else (including
	    an existing sukun) are left unchanged.

	    Args:
	        text: Input string.

	    Returns:
	        The processed words joined by single spaces (so runs of whitespace
	        are collapsed); an empty string when ``text`` has no words.
	    """
	    # Define Arabic letters and sukun. The combining marks are written as
	    # escapes because they are invisible as bare literals.
	    arabic_letters = 'اأإآةبتثجحخدذرزسشصضطظعغفقكلمنهوي'
	    shadda = '\u0651'
	    arabic_letters += shadda
	    sukun = '\u0652'
	    punctuation = '.,;!?،؛؟'

	    def process_word(word):
	        # If the last character is punctuation, process the letter before it
	        if word[-1] in punctuation:
	            if len(word) > 1 and word[-2] in arabic_letters:
	                return word[:-1] + sukun + word[-1]
	            return word
	        # If the last character is an Arabic letter (a sukun is not one),
	        # add a sukun after it
	        if word[-1] in arabic_letters:
	            return word + sukun
	        return word

	    # Split on whitespace. The previous pattern escaped the alternation bar
	    # (``\|``), which demanded a literal "|" in the text and therefore matched
	    # nothing for ordinary input, making the function return "".
	    words = re.findall(r'\S+', text)
	    return ' '.join(process_word(word) for word in words)

	def get_ssml(text, voice='de-DE-SeraphinaMultilingualNeural'):
	    """Build an SSML document that speaks ``text`` as ``ar-SA`` using ``voice``.

	    Args:
	        text: Plain text to be spoken. XML special characters are escaped, so
	            the text is always treated as content, never as markup.
	        voice: Name of the voice placed in the ``<voice name="...">`` attribute.

	    Returns:
	        The SSML document as a single-line string.
	    """
	    # Imported locally so this function does not rely on a file-level import.
	    from xml.sax.saxutils import escape

	    # Unescaped "&", "<" or ">" in the text would yield malformed XML (or let
	    # the input inject SSML elements); the attribute value also needs its
	    # double quotes escaped.
	    safe_text = escape(str(text))
	    safe_voice = escape(str(voice), {'"': '&quot;'})
	    return f'<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xml:lang="ar-SA"><voice name="{safe_voice}"><lang xml:lang="ar-SA">{safe_text}</lang></voice></speak>'


	def get_audio(input_text, voice='de-DE-FlorianMultilingualNeural', use_ssml=True):
	    """Synthesize ``input_text`` to a WAV file under ``wavs/`` and return its path.

	    The text is first passed through ``add_sukun``. Output is cached on disk
	    under a name derived from the processed text, the synthesis mode and (for
	    SSML) the voice; if that file already exists it is returned without
	    calling the speech service.

	    Args:
	        input_text: Text to synthesize.
	        voice: Voice name placed in the SSML document. Only used when
	            ``use_ssml`` is true.
	        use_ssml: If true, synthesize from the SSML built by ``get_ssml``;
	            otherwise synthesize the plain text.

	    Returns:
	        Path of the WAV file, ``wavs/<md5 hex digest>.wav``. The path is
	        returned whether or not synthesis succeeded.
	    """
	    input_text = add_sukun(input_text)

	    # The cache key must cover everything that changes the audio. The voice is
	    # applied only through SSML (the plain-text path never sets it), so it is
	    # part of the key only in SSML mode.
	    cache_key = "\x00".join((
	        str(voice) if use_ssml else "",
	        "ssml" if use_ssml else "text",
	        input_text,
	    ))
	    digest = hashlib.md5(cache_key.encode()).hexdigest()
	    wav_path = f"wavs/{digest}.wav"

	    if os.path.exists(wav_path):
	        return wav_path

	    # Make sure the output directory exists before the SDK is asked to write
	    # into it.
	    os.makedirs("wavs", exist_ok=True)

	    audio_config = speechsdk.audio.AudioOutputConfig(filename=wav_path)
	    speech_config.set_speech_synthesis_output_format(
	        speechsdk.SpeechSynthesisOutputFormat.Riff24Khz16BitMonoPcm
	    )

	    speech_synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config,
	                                                     audio_config=audio_config)
	    if use_ssml:
	        ssml = get_ssml(input_text, voice=voice)
	        result = speech_synthesizer.speak_ssml_async(ssml).get()
	    else:
	        result = speech_synthesizer.speak_text_async(input_text).get()

	    if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
	        print("Speech synthesized for text [{}]".format(input_text))
	    elif result.reason == speechsdk.ResultReason.Canceled:
	        # NOTE(review): on cancellation the path is still returned, and any
	        # file the SDK created will be treated as a cache hit by later calls.
	        # Confirm what callers expect before changing this.
	        cancellation_details = result.cancellation_details
	        print("Speech synthesis canceled: {}".format(cancellation_details.reason))
	        if cancellation_details.reason == speechsdk.CancellationReason.Error:
	            print("Error details: {}".format(cancellation_details.error_details))

	    # Clean up the wavs folder at random, on average once every 50
	    # synthesized calls.
	    if random.randint(1, 50) == 1:
	        do_cleanup()

	    return wav_path