Spaces:

JotunnBurton
/

wuwa-bert-vits2

Sleeping

App Files Files Community

JotunnBurton committed on Apr 16

Commit

a126173

verified ·

1 Parent(s): 6ccb2b2

Upload re_matching.py

Browse files

Files changed (1) hide show

re_matching.py +81 -0

re_matching.py ADDED Viewed

	@@ -0,0 +1,81 @@

+import re
def extract_language_and_text_updated(speaker, dialogue):
    """Parse the ``<lang>text`` segments of one speaker's dialogue.

    Args:
        speaker: Speaker tag including its delimiters (e.g. ``"[name]"``);
            the first and last characters are dropped.
        dialogue: Text that follows the speaker tag.

    Returns:
        A list holding one ``(LANG, text)`` tuple per segment, with the
        language tag upper-cased and the text stripped of surrounding
        whitespace, followed by the bare speaker name as the final element.
    """
    # A <language> tag followed by everything up to the next "<".
    segment_re = r"<(\S+?)>([^<]+)"
    parsed = []
    for tag, body in re.findall(segment_re, dialogue, re.DOTALL):
        parsed.append((tag.upper(), body.strip()))
    # The speaker name (without its enclosing delimiters) always goes last.
    parsed.append(speaker[1:-1])
    return parsed
def validate_text(input_text):
    """Check that *input_text* contains at least one well-formed speaker block.

    A speaker block is a ``[speaker]`` tag followed by one or more
    ``<lang>text`` segments.

    Args:
        input_text: Raw text to validate.

    Returns:
        A ``(is_valid, message)`` tuple: ``(True, "Input is valid.")`` on
        success, otherwise ``False`` together with an error message.
    """
    # A speaker tag followed by at least one <lang>text segment.
    pattern_speaker = r"(\[\S+?\])((?:\s*<\S+?>[^<\[\]]+?)+)"
    # A single <lang>text segment inside a speaker's dialogue.
    pattern_language_text = r"<(\S+?)>([^<]+)"

    matches = re.findall(pattern_speaker, input_text, re.DOTALL)
    if not matches:
        return (
            False,
            "Error: No valid speaker format detected. Please check your input.",
        )

    for _speaker, dialogue in matches:
        # Look for language segments directly.  The parsing helper
        # extract_language_and_text_updated always appends the speaker name,
        # so its result is never empty and cannot be used as this check.
        if re.search(pattern_language_text, dialogue, re.DOTALL) is None:
            return (
                False,
                "Error: Invalid format detected in dialogue content. Please check your input.",
            )
    return True, "Input is valid."
def text_matching(text: str) -> list:
    """Split *text* into speaker blocks and parse each block.

    Args:
        text: Input made of ``[speaker]`` tags, each followed by dialogue.

    Returns:
        One entry per speaker block, each being the list produced by
        ``extract_language_and_text_updated`` for that block.
    """
    # A speaker tag, then (lazily) at least one character up to the next
    # bracketed tag or the end of the input.
    speaker_pattern = r"(\[\S+?\])(.+?)(?=\[\S+?\]|$)"
    return [
        extract_language_and_text_updated(tag, body)
        for tag, body in re.findall(speaker_pattern, text, re.DOTALL)
    ]
def cut_para(text):
    """Split *text* into paragraphs, one per non-blank line.

    Args:
        text: Text whose paragraphs are separated by ``"\\n"``.

    Returns:
        List of lines with surrounding whitespace removed; lines that are
        empty after stripping are dropped.
    """
    trimmed = (piece.strip() for piece in text.split("\n"))
    # Discard lines that were empty or whitespace-only.
    return [piece for piece in trimmed if piece]
def cut_sent(para):
    """Split a paragraph into sentences at Chinese/English terminators.

    A line break is inserted after each sentence terminator and the text is
    then split on those breaks.  A closing quote (” or ’) that directly
    follows a terminator stays attached to the sentence it closes.

    Args:
        para: Paragraph text to split.

    Returns:
        List of sentence strings.  Trailing whitespace of the paragraph is
        removed first; an empty paragraph yields ``[""]``.
    """
    # Patterns are raw strings so that \? \. and \… reach the regex engine
    # as written instead of relying on Python's deprecated handling of
    # invalid string escapes.
    # NOTE: each pattern consumes the character after the terminator, so a
    # terminator that immediately follows another one does not start a new
    # break of its own.
    para = re.sub(r"([。！;？\?])([^”’])", r"\1\n\2", para)  # single-char terminators
    para = re.sub(r"(\.{6})([^”’])", r"\1\n\2", para)  # English ellipsis (six dots)
    para = re.sub(r"(\…{2})([^”’])", r"\1\n\2", para)  # Chinese ellipsis (……)
    # Terminator plus closing quote: break after the quote, unless more
    # punctuation follows.
    para = re.sub(r"([。！？\?][”’])([^，。！？\?])", r"\1\n\2", para)
    para = para.rstrip()  # drop any trailing newline/whitespace
    return para.split("\n")
if __name__ == "__main__":
    # Demo input: one speaker with no dialogue, then two multilingual speakers.
    demo_script = """
    [说话人1]
    [说话人2]<zh>你好吗？<jp>元気ですか？<jp>こんにちは，世界。<zh>你好吗？
    [说话人3]<zh>谢谢。<jp>どういたしまして。
    """
    text_matching(demo_script)

    # Exercise the parser and the validator on a second sample.
    validation_sample = """
    [说话人1]<zh>你好，こんにちは！<jp>こんにちは，世界。
    [说话人2]<zh>你好吗？
    """
    text_matching(validation_sample)
    print(validate_text(validation_sample))