# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List
from typing import Tuple

import jieba_fast as jieba
from pypinyin import lazy_pinyin
from pypinyin import Style


class ToneSandhi:
    def __init__(self):
        self.must_neural_tone_words = {
            "้บป็ฆ", "้บปๅฉ", "้ธณ้ธฏ", "้ซ็ฒฑ", "้ชจๅคด", "้ช้ฉผ", "้ฉฌ่", "้ฆ้ฅฐ", "้ฆๅคด", "้ฆ้ฅจ",
            "้ฃ็ญ", "้พไธบ", "้ไผ", "้ๆฐ", "้บๅฅณ", "้จ้", "้ๅคด", "้บ็", "้้", "้ๅ ",
            "้ฅๅ", "้่", "้ๅคด", "้จๅ", "้ฃไน", "้ๅฃซ", "้ ๅ", "่ฟท็ณ", "่ฟ็ดฏ", "่ฟไน",
            "่ฟไธช", "่ฟๆฐ", "่ฟๅป", "่ฝฏๅ", "่ฝฌๆ ", "่ธๅฎ", "่ทณ่ค", "่ทๅคด", "่ถ่ถ", "่ดขไธป",
            "่ฑ่ ", "่ฎฒ็ฉถ", "่ฎฐๆง", "่ฎฐๅท", "่ฎค่ฏ", "่ง็ฉ", "่ง่ฏ", "่ฃ็ผ", "่กฅไธ", "่กฃ่ฃณ",
            "่กฃๆ", "่ก้จ", "่กๅ", "่กๆ", "่กๅฝ", "่ค่", "่่", "่่ท", "่ซ่ฆ", "่ก่",
            "่ๅ", "่ธ่ ", "่ๆก", "่ๅคด", "่่", "่้บป", "่ๆ", "่ๅฆ", "่ๅคด", "่ชๅจ",
            "่่ฏ", "่พๆฐ", "่่ข", "่ๆข", "่ฝ่", "่ณ่", "่ญ่", "่ก่", "่ก็ด", "่กๅ",
            "่ชๆ", "่ฝ่ฏฏ", "่ฝๆ", "่ทๆ", "่ณๆต", "่็ท", "่ๅฎ", "่ๅฉ", "่ๅคด", "่ๅคช",
            "็ฟป่ พ", "็ฝๅฆ", "็ฝๅคด", "็ผ่พ", "็ปๅฎ", "็บข็ซ", "็ดฏ่ต", "็ณจ็ณ", "็ณๆถ", "็ฒพ็ฅ",
            "็ฒฎ้ฃ", "็ฐธ็ฎ", "็ฏฑ็ฌ", "็ฎ่ฎก", "็ฎ็", "็ญๅบ", "็ฌคๅธ", "็ฌ่ฏญ", "็ฌ่ฏ", "็ช็ชฟ",
            "็ชๅ", "็ชๆท", "็จณๅฝ", "็จ็ฝ", "็งฐๅผ", "็งงๆญ", "็งๆฐ", "็งๆ", "็ฆๆฐ", "็ฅๅฎ",
            "็ ๅฐ", "็ ๅคด", "็ณๆฆด", "็ณๅคด", "็ณๅ ", "็ฅ่ฏ", "็ผ็", "็ฏ็ผ", "็จๅทด", "็ๆฏ",
            "็ธๅฃฐ", "็็ฎ", "็ฝๅ", "็ข็พ", "็ๅฟซ", "็็พ", "็็ฉ", "็ๅฟฝ", "็็", "็ๆ",
            "็่", "็ต็ถ", "็ข็ฃจ", "็็", "็ป็", "็ซ็ฐ", "็ไน", "็็ธ", "็ถๅ ", "็นๅก",
            "็ฒๅฃ", "็็ข", "็ๆฅผ", "็ฝๅฟซ", "็ฑไบบ", "็ญ้น", "็ง้ฅผ", "็็ญ", "็็ณ", "็นๅฟ",
            "็ๅธ", "็ฏ็ฌผ", "็ซๅ", "ๆผไบฎ", "ๆปๆบ", "ๆบ่พพ", "ๆธฉๅ", "ๆธ ๆฅ", "ๆถๆฏ", "ๆตชๅคด",
            "ๆดปๆณผ", "ๆฏๆน", "ๆญฃ็ป", "ๆฌบ่ด", "ๆจก็ณ", "ๆงๆฆ", "ๆฃบๆ", "ๆฃๆง", "ๆฃ่ฑ", "ๆ ธๆก",
            "ๆ ๆ ", "ๆด็ซ", "ๆถๅฟ", "ๆๅคด", "ๆๆท", "ๆบ็ต", "ๆฌไบ", "ๆจๅคด", "ๆจๅ ", "ๆๅ",
            "ๆ้ฅผ", "ๆไบฎ", "ๆๅ", "ๆ็ฝ", "ๆถๅ", "ๆฐ้ฒ", "ๆ ไบ", "ๆถๆพ", "ๆถๆ", "ๆ้ฒ",
            "ๆ่ฆ", "ๆๅ", "ๆ็ฒ", "ๆๅคด", "ๆพๆ", "ๆณๅคด", "ๆจๅผ", "ๆ็", "ๆๅผ", "ๆฌไธพ",
            "ๆคๅฃซ", "ๆ่ พ", "ๆซๅธ", "ๆ้", "ๆ็ฎ", "ๆ็น", "ๆๆฎ", "ๆๅฌ", "ๆๅ", "ๆๅฎ",
            "ๆๆ ", "ๆๆ", "ๆๅพ", "ๆ่ฏ", "ๆๆ", "ๆ ๅฝข", "ๆๆง", "ๆช็ฉ", "ๆ้", "ๆไน",
            "ๅฟตๅคด", "ๅฟตๅจ", "ๅฟซๆดป", "ๅฟๆดป", "ๅฟๆฐ", "ๅฟๆ", "ๅพ็ฝช", "ๅผ ็ฝ", "ๅผๅ ", "ๅผ้",
            "ๅบ้ ฌ", "ๅบ็จผ", "ๅนฒไบ", "ๅธฎๆ", "ๅธ็ฏท", "ๅธ็ฝ", "ๅธ็ถ", "ๅธๅ ", "ๅทด็ป", "ๅทดๆ",
            "ๅทฎไบ", "ๅทฅๅคซ", "ๅฒๆฐ", "ๅฑ่ก", "ๅฐพๅทด", "ๅฐ็ท", "ๅฐๆฐ", "ๅฐไผ", "ๅฐๅฐฑ", "ๅฏนๅคด",
            "ๅฏนไป", "ๅฏกๅฆ", "ๅฎถไผ", "ๅฎขๆฐ", "ๅฎๅจ", "ๅฎๅธ", "ๅญฆ้ฎ", "ๅญฆ็", "ๅญๅท", "ๅซๅฆ",
            "ๅชณๅฆ", "ๅชไบบ", "ๅฉๅฎถ", "ๅจๅฎถ", "ๅงๅฑ", "ๅงๅจ", "ๅงๅคซ", "ๅฆฏๅจ", "ๅฆฅๅฝ", "ๅฆ็ฒพ",
            "ๅฅดๆ", "ๅฅณๅฉฟ", "ๅคดๅ", "ๅคช้ณ", "ๅคง็ท", "ๅคงๆน", "ๅคงๆ", "ๅคงๅคซ", "ๅคๅฐ", "ๅคไน",
            "ๅค็ฅ", "ๅฃฎๅฎ", "ๅฐ้", "ๅฐๆน", "ๅจไน", "ๅฐ้พ", "ๅดๅทด", "ๅฑๅ", "ๅๅ", "ๅๅ",
            "ๅๆฌข", "ๅๅ", "ๅๅญ", "ๅ้", "ๅพๆฒซ", "ๅๅทด", "ๅๆฌ ", "ๅๅฆ", "ๅณๅฝ", "ๅๅฐ",
            "ๅ่ฏ", "ๅ็คบ", "ๅซ็ณ", "ๅๅฌ", "ๅๅคด", "ๅๅญ", "ๅๅ ", "ๅๅ", "ๅๅ", "ๅซๅค",
            "ๅฃ่ข", "ๅ้", "ๅๅฎณ", "ๅๆค", "ๅ ่ขฑ", "ๅ ๆถต", "ๅ็งฐ", "ๅคๅฟซ", "ๅจ้", "ๅจๅผน",
            "ๅๅคซ", "ๅๆฐ", "ๅๅคด", "ๅบ็ฌ", "ๅบๆฟ", "ๅซๆญ", "ๅฉ่ฝ", "ๅฉ็ดข", "ๅฉๅฎณ", "ๅๆ",
            "ๅบๆฏ", "ๅๅ", "ๅๅฟซ", "ๅทๆ", "ๅคๆ", "ๅๅคฑ", "ๅ ปๆดป", "ๅ ณ็ณป", "ๅ ็", "ๅ ๅผ",
            "ไพฟๅฎ", "ไฝฟๅค", "ไฝฉๆ", "ไฝๅ", "ไฝ้ข", "ไฝ็ฝฎ", "ไผผ็", "ไผ่ฎก", "ไผๆฏ", "ไปไน",
            "ไบบๅฎถ", "ไบฒๆ", "ไบฒๅฎถ", "ไบคๆ ", "ไบๅฝฉ", "ไบๆ ", "ไนฐๅ", "ไธปๆ", "ไธซๅคด", "ไธงๆฐ",
            "ไธคๅฃ", "ไธ่ฅฟ", "ไธๅฎถ", "ไธๆ ", "ไธ็ฑ", "ไธๅจ", "ไธๆฐด", "ไธๅทด", "ไธๅคด", "ไธๅธ",
            "ไธๅคซ", "ไธไบบ", "ไธ่พ", "้ฃไธช", "่ฉ่จ", "็ถไบฒ", "ๆฏไบฒ", "ๅๅ", "้้ข", "่ดน็จ",
            "ๅคๅฎถ", "็ๅคด", "ไป็ป", "่ๅ", "ๅคงไบบ", "ๆณฅ้ณ ", "ๅนธ็ฆ", "็ๆ", "่ฎกๅ", "ๆ่ พ",
            "่ก็", "ๅงฅ็ท", "็ ง้กพ", "ๅๅ", "ๅไป", "ๅผๅ ", "่่ฑ", "ๅคๅฐ", "ๆๆฒ", "ๅฏ็ข",
            "็ณ่น", "ๅ่ พ", "ๆฅๅค", "้ป่พ", "็็ผ ", "ๅฝๅฐ", "็ข้ช", "ๅๅฑ", "ๆซๆ", "ๆฆ่ฎฐ",
        }
        self.must_not_neural_tone_words = {
            "็ทๅญ", "ๅฅณๅญ", "ๅๅญ", "ๅๅญ", "้ๅญ", "่ฒๅญ", "็ณๅญ", "็ๅญ", "็ตๅญ", "ไบบไบบ",
            "่่", "ๅนบๅนบ", "ๅนฒๅ", "ๅญฆๅญ", "ๅๅ", "ๆฐๆฐ", "่ข ่ข ", "ๅฑๅฐ", "ไปฅไธ", "ๅจๅๅ",
            "่ฑ่ฑ่่", "็ๅพ", "่ๅฐ", "ๆณๆณ", "็็", "ๆๆ", "ๅตๅญ", "ๆญปๆญป", "ๅๅ", "ๆณๆณ",
            "ไฝผไฝผ", "ๅตๅต", "ๆๆ", "่่", "ๆดๆด", "่่", "่ฝๅฐ", "็ฎๅญ", "ๅฎถๅฎถๆทๆท", "้้",
        }
        self.punc = "๏ผ๏ผ๏ผใ๏ผ๏ผโโโโ':,;.?!"

    # the meaning of jieba pos tag: https://blog.csdn.net/weixin_44174352/article/details/113731041
    # e.g.
    # word: "ๅฎถ้"
    # pos: "s"
    # finals: ['ia1', 'i3']
    def _neural_sandhi(self, word: str, pos: str, finals: List[str]) -> List[str]:
        # reduplication words for n., v. and a., e.g. ๅฅถๅฅถ, ่ฏ่ฏ, ๆบๆบ
        for j, item in enumerate(word):
            if (
                j - 1 >= 0
                and item == word[j - 1]
                and pos[0] in {"n", "v", "a"}
                and word not in self.must_not_neural_tone_words
            ):
                finals[j] = finals[j][:-1] + "5"
        ge_idx = word.find("ไธช")
        if len(word) >= 1 and word[-1] in "ๅงๅขๅๅๅๅปๅๅๅจๅๅฆๅ้ขๆปดๅฉๅๅฝๅฐ่ถๅ่ฏถ":
            finals[-1] = finals[-1][:-1] + "5"
        elif len(word) >= 1 and word[-1] in "็ๅฐๅพ":
            finals[-1] = finals[-1][:-1] + "5"
        # e.g. ่ตฐไบ, ็็, ๅป่ฟ
        elif len(word) == 1 and word in "ไบ็่ฟ" and pos in {"ul", "uz", "ug"}:
            finals[-1] = finals[-1][:-1] + "5"
        elif (
            len(word) > 1
            and word[-1] in "ไปฌๅญ"
            and pos in {"r", "n"}
            and word not in self.must_not_neural_tone_words
        ):
            finals[-1] = finals[-1][:-1] + "5"
        # e.g. ๆกไธ, ๅฐไธ, ๅฎถ้
        elif len(word) > 1 and word[-1] in "ไธไธ้" and pos in {"s", "l", "f"}:
            finals[-1] = finals[-1][:-1] + "5"
        # e.g. ไธๆฅ, ไธๅป
        elif len(word) > 1 and word[-1] in "ๆฅๅป" and word[-2] in "ไธไธ่ฟๅบๅ่ฟ่ตทๅผ":
            finals[-1] = finals[-1][:-1] + "5"
        # "ไธช" used as a measure word, e.g. ๅ ไธช, ไธคไธช
        elif (
            ge_idx >= 1
            and (word[ge_idx - 1].isnumeric() or word[ge_idx - 1] in "ๅ ๆไธคๅๅคๅๆดๆฏๅๆฏ")
        ) or word == "ไธช":
            finals[ge_idx] = finals[ge_idx][:-1] + "5"
        else:
            if (
                word in self.must_neural_tone_words
                or word[-2:] in self.must_neural_tone_words
            ):
                finals[-1] = finals[-1][:-1] + "5"
        word_list = self._split_word(word)
        finals_list = [finals[: len(word_list[0])], finals[len(word_list[0]) :]]
        for i, word in enumerate(word_list):
            # conventional neutral-tone words in Chinese
            if (
                word in self.must_neural_tone_words
                or word[-2:] in self.must_neural_tone_words
            ):
                finals_list[i][-1] = finals_list[i][-1][:-1] + "5"
        finals = sum(finals_list, [])
        return finals
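
    # Illustrative examples for _neural_sandhi (not in the original source;
    # finals are written in the TONE3 format used throughout this class, and
    # the POS tags are only indicative):
    #   _neural_sandhi("้ชจๅคด", "n", ["u3", "ou2"])   -> ["u3", "ou5"]
    #   _neural_sandhi("ๆกไธ", "s", ["uo1", "ang4"]) -> ["uo1", "ang5"]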

    def _bu_sandhi(self, word: str, finals: List[str]) -> List[str]:
        # e.g. ็ไธๆ
        if len(word) == 3 and word[1] == "ไธ":
            finals[1] = finals[1][:-1] + "5"
        else:
            for i, char in enumerate(word):
                # "ไธ" before tone4 should be bu2, e.g. ไธๆ
                if char == "ไธ" and i + 1 < len(word) and finals[i + 1][-1] == "4":
                    finals[i] = finals[i][:-1] + "2"
        return finals
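
    # Illustrative examples for _bu_sandhi (not in the original source; finals
    # in the TONE3 format):
    #   _bu_sandhi("็ไธๆ", ["an4", "u4", "ong3"]) -> ["an4", "u5", "ong3"]
    #   _bu_sandhi("ไธๆ", ["u4", "a4"])             -> ["u2", "a4"]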

    def _yi_sandhi(self, word: str, finals: List[str]) -> List[str]:
        # "ไธ" in number sequences, e.g. ไธ้ถ้ถ, ไบไธ้ถ
        if word.find("ไธ") != -1 and all(
            [item.isnumeric() for item in word if item != "ไธ"]
        ):
            return finals
        # "ไธ" between reduplication words should be yi5, e.g. ็ไธ็
        elif len(word) == 3 and word[1] == "ไธ" and word[0] == word[-1]:
            finals[1] = finals[1][:-1] + "5"
        # when "ไธ" is part of an ordinal, it should be yi1
        elif word.startswith("็ฌฌไธ"):
            finals[1] = finals[1][:-1] + "1"
        else:
            for i, char in enumerate(word):
                if char == "ไธ" and i + 1 < len(word):
                    # "ไธ" before tone4 should be yi2, e.g. ไธๆฎต
                    if finals[i + 1][-1] == "4":
                        finals[i] = finals[i][:-1] + "2"
                    # "ไธ" before non-tone4 should be yi4, e.g. ไธๅคฉ
                    else:
                        # if the character after "ไธ" is punctuation, keep "ไธ" as tone 1
                        if word[i + 1] not in self.punc:
                            finals[i] = finals[i][:-1] + "4"
        return finals
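
    # Illustrative examples for _yi_sandhi (not in the original source; finals
    # in the TONE3 format):
    #   _yi_sandhi("็ไธ็", ["an4", "i1", "an4"]) -> ["an4", "i5", "an4"]
    #   _yi_sandhi("ไธๆฎต", ["i1", "uan4"])          -> ["i2", "uan4"]
    #   _yi_sandhi("ไธๅคฉ", ["i1", "ian1"])          -> ["i4", "ian1"]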

    def _split_word(self, word: str) -> List[str]:
        # split the word into two subwords, using the shortest segment from
        # jieba's search-mode cut as the pivot
        word_list = jieba.cut_for_search(word)
        word_list = sorted(word_list, key=lambda i: len(i), reverse=False)
        first_subword = word_list[0]
        first_begin_idx = word.find(first_subword)
        if first_begin_idx == 0:
            second_subword = word[len(first_subword) :]
            new_word_list = [first_subword, second_subword]
        else:
            second_subword = word[: -len(first_subword)]
            new_word_list = [second_subword, first_subword]
        return new_word_list
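
    # Illustrative example for _split_word (not in the original source; the
    # exact split depends on the jieba dictionary): _split_word("่ๅคๅ") is
    # expected to return ["่ๅค", "ๅ"], matching the ่ๅค/ๅ example used in
    # _three_sandhi below.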

    def _three_sandhi(self, word: str, finals: List[str]) -> List[str]:
        if len(word) == 2 and self._all_tone_three(finals):
            finals[0] = finals[0][:-1] + "2"
        elif len(word) == 3:
            word_list = self._split_word(word)
            if self._all_tone_three(finals):
                # disyllabic + monosyllabic, e.g. ่ๅค/ๅ
                if len(word_list[0]) == 2:
                    finals[0] = finals[0][:-1] + "2"
                    finals[1] = finals[1][:-1] + "2"
                # monosyllabic + disyllabic, e.g. ็บธ/่่
                elif len(word_list[0]) == 1:
                    finals[1] = finals[1][:-1] + "2"
            else:
                finals_list = [finals[: len(word_list[0])], finals[len(word_list[0]) :]]
                if len(finals_list) == 2:
                    for i, sub in enumerate(finals_list):
                        # e.g. ๆๆ/ไบบ
                        if self._all_tone_three(sub) and len(sub) == 2:
                            finals_list[i][0] = finals_list[i][0][:-1] + "2"
                        # e.g. ๅฅฝ/ๅๆฌข
                        elif (
                            i == 1
                            and not self._all_tone_three(sub)
                            and finals_list[i][0][-1] == "3"
                            and finals_list[0][-1][-1] == "3"
                        ):
                            finals_list[0][-1] = finals_list[0][-1][:-1] + "2"
                    finals = sum(finals_list, [])
        # split an idiom into two words whose length is 2
        elif len(word) == 4:
            finals_list = [finals[:2], finals[2:]]
            finals = []
            for sub in finals_list:
                if self._all_tone_three(sub):
                    sub[0] = sub[0][:-1] + "2"
                finals += sub
        return finals
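
    # Illustrative examples for _three_sandhi (not in the original source;
    # finals in the TONE3 format, and the 3-char case assumes _split_word
    # yields ่ๅค/ๅ):
    #   _three_sandhi("่ๅค", ["ao3", "u3"])           -> ["ao2", "u3"]
    #   _three_sandhi("่ๅคๅ", ["ao3", "u3", "ou3"])  -> ["ao2", "u2", "ou3"]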

    def _all_tone_three(self, finals: List[str]) -> bool:
        return all(x[-1] == "3" for x in finals)
| # merge "ไธ" and the word behind it | |
| # if don't merge, "ไธ" sometimes appears alone according to jieba, which may occur sandhi error | |
| def _merge_bu(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]: | |
| new_seg = [] | |
| last_word = "" | |
| for word, pos in seg: | |
| if last_word == "ไธ": | |
| word = last_word + word | |
| if word != "ไธ": | |
| new_seg.append((word, pos)) | |
| last_word = word[:] | |
| if last_word == "ไธ": | |
| new_seg.append((last_word, "d")) | |
| last_word = "" | |
| return new_seg | |
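
    # Illustrative example for _merge_bu (not in the original source; the POS
    # tags are only indicative):
    #   _merge_bu([("ไธ", "d"), ("ๆฏ", "v")]) -> [("ไธๆฏ", "v")]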

    # function 1: merge "ไธ" and the reduplication words on its left and right, e.g. "ๅฌ","ไธ","ๅฌ" -> "ๅฌไธๅฌ"
    # function 2: merge a single "ไธ" and the word behind it
    # if we don't merge, "ไธ" sometimes appears alone in jieba's output, which may cause sandhi errors
    # e.g.
    # input seg: [('ๅฌ', 'v'), ('ไธ', 'm'), ('ๅฌ', 'v')]
    # output seg: [['ๅฌไธๅฌ', 'v']]
    def _merge_yi(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
        new_seg = []
        # function 1
        for i, (word, pos) in enumerate(seg):
            if (
                i - 1 >= 0
                and word == "ไธ"
                and i + 1 < len(seg)
                and seg[i - 1][0] == seg[i + 1][0]
                and seg[i - 1][1] == "v"
                and seg[i + 1][1] == "v"
            ):
                # merge into the most recent output item (new_seg may be shorter
                # than seg here, so index it from the end rather than with i - 1)
                new_seg[-1][0] = new_seg[-1][0] + "ไธ" + new_seg[-1][0]
            else:
                if (
                    i - 2 >= 0
                    and seg[i - 1][0] == "ไธ"
                    and seg[i - 2][0] == word
                    and pos == "v"
                ):
                    continue
                else:
                    new_seg.append([word, pos])
        seg = new_seg
        new_seg = []
        # function 2
        for i, (word, pos) in enumerate(seg):
            if new_seg and new_seg[-1][0] == "ไธ":
                new_seg[-1][0] = new_seg[-1][0] + word
            else:
                new_seg.append([word, pos])
        return new_seg
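
    # Illustrative examples for _merge_yi (not in the original source; the POS
    # tags are only indicative):
    #   _merge_yi([("ๅฌ", "v"), ("ไธ", "m"), ("ๅฌ", "v")]) -> [["ๅฌไธๅฌ", "v"]]
    #   _merge_yi([("ไธ", "m"), ("ไธช", "q")])              -> [["ไธไธช", "m"]]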

    # the first and the second words are all_tone_three
    def _merge_continuous_three_tones(
        self, seg: List[Tuple[str, str]]
    ) -> List[Tuple[str, str]]:
        new_seg = []
        sub_finals_list = [
            lazy_pinyin(word, neutral_tone_with_five=True, style=Style.FINALS_TONE3)
            for (word, pos) in seg
        ]
        assert len(sub_finals_list) == len(seg)
        merge_last = [False] * len(seg)
        for i, (word, pos) in enumerate(seg):
            if (
                i - 1 >= 0
                and self._all_tone_three(sub_finals_list[i - 1])
                and self._all_tone_three(sub_finals_list[i])
                and not merge_last[i - 1]
            ):
                # if the last word is a reduplication, don't merge, because the
                # reduplication needs to be handled by _neural_sandhi
                if (
                    not self._is_reduplication(seg[i - 1][0])
                    and len(seg[i - 1][0]) + len(seg[i][0]) <= 3
                ):
                    new_seg[-1][0] = new_seg[-1][0] + seg[i][0]
                    merge_last[i] = True
                else:
                    new_seg.append([word, pos])
            else:
                new_seg.append([word, pos])
        return new_seg
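
    # Illustrative example for _merge_continuous_three_tones (not in the
    # original source; it assumes pypinyin reads both ๅฐ and ่ๅค as tone 3,
    # and the POS tags are only indicative):
    #   _merge_continuous_three_tones([("ๅฐ", "a"), ("่ๅค", "n")])
    #       is expected to return [["ๅฐ่ๅค", "a"]]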

    def _is_reduplication(self, word: str) -> bool:
        return len(word) == 2 and word[0] == word[1]

    # the last char of the first word and the first char of the second word are tone three
    def _merge_continuous_three_tones_2(
        self, seg: List[Tuple[str, str]]
    ) -> List[Tuple[str, str]]:
        new_seg = []
        sub_finals_list = [
            lazy_pinyin(word, neutral_tone_with_five=True, style=Style.FINALS_TONE3)
            for (word, pos) in seg
        ]
        assert len(sub_finals_list) == len(seg)
        merge_last = [False] * len(seg)
        for i, (word, pos) in enumerate(seg):
            if (
                i - 1 >= 0
                and sub_finals_list[i - 1][-1][-1] == "3"
                and sub_finals_list[i][0][-1] == "3"
                and not merge_last[i - 1]
            ):
                # if the last word is a reduplication, don't merge, because the
                # reduplication needs to be handled by _neural_sandhi
                if (
                    not self._is_reduplication(seg[i - 1][0])
                    and len(seg[i - 1][0]) + len(seg[i][0]) <= 3
                ):
                    new_seg[-1][0] = new_seg[-1][0] + seg[i][0]
                    merge_last[i] = True
                else:
                    new_seg.append([word, pos])
            else:
                new_seg.append([word, pos])
        return new_seg

    def _merge_er(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
        new_seg = []
        for i, (word, pos) in enumerate(seg):
            if i - 1 >= 0 and word == "ๅฟ" and seg[i - 1][0] != "#":
                new_seg[-1][0] = new_seg[-1][0] + seg[i][0]
            else:
                new_seg.append([word, pos])
        return new_seg

    def _merge_reduplication(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
        new_seg = []
        for i, (word, pos) in enumerate(seg):
            if new_seg and word == new_seg[-1][0]:
                new_seg[-1][0] = new_seg[-1][0] + seg[i][0]
            else:
                new_seg.append([word, pos])
        return new_seg
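
    # Illustrative examples for _merge_er and _merge_reduplication (not in the
    # original source; the POS tags are only indicative):
    #   _merge_er([("็ฉ", "v"), ("ๅฟ", "r")])            -> [["็ฉๅฟ", "v"]]
    #   _merge_reduplication([("ๅฅฝ", "a"), ("ๅฅฝ", "a")]) -> [["ๅฅฝๅฅฝ", "a"]]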

    def pre_merge_for_modify(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
        seg = self._merge_bu(seg)
        try:
            seg = self._merge_yi(seg)
        except:
            print("_merge_yi failed")
        seg = self._merge_reduplication(seg)
        try:
            seg = self._merge_continuous_three_tones(seg)
        except:
            print("_merge_continuous_three_tones failed")
        try:
            seg = self._merge_continuous_three_tones_2(seg)
        except:
            print("_merge_continuous_three_tones_2 failed")
        seg = self._merge_er(seg)
        return seg

    def modified_tone(self, word: str, pos: str, finals: List[str]) -> List[str]:
        finals = self._bu_sandhi(word, finals)
        finals = self._yi_sandhi(word, finals)
        finals = self._neural_sandhi(word, pos, finals)
        finals = self._three_sandhi(word, finals)
        return finals
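

# ---------------------------------------------------------------------------
# Minimal usage sketch (not part of the original module).  It shows one
# plausible way to drive ToneSandhi with jieba POS tagging and pypinyin
# finals; the helper name `demo`, the POS source `jieba_fast.posseg`, and the
# sample sentence are assumptions for illustration only.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    import jieba_fast.posseg as psg

    def demo(text: str) -> None:
        sandhi = ToneSandhi()
        # POS-tag the sentence, then pre-merge segments so that "ไธ", "ไธ",
        # "ๅฟ" and reduplications are attached to their neighbours.
        seg = [(p.word, p.flag) for p in psg.cut(text)]
        seg = sandhi.pre_merge_for_modify(seg)
        for word, pos in seg:
            # Finals in FINALS_TONE3 style with neutral tone written as "5",
            # matching what the class expects internally.
            finals = lazy_pinyin(
                word, neutral_tone_with_five=True, style=Style.FINALS_TONE3
            )
            finals = sandhi.modified_tone(word, pos, finals)
            print(word, pos, finals)

    demo("ๆไปฌๅฌไธๅฌ่ๅ ็ๆ่งใ")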