Refactor: Format Code with Ruff and Update Deprecated G2PW Link (#2255)

* ruff check --fix
* ruff format --line-length 120 --target-version py39
* Change the link for G2PW Model
* update pytorch version and colab
2025-04-07 09:42:47 +01:00
parent 9da7e17efe
commit 53cac93589
132 changed files with 8185 additions and 6648 deletions
--- a/GPT_SoVITS/text/en_normalization/expend.py
+++ b/GPT_SoVITS/text/en_normalization/expend.py
@@ -9,17 +9,17 @@ import unicodedata
 # 后缀计量单位替换表
 measurement_map = {
    "m": ["meter", "meters"],
-    'km': ["kilometer", "kilometers"],
+    "km": ["kilometer", "kilometers"],
    "km/h": ["kilometer per hour", "kilometers per hour"],
    "ft": ["feet", "feet"],
    "L": ["liter", "liters"],
    "tbsp": ["tablespoon", "tablespoons"],
-    'tsp': ["teaspoon", "teaspoons"],
+    "tsp": ["teaspoon", "teaspoons"],
    "h": ["hour", "hours"],
    "min": ["minute", "minutes"],
    "s": ["second", "seconds"],
    "°C": ["degree celsius", "degrees celsius"],
-    "°F": ["degree fahrenheit", "degrees fahrenheit"]
+    "°F": ["degree fahrenheit", "degrees fahrenheit"],
 }


@@ -27,41 +27,42 @@ measurement_map = {
 _inflect = inflect.engine()

 # 转化数字序数词
-_ordinal_number_re = re.compile(r'\b([0-9]+)\. ')
+_ordinal_number_re = re.compile(r"\b([0-9]+)\. ")

 # 我听说好像对于数字正则识别其实用 \d 会好一点

-_comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])')
+_comma_number_re = re.compile(r"([0-9][0-9\,]+[0-9])")

 # 时间识别
-_time_re = re.compile(r'\b([01]?[0-9]|2[0-3]):([0-5][0-9])\b')
+_time_re = re.compile(r"\b([01]?[0-9]|2[0-3]):([0-5][0-9])\b")

 # 后缀计量单位识别
-_measurement_re = re.compile(r'\b([0-9]+(\.[0-9]+)?(m|km|km/h|ft|L|tbsp|tsp|h|min|s|°C|°F))\b')
+_measurement_re = re.compile(r"\b([0-9]+(\.[0-9]+)?(m|km|km/h|ft|L|tbsp|tsp|h|min|s|°C|°F))\b")

 # 前后 £ 识别 ( 写了识别两边某一边的，但是不知道为什么失败了┭┮﹏┭┮ )
-_pounds_re_start = re.compile(r'£([0-9\.\,]*[0-9]+)')
-_pounds_re_end = re.compile(r'([0-9\.\,]*[0-9]+)£')
+_pounds_re_start = re.compile(r"£([0-9\.\,]*[0-9]+)")
+_pounds_re_end = re.compile(r"([0-9\.\,]*[0-9]+)£")

 # 前后 $ 识别
-_dollars_re_start = re.compile(r'\$([0-9\.\,]*[0-9]+)')
-_dollars_re_end = re.compile(r'([(0-9\.\,]*[0-9]+)\$')
+_dollars_re_start = re.compile(r"\$([0-9\.\,]*[0-9]+)")
+_dollars_re_end = re.compile(r"([(0-9\.\,]*[0-9]+)\$")

 # 小数的识别
-_decimal_number_re = re.compile(r'([0-9]+\.\s*[0-9]+)')
+_decimal_number_re = re.compile(r"([0-9]+\.\s*[0-9]+)")

 # 分数识别 (形式 "3/4" )
-_fraction_re = re.compile(r'([0-9]+/[0-9]+)')
+_fraction_re = re.compile(r"([0-9]+/[0-9]+)")

 # 序数词识别
-_ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)')
+_ordinal_re = re.compile(r"[0-9]+(st|nd|rd|th)")

 # 数字处理
-_number_re = re.compile(r'[0-9]+')
+_number_re = re.compile(r"[0-9]+")
+

 def _convert_ordinal(m):
    """
-    标准化序数词, 例如: 1. 2. 3. 4. 5. 6. 
+    标准化序数词, 例如: 1. 2. 3. 4. 5. 6.
    Examples:
        input: "1. "
        output: "1st"
@@ -70,24 +71,26 @@ def _convert_ordinal(m):
    ordinal = _inflect.ordinal(m.group(1))
    return ordinal + ", "

+
 def _remove_commas(m):
-    return m.group(1).replace(',', '')
+    return m.group(1).replace(",", "")
+

 def _expand_time(m):
    """
    将 24 小时制的时间转换为 12 小时制的时间表示方式。
-    
+
    Examples:
        input: "13:00 / 4:00 / 13:30"
        output: "one o'clock p.m. / four o'clock am. / one thirty p.m."
    """
    hours, minutes = map(int, m.group(1, 2))
-    period = 'a.m.' if hours < 12 else 'p.m.'
+    period = "a.m." if hours < 12 else "p.m."
    if hours > 12:
        hours -= 12

    hour_word = _inflect.number_to_words(hours)
-    minute_word = _inflect.number_to_words(minutes) if minutes != 0 else ''
+    minute_word = _inflect.number_to_words(minutes) if minutes != 0 else ""

    if minutes == 0:
        return f"{hour_word} o'clock {period}"
@@ -103,7 +106,7 @@ def _expand_measurement(m):
    sign = m.group(3)
    ptr = 1
    # 想不到怎么方便的取数字，又懒得改正则，诶，1.2 反正也是复数读法，干脆直接去掉 "."
-    num = int(m.group(1).replace(sign, '').replace(".",''))
+    num = int(m.group(1).replace(sign, "").replace(".", ""))
    decimal_part = m.group(2)
    # 上面判断的漏洞，比如 0.1 的情况，在这里排除了
    if decimal_part == None and num == 1:
@@ -116,23 +119,24 @@ def _expand_pounds(m):
    没找到特别规范的说明，和美元的处理一样，其实可以把两个合并在一起
    """
    match = m.group(1)
-    parts = match.split('.')
+    parts = match.split(".")
    if len(parts) > 2:
-        return match + ' pounds'    # Unexpected format
+        return match + " pounds"  # Unexpected format
    pounds = int(parts[0]) if parts[0] else 0
-    pence = int(parts[1].ljust(2, '0')) if len(parts) > 1 and parts[1] else 0
+    pence = int(parts[1].ljust(2, "0")) if len(parts) > 1 and parts[1] else 0
    if pounds and pence:
-        pound_unit = 'pound' if pounds == 1 else 'pounds'
-        penny_unit = 'penny' if pence == 1 else 'pence'
-        return '%s %s and %s %s' % (pounds, pound_unit, pence, penny_unit)
+        pound_unit = "pound" if pounds == 1 else "pounds"
+        penny_unit = "penny" if pence == 1 else "pence"
+        return "%s %s and %s %s" % (pounds, pound_unit, pence, penny_unit)
    elif pounds:
-        pound_unit = 'pound' if pounds == 1 else 'pounds'
-        return '%s %s' % (pounds, pound_unit)
+        pound_unit = "pound" if pounds == 1 else "pounds"
+        return "%s %s" % (pounds, pound_unit)
    elif pence:
-        penny_unit = 'penny' if pence == 1 else 'pence'
-        return '%s %s' % (pence, penny_unit)
+        penny_unit = "penny" if pence == 1 else "pence"
+        return "%s %s" % (pence, penny_unit)
    else:
-        return 'zero pounds'
+        return "zero pounds"
+

 def _expand_dollars(m):
    """
@@ -142,23 +146,24 @@ def _expand_dollars(m):
        output: "thirty-two dollars and thirty cents" / "six dollars and twenty-four cents"
    """
    match = m.group(1)
-    parts = match.split('.')
+    parts = match.split(".")
    if len(parts) > 2:
-        return match + ' dollars'    # Unexpected format
+        return match + " dollars"  # Unexpected format
    dollars = int(parts[0]) if parts[0] else 0
-    cents = int(parts[1].ljust(2, '0')) if len(parts) > 1 and parts[1] else 0
+    cents = int(parts[1].ljust(2, "0")) if len(parts) > 1 and parts[1] else 0
    if dollars and cents:
-        dollar_unit = 'dollar' if dollars == 1 else 'dollars'
-        cent_unit = 'cent' if cents == 1 else 'cents'
-        return '%s %s and %s %s' % (dollars, dollar_unit, cents, cent_unit)
+        dollar_unit = "dollar" if dollars == 1 else "dollars"
+        cent_unit = "cent" if cents == 1 else "cents"
+        return "%s %s and %s %s" % (dollars, dollar_unit, cents, cent_unit)
    elif dollars:
-        dollar_unit = 'dollar' if dollars == 1 else 'dollars'
-        return '%s %s' % (dollars, dollar_unit)
+        dollar_unit = "dollar" if dollars == 1 else "dollars"
+        return "%s %s" % (dollars, dollar_unit)
    elif cents:
-        cent_unit = 'cent' if cents == 1 else 'cents'
-        return '%s %s' % (cents, cent_unit)
+        cent_unit = "cent" if cents == 1 else "cents"
+        return "%s %s" % (cents, cent_unit)
    else:
-        return 'zero dollars'
+        return "zero dollars"
+

 # 小数的处理
 def _expand_decimal_number(m):
@@ -168,11 +173,11 @@ def _expand_decimal_number(m):
        output: "thirteen point two three four"
    """
    match = m.group(1)
-    parts = match.split('.')
+    parts = match.split(".")
    words = []
    # 遍历字符串中的每个字符
    for char in parts[1]:
-        if char == '.':
+        if char == ".":
            words.append("point")
        else:
            words.append(char)
@@ -186,7 +191,7 @@ def _expend_fraction(m):
    规则2: 如果分子大于 1, 在读分母的时候使用序数词复数读法.
    规则3: 当分母为2的时候, 分母读做 half, 并且当分子大于 1 的时候, half 也要用复数读法, 读为 halves.
    Examples:
-    
+
    | Written |	Said |
    |:---:|:---:|
    | 1/3 | one third |
@@ -196,39 +201,41 @@ def _expend_fraction(m):
    | 3/2 | three halves |
    """
    match = m.group(0)
-    numerator, denominator = map(int, match.split('/'))
+    numerator, denominator = map(int, match.split("/"))

    numerator_part = _inflect.number_to_words(numerator)
    if denominator == 2:
        if numerator == 1:
-            denominator_part = 'half'
+            denominator_part = "half"
        else:
-            denominator_part = 'halves'
+            denominator_part = "halves"
    elif denominator == 1:
-        return f'{numerator_part}'
+        return f"{numerator_part}"
    else:
        denominator_part = _inflect.ordinal(_inflect.number_to_words(denominator))
        if numerator > 1:
-            denominator_part += 's'
+            denominator_part += "s"
+
+    return f"{numerator_part} {denominator_part}"

-    return f'{numerator_part} {denominator_part}'

 def _expand_ordinal(m):
    return _inflect.number_to_words(m.group(0))

+
 def _expand_number(m):
    num = int(m.group(0))
    if num > 1000 and num < 3000:
        if num == 2000:
-            return 'two thousand'
+            return "two thousand"
        elif num > 2000 and num < 2010:
-            return 'two thousand ' + _inflect.number_to_words(num % 100)
+            return "two thousand " + _inflect.number_to_words(num % 100)
        elif num % 100 == 0:
-            return _inflect.number_to_words(num // 100) + ' hundred'
+            return _inflect.number_to_words(num // 100) + " hundred"
        else:
-            return _inflect.number_to_words(num, andword='', zero='oh', group=2).replace(', ', ' ')
+            return _inflect.number_to_words(num, andword="", zero="oh", group=2).replace(", ", " ")
    else:
-        return _inflect.number_to_words(num, andword='')
+        return _inflect.number_to_words(num, andword="")


 def normalize(text):
@@ -238,7 +245,7 @@ def normalize(text):
    """

    text = re.sub(_ordinal_number_re, _convert_ordinal, text)
-    text = re.sub(r'(?<!\d)-|-(?!\d)', ' minus ', text)
+    text = re.sub(r"(?<!\d)-|-(?!\d)", " minus ", text)
    text = re.sub(_comma_number_re, _remove_commas, text)
    text = re.sub(_time_re, _expand_time, text)
    text = re.sub(_measurement_re, _expand_measurement, text)
@@ -251,19 +258,20 @@ def normalize(text):
    text = re.sub(_ordinal_re, _expand_ordinal, text)
    text = re.sub(_number_re, _expand_number, text)

-    text = ''.join(char for char in unicodedata.normalize('NFD', text)
-                    if unicodedata.category(char) != 'Mn')  # Strip accents
+    text = "".join(
+        char for char in unicodedata.normalize("NFD", text) if unicodedata.category(char) != "Mn"
+    )  # Strip accents

    text = re.sub("%", " percent", text)
    text = re.sub("[^ A-Za-z'.,?!\-]", "", text)
    text = re.sub(r"(?i)i\.e\.", "that is", text)
    text = re.sub(r"(?i)e\.g\.", "for example", text)
    # 增加纯大写单词拆分
-    text = re.sub(r'(?<!^)(?<![\s])([A-Z])', r' \1', text)
+    text = re.sub(r"(?<!^)(?<![\s])([A-Z])", r" \1", text)
    return text


-if __name__ == '__main__':
+if __name__ == "__main__":
    # 我觉得其实可以把切分结果展示出来（只读，或者修改不影响传给TTS的实际text）
    # 然后让用户确认后再输入给 TTS，可以让用户检查自己有没有不标准的输入
    print(normalize("1. test ordinal number 1st"))
@@ -272,4 +280,4 @@ if __name__ == '__main__':
    print(normalize("1st, 22nd"))
    print(normalize("a test 20h, 1.2s, 1L, 0.1km"))
    print(normalize("a test of time 4:00, 13:00, 13:30"))
-    print(normalize("a test of temperature 4°F, 23°C, -19°C"))
+    print(normalize("a test of temperature 4°F, 23°C, -19°C"))