In this video, I look at text cleaners and how they can potentially cause issues when training your TTS models.
I refer to the cleaners in the Tortoise TTS AI Voice Cloning WebUI (MRQ) and in Coqui TTS.
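If you want to see what a cleaner actually does to a line of text before training, you can call it directly. Here is a minimal sketch, assuming a Coqui TTS install that still exposes the keithito-style cleaner functions under TTS.tts.utils.text.cleaners (the module path can vary between versions):

# inspect what a Coqui TTS cleaner does to a transcript line
from TTS.tts.utils.text.cleaners import english_cleaners

sample = "Dr. Smith bought 2 CDs from NASA."
print(english_cleaners(sample))
# the cleaner typically lowercases the text and expands numbers/abbreviations,
# which may no longer match what was actually spoken in the recording

Comparing the cleaned output against the audio is a quick way to spot where a cleaner is silently changing your dataset.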
Messy and unfinished LJSpeech-format dataset markup/processing script:
# process ljspeech format datasets
import glob
import os
import re
import shutil
import string
replace_dict = {
    'Mrs.': 'Misses',
    'Mr.': 'Mister',
    'Ms.': 'Miss',
    'Jr.': 'Junior',
    'Sr.': 'Senior',
    'Dr.': 'Doctor',
    'St.': 'Saint',
    'Rev.': 'Reverend',
    'email': 'e-mail'
}
training_file = "input-ljspeech-format.txt"
out_file_name = "output-file.txt"
separator = "|"

all_text = ""                  # currently unused
training_audio_files = []      # audio paths collected from the input (not used further yet)
write_digits = 0               # 0 = drop lines containing digits, 1 = flag them for manual review
do_replace_words = 1           # 1 = expand the abbreviations in replace_dict

out_file = open(out_file_name, "w", encoding="utf-8")
with open(training_file, 'r', encoding="utf-8") as tf:
    for line in tf:
        line_sep = separator
        print(line)
        split_line = line.split("|")
        print(split_line)
        if len(split_line) < 2:
            # skip blank or malformed lines
            continue
        file_path = split_line[0]
        training_audio_files.append(file_path)
        # second column is the transcription ("path|text" format); strip the trailing newline
        trans_text = split_line[1].strip()
        # drop lines that contain characters outside the allowed set
        match = re.search(r"[^A-Z0-9a-zŽžÀ-ÿ!¿¡'(),.:;?\- ]", trans_text)
        if match:
            print(trans_text)
            trans_text = ""
            file_path = ""
            line_sep = ""
        # flag lines that contain something that looks like a domain/URL (e.g. example.com)
        domain = re.search(r"([a-z0-9]+(-[a-z0-9]+)*\.)+[a-z]{2,}", trans_text)
        if domain:
            trans_text = "*****" + trans_text

        # flag acronyms and mixed-case abbreviations (e.g. NASA, McRae)
        abbrev = re.search(r"(?:[A-Z]{2}[A-Za-z]*)|(?:[A-Z][a-z][A-Z][A-Za-z]*)", trans_text)
        if abbrev:
            trans_text = "*****" + trans_text

        # digits: either flag the line for manual correction or drop it, depending on write_digits
        digits_found = re.search(r"[0-9]", trans_text)
        if digits_found:
            if write_digits == 1:
                trans_text = "*****" + trans_text
            elif write_digits == 0:
                trans_text = ""
                line_sep = ""
                file_path = ""

        # expand common abbreviations (Mr., Dr., ...) into full words
        if do_replace_words == 1:
            for key, value in replace_dict.items():
                # escape the key so its "." is matched literally, and require that it is not followed by a word character
                trans_text = re.sub(rf'\b{re.escape(key)}(?!\w)', value, trans_text)

        # flag any remaining dotted abbreviations such as "U.S." or "e.g."
        abbrev_periods = re.search(r"\b(?:[a-zA-Z]\.){2,}", trans_text)
        if abbrev_periods:
            print(trans_text)
            trans_text = "*****" + trans_text

        print(file_path)
        print(line_sep)
        print(trans_text)
        # skip lines that were dropped above; flagged lines keep their "*****" prefix
        if file_path or trans_text:
            out_file.write(file_path + line_sep + trans_text + "\n")

out_file.close()
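The input is expected to be standard two-column LJSpeech-style metadata, one clip per line. The file names below are made up, just to show the effect of the flags:

wavs/clip_0001.wav|Mr. Smith checked example.com and bought 2 dogs.
wavs/clip_0002.wav|She said hello to the reverend.

With write_digits = 0 the first line would be dropped (it contains digits), while the second passes through unchanged; lines that survive but look suspicious (domains, acronyms, dotted abbreviations) come out prefixed with ***** so you can find and fix them by hand before training.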