XTTSv2 Hindi Finetuning

XTTSv2 Hindi Finetuned Checkpoints

For use in most implementations of XTTSv2, a checkpoint file must be renamed to model.pth and must replace the original XTTSv2 checkpoint.

https://huggingface.co/AOLCDROM/XTTSv2-Hi_ft/tree/main

Indic TTS Hindi Dataset

https://www.iitm.ac.in/donlab/indictts/database

Common Voice Dataset

https://commonvoice.mozilla.org/en/datasets

Convert Mozilla Common Voice .TSV to VCTK format dataset metadata

conv_cv_vctk.py

import csv
import os
import subprocess

from pandas import read_csv
import shutil  # imported here because the copy step below replaces the `cp` subprocess call

# Converts a Mozilla Common Voice language folder into a VCTK-style layout:
#   <out_path_root>/<ds_name>/txt/<client_id>/<clip>.txt                   (transcript)
#   <out_path_root>/<ds_name>/wav48_silence_trimmed/<client_id>/<clip>.mp3 (audio copy)
# and writes a per-speaker clip count to hi-ids.csv in the current directory.

# Name of the dataset directory created under out_path_root.
ds_name = "vctk-cv-hi-22k"
# Common Voice language folder; validated.tsv and clips/ are read from here.
cv_in_path = 'c:\\tts\\datasets\\hi\\'
out_path_root = 'c:\\tts\\datasets\\'

df = read_csv(os.path.join(cv_in_path, 'validated.tsv'), delimiter='\t', encoding='utf-8')
print(df)

txt_root = os.path.join(out_path_root, ds_name, 'txt')
wav_root = os.path.join(out_path_root, ds_name, 'wav48_silence_trimmed')

for client_id, clip_path, sentence in zip(df['client_id'], df['path'], df['sentence']):
    # A missing transcript is read by pandas as NaN (a float), which cannot be written as text.
    if not isinstance(sentence, str):
        print(f"Skipping {clip_path}: no transcript")
        continue

    speaker = str(client_id)
    mp3_file = os.path.basename(clip_path)
    clip_stem = os.path.splitext(mp3_file)[0]

    speaker_txt_dir = os.path.join(txt_root, speaker)
    speaker_wav_dir = os.path.join(wav_root, speaker)
    # os.makedirs / shutil.copy work on Windows, where `mkdir -p` and `cp`
    # are not available as executables for subprocess.run.
    os.makedirs(speaker_txt_dir, exist_ok=True)
    os.makedirs(speaker_wav_dir, exist_ok=True)

    shutil.copy(os.path.join(cv_in_path, 'clips', mp3_file), speaker_wav_dir)

    outfilepath = os.path.join(speaker_txt_dir, clip_stem + '.txt')
    with open(outfilepath, 'w', encoding='utf-8') as vctk_txt_out:
        vctk_txt_out.write(sentence)

# Number of validated clips per speaker, one row per client_id.
df2 = df.groupby(['client_id'])['client_id'].count()

df2.to_csv("hi-ids.csv", sep='\t', encoding='utf-8')

Download and install ffmpeg, and add it to your Windows system PATH environment variable. Save the following script to a .bat file and run it.

convert_cv_mp3_to_flac.bat

@ECHO OFF

rem Re-encodes every .mp3 in each speaker folder under sample_path to a
rem 22050 Hz, mono, 16-bit FLAC named CLIPNAME_mic1.flac, then removes the .mp3.

rem Define the sample path (replace with your actual path)
set "sample_path=C:\tts\datasets\vctk-cv-hi-22k\wav48_silence_trimmed"

rem Stop here if the folder is missing so no files are touched elsewhere.
cd /D "%sample_path%" || exit /b 1

rem FOR loop variables must be a single letter: G is the speaker folder, M is the mp3 clip.
rem The source mp3 is deleted only when ffmpeg exits successfully.
for /d %%G in (*) do (
  echo %%G
  pushd "%%G"

  for %%M in (*.mp3) do (
    echo %%M
    ffmpeg -i "%%M" -ar 22050 -acodec flac -af aresample=osf=s16:dither_method=triangular_hp -ac 1 "%%~nM_mic1.flac" && del "%%M"
  )
  popd
)

Convert Indic TTS format dataset metadata file to LJspeech format metadata file:

conv_indic_to_ljs.py

def process_file(input_file, output_file, skip_header=True):
  """
  Converts an Indic TTS transcript file into an LJSpeech-style metadata file.

  Each input line has the format ( text_id "text" ) and is written to the
  output as text_id|text|text (the same text fills both text columns).
  Blank lines and lines without a pair of double quotes are skipped.

  Args:
      input_file (str): Path to the input file.
      output_file (str): Path to the output CSV file.
      skip_header (bool, optional): Skip the first line of the input file.
          Defaults to True, which matches the original behavior of this script.
  """
  with open(input_file, 'r', encoding='utf-8') as input_f, open(output_file, 'w', encoding='utf-8') as output_f:
    for line_no, line in enumerate(input_f):
      if skip_header and line_no == 0:
        continue

      # The text is everything between the first and the last double quote
      first_quote = line.find('"')
      last_quote = line.rfind('"')
      if first_quote == last_quote:
        continue  # No quote pair: blank or malformed line

      # Text id is what precedes the text, minus the opening bracket and spaces
      text_id = line[:first_quote].strip().lstrip('(').strip()
      # Keep the full text; the quotes themselves are already excluded by the slice
      character = line[first_quote + 1:last_quote].strip()

      output_f.write(f"{text_id}|{character}|{character}\n")

# Conversion job: destination metadata file and the Indic TTS transcript it is built from
output_file = "c:\\tts\\datasets\\hindi_female_mono\\numbers\\numbers.csv"
input_file = "c:\\tts\\datasets\\hindi_female_mono\\numbers\\numbers.txt"

# Run the conversion, then report which files were involved
process_file(input_file=input_file, output_file=output_file)
print(f"Successfully processed {input_file} to {output_file}")

Convert WAV files to single-channel (mono), 22050 Hz audio. Save the following to a batch file and run it from the wavs directory:

rem Create the output folder only when it is missing so the script can be re-run.
if not exist "out" mkdir "out"
rem Resample every .wav in this folder to 22050 Hz mono and write it to out under the same name.
for %%a in (*.wav) do ffmpeg -i "%%a" -ar 22050 -ac 1 "out\%%a"

Remove missing file entries from LJSpeech format dataset with this:

rm_missing.py

import os
import csv

def remove_nonexistent_files(metadata_file, wavs_dir, output_file="metadata2.csv"):
  """
  Removes lines from a pipe-delimited metadata file if the corresponding .wav files don't exist in a subdirectory and saves the filtered data to a new file.

  The first column of each line is the clip name; the clip is looked up as
  <wavs_dir>/<clip name>.wav. Kept lines are written back unchanged, so
  transcripts containing double quotes are preserved exactly. Blank lines
  are skipped.

  Args:
      metadata_file (str): Path to the CSV file (metadata.csv).
      wavs_dir (str): Path to the subdirectory containing .wav files (wavs).
      output_file (str, optional): Path to the output file (defaults to "metadata2.csv").
  """
  with open(metadata_file, 'r', encoding='utf-8') as csvfile, \
          open(output_file, 'w', encoding='utf-8') as new_csvfile:
    for raw_line in csvfile:
      line = raw_line.rstrip('\r\n')
      if not line.strip():
        continue  # Nothing to look up on an empty line

      # Plain split instead of csv parsing: a '"' in the transcript is ordinary text here
      row = line.split('|')
      wav_filename = row[0] + '.wav'  # Extract filename from first column
      wav_path = os.path.join(wavs_dir, wav_filename)

      if os.path.exists(wav_path):
        print(row)
        new_csvfile.write(line + '\n')  # Write the line unchanged if the file exists
      else:
        print(f"File not found: {wav_path}")  # Print a message

  print(f"Processed metadata file and removed lines for non-existent .wav files. Results saved to {output_file}.")

if __name__ == '__main__':
  # Metadata to filter and the folder that holds its audio clips
  metadata_file = 'c:\\tts\\datasets\\hindi_female_mono\\speaker1\\speaker1.csv'
  wavs_dir = 'c:\\tts\\datasets\\hindi_female_mono\\speaker1\\wavs\\'

  # Destination for the filtered copy of the metadata
  output_file = 'c:\\tts\\datasets\\hindi_female_mono\\speaker1\\speaker1-2.csv'

  remove_nonexistent_files(
    metadata_file=metadata_file,
    wavs_dir=wavs_dir,
    output_file=output_file,
  )

Batch inference test: synthesize a list of sentences for every speaker sample in a directory:

xtts-batch-inf.py

import os
import torch
import torchaudio
from datetime import datetime
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts
import logging
import time
logger = logging.getLogger(__name__)

# Load the XTTS model from ./xtts/ and move it to the GPU.
print("Loading model...")
config = XttsConfig()
config.load_json("./xtts/config.json")
model = Xtts.init_from_config(config)
model.load_checkpoint(config, checkpoint_dir="./xtts/", use_deepspeed=False)
#config.enable_readaction=True
model.cuda()

# Every .wav in this folder is used as a reference speaker clip.
speakerpath = "./speakers-hi/"
# Test sentences; each one is synthesized once per speaker clip.
phrases = [
    "I like big butts and I cannot lie, You other brothers can't deny. That when a girl walks in with an itty bitty waist, And a round thing in your face, you get sprung. Wanna pull up tough 'cause you notice that butt was stuffed. Deep in the jeans she's wearin', I'm hooked and I can't stop starin'. Oh, baby, I wanna get with ya, And take your picture, My homeboys tried to warn me, But that butt you got makes Me-me so horny.",
    "गजधर वास्तुकार थे। गांव-समाज हो या नगर-समाज - उसके नव निर्माण की, रख-रखाव की ज़िम्मेदारी गजधर निभाते थे। नगर नियोजन से लेकर छोटे से छोटे निर्माण के काम गजधर के कधों पर टिके थे।",
    "पसिखाई जाती थी तो कहीं यह जात से हट कर एक विशेष पांत भी जाती थी। बनाने वाले लोग कहीं एक जगह बसे मिलते थे तो कहीं -घूम कर इस काम को करते थे।",
    "I am the very model of a modern Major-General. I've information vegetable, animal, and mineral. I know the kings of England, and I quote the fights historical; from Marathon to Waterloo, in order categorical.",
]
print(len(phrases))

# Sample rate used both to save the output and to convert sample count to seconds.
sample_rate = 24000

for filename in os.listdir(speakerpath):
    if not filename.endswith(".wav"):
        continue

    # The latents depend only on the reference clip, so compute them once per
    # speaker instead of once per phrase. Timing below covers inference only.
    print("Computing speaker latents...")
    gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(
        audio_path=[os.path.join(speakerpath, filename)]
    )

    for phrase in phrases:
        start_time = time.time()

        print("Inference...")
        out = model.inference(
            phrase,
            "hi",
            gpt_cond_latent,
            speaker_embedding,
            temperature=0.7,  # Add custom parameters here
        )
        now = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

        # compute stats
        process_time = time.time() - start_time
        wav = torch.tensor(out["wav"]).unsqueeze(0)
        # Duration is the number of samples divided by the rate; len() of the
        # unsqueezed tensor would always be 1.
        audio_time = wav.shape[-1] / sample_rate
        logger.warning("Processing time: %.3f", process_time)
        if audio_time > 0:
            logger.warning("Real-time factor: %.3f", process_time / audio_time)
        torchaudio.save(f"{now}-xtts.wav", wav, sample_rate)

Leave a Reply

Your email address will not be published. Required fields are marked *