In this video I look at Coqui's new XTTS v1 text-to-speech model (and complain about its licensing). Then I look at two tools, pyannote and SpeechBrain, and use a pretrained model to generate and compare speaker embeddings.
This lets you identify mismatched audio clips in your datasets: remove poor-quality clips and shrink your dataset for faster, more reliable training.
Video Link: https://www.youtube.com/watch?v=AUln9N9dh9M
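The notebook snippets below walk through both approaches: first SpeechBrain's speaker-verification interface, then raw pyannote embeddings compared with cosine distance.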
!pip install https://github.com/pyannote/pyannote-audio/archive/refs/heads/develop.zip
!pip install speechbrain

import pandas as pd
import IPython.display as ipd
from IPython.display import display
from speechbrain.pretrained import SpeakerRecognition
verification = SpeakerRecognition.from_hparams(source="speechbrain/spkrec-ecapa-voxceleb", savedir="pretrained_models/spkrec-ecapa-voxceleb")
file1 = 'file1.wav'
file2 = 'file2.wav'
score, prediction = verification.verify_files(file1, file2)
print(score)
print(prediction)  # True = same speaker, False = different speakers
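Under the hood, verify_files embeds each file with an ECAPA-TDNN encoder and takes the cosine similarity of the two embeddings; prediction is just that score checked against a built-in threshold. A minimal, hedged sketch of pulling an embedding out directly (assumes torchaudio is installed and the clip is 16 kHz mono, which is what this model expects):

import torchaudio

# Load a clip and compute its speaker embedding directly
signal, fs = torchaudio.load(file1)            # (channels, samples)
embedding = verification.encode_batch(signal)  # ECAPA speaker embedding
print(embedding.shape)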
score_ds = pd.DataFrame(columns=["file1","file2","score","prediction"])
# Dataset metadata from the video (loaded here but not used in the snippets below)
df = pd.read_csv("/home/nano/ai2/vctk-jim-16k-ds-ljs-jw-metadata.csv",
                 sep="|", names=['audio', 'text'], index_col=False)
import os

files = os.listdir("/mnt/e/jimsamples")

# Compare one reference clip against every clip in the folder
file1 = files[35]
fp1 = "/mnt/e/jimsamples/" + file1
for file2 in files:
    fp2 = "/mnt/e/jimsamples/" + file2
    score, prediction = verification.verify_files(fp1, fp2)
    print(file1 + " to " + file2 + " score: " + str(float(score)) + " prediction: " + str(bool(prediction)))
    # Store plain Python values (not tensors) so filtering the table works later
    new_row = pd.DataFrame({'file1': file1, 'file2': file2,
                            'score': float(score), 'prediction': bool(prediction)}, index=[0])
    score_ds = pd.concat([new_row, score_ds]).reset_index(drop=True)
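With score stored as a plain float, the table is easy to inspect. For example (a convenience not shown in the video), sorting ascending puts the least similar, most suspect clips first:

# The least similar clips bubble to the top
print(score_ds.sort_values("score").head(10))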
len(score_ds.loc[score_ds['prediction'] == False].index)
print("File 1:")
print(fp1)
display(ipd.Audio(fp1))
print("Compared to:")
for ind in score_ds.loc[score_ds['prediction'] == False].index:
    fp2 = "/mnt/e/jimsamples/" + score_ds['file2'][ind]
    print(fp2)
    display(ipd.Audio(fp2))
cos_ds = pd.DataFrame(columns=["file1", "file2", "cos"])  # "prediction" dropped: the loop below never fills it
import torch
from pyannote.audio import Audio
from pyannote.audio.pipelines.speaker_verification import PretrainedSpeakerEmbedding
from pyannote.core import Segment

model = PretrainedSpeakerEmbedding(
    "speechbrain/spkrec-ecapa-voxceleb",
    device=torch.device("cuda"))

# Downmix everything to mono. Note: the ECAPA model was trained on 16 kHz
# audio, so 16000 is probably a safer choice than the 22500 used in the video.
audio = Audio(sample_rate=22500, mono="downmix")
import os
from scipy.spatial.distance import cdist

files = os.listdir("/mnt/e/jimsamples")

# Embed the reference clip once; it doesn't change inside the loop
file1 = "ash-30.wav"
fp1 = "/mnt/e/jimsamples/" + file1
# The Segment end time was missing in the original; here we take everything
# from 0.1 s to the end of the clip
speaker1 = Segment(0.1, audio.get_duration(fp1))
waveform1, sample_rate = audio.crop(fp1, speaker1)
embedding1 = model(waveform1[None])

for file2 in files:
    fp2 = "/mnt/e/jimsamples/" + file2
    speaker2 = Segment(0.1, audio.get_duration(fp2))
    waveform2, sample_rate = audio.crop(fp2, speaker2)
    embedding2 = model(waveform2[None])
    # compare embeddings using "cosine" distance
    distance = cdist(embedding1, embedding2, metric="cosine")
    print(file1 + " to " + file2 + " cos distance: " + str(distance) + ".")
    new_row = pd.DataFrame({'file1': file1, 'file2': file2, 'cos': float(distance)}, index=[0])
    cos_ds = pd.concat([new_row, cos_ds]).reset_index(drop=True)
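For reference, the cosine distance cdist computes is one minus the normalized dot product of the two embedding vectors. A quick sanity check on the last pair from the loop (plain numpy, nothing model-specific):

import numpy as np

# Cosine distance = 1 - (a . b) / (|a| |b|); should match what cdist reported
a = np.asarray(embedding1).ravel()
b = np.asarray(embedding2).ravel()
manual = 1.0 - np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
print(manual, float(distance))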
thresh = 0.6
for ind in cos_ds.loc[cos_ds['cos'] > thresh].index:
    fp2 = "/mnt/e/jimsamples/" + cos_ds['file2'][ind]
    print(fp2)
    display(ipd.Audio(fp2))
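The 0.6 cutoff is a judgment call. A histogram of the distances (not in the video; assumes matplotlib is installed, which pandas uses for plotting) makes it easier to see where outliers separate from the main cluster:

# Distribution of cosine distances; mismatched clips show up in the right tail
cos_ds['cos'].astype(float).hist(bins=30)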
!mkdir -p /mnt/e/jimsamples/backup
import shutil

# Move every clip the verifier flagged as a different speaker into backup/
for ind in score_ds.loc[score_ds['prediction'] == False].index:
    fp2 = "/mnt/e/jimsamples/" + score_ds['file2'][ind]
    shutil.move(fp2, "/mnt/e/jimsamples/backup/")
    print(fp2 + " moved")
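If you want a record of what was moved (a convenience not in the video; the scores.csv name is just a suggestion), dump the score table alongside the backup first:

# Keep every comparison on disk so moves can be reviewed or undone later
score_ds.to_csv("/mnt/e/jimsamples/backup/scores.csv", index=False)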