"""
Generate the VoxCeleb1 SID (speaker identification) manifest for SpeechT5.

Expected format of iden_split.txt: one utterance per line, consisting of a
split id (1 = train, 2 = valid, 3 = test) followed by the wav path relative
to the dataset root, e.g.

    1 id11251/s4R4hvqrhFw/00002.wav
    1 id11251/gFfcgOVmiO0/00006.wav
    3 id11251/7GtZpUtReJ8/00001.wav
    2 id11251/5-6lI5JQtb8/00001.wav
    3 id11251/7GtZpUtReJ8/00006.wav
"""
| import logging | |
| import argparse | |
| import os | |
| from scipy.io import wavfile | |
| from tqdm import tqdm | |
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Maps the split name accepted on the command line to the integer id used
# in the first column of iden_split.txt.
SPLITS = dict(train=1, valid=2, test=3)
class VoxCeleb1SID:
    """Index of the VoxCeleb1 utterances that belong to one identification split.

    The split file is read once at construction time; audio files are only
    opened when an item is accessed.

    Args:
        root: directory that the wav paths in the split file are relative to.
        split: integer split id to keep (compared against the first column
            of the split file).
        iden_path: path to the split file, one ``<split id> <wav path>``
            entry per line.

    Raises:
        ValueError: if a non-blank line does not have both a split id and a
            path, or if the split id is not an integer.
    """

    def __init__(self, root, split, iden_path):
        self.root = root
        self.speakers = []
        self.paths = []
        with open(iden_path, "r", encoding="utf-8") as f:
            for line_no, line in enumerate(f, start=1):
                # Split on any run of whitespace so tabs / repeated spaces
                # are tolerated; maxsplit keeps the path in one piece.
                fields = line.split(maxsplit=1)
                if not fields:
                    # Blank (or whitespace-only) line, e.g. trailing newline.
                    continue
                if len(fields) != 2:
                    raise ValueError(
                        f"{iden_path}:{line_no}: expected '<split> <path>', "
                        f"got {line.strip()!r}"
                    )
                split_type = int(fields[0])
                wav_path = fields[1].strip()
                if split == split_type:
                    # The speaker id is the first component of the wav path.
                    self.speakers.append(wav_path.split("/")[0])
                    self.paths.append(wav_path)

    def __len__(self):
        """Return the number of utterances in the selected split."""
        return len(self.paths)

    def __getitem__(self, index):
        """Read the wav at ``index`` and describe it.

        Returns:
            A tuple ``(n_frames, sample_rate, speaker, path)`` where
            ``n_frames`` is the length of the first axis of the decoded
            audio and ``path`` is the path as listed in the split file.

        Raises:
            IndexError: if ``index`` is out of range. The class defines no
                ``__iter__``, so this is also what ends a plain ``for`` loop
                over the dataset.
        """
        speaker = self.speakers[index]
        rel_path = self.paths[index]
        file_audio = os.path.join(self.root, rel_path)
        sample_rate, wav = wavfile.read(file_audio)
        n_frames = wav.shape[0]
        return n_frames, sample_rate, speaker, rel_path
def get_parser():
    """Build the command-line parser for the manifest generator.

    Returns:
        An ``argparse.ArgumentParser`` with one positional argument
        (``root``) and the options ``--output``, ``--split``, ``--wav-root``
        and ``--iden-split``; ``--split`` and ``--iden-split`` are required.
    """
    # (flag, add_argument keyword arguments), in the order they are registered.
    option_specs = (
        (
            "root",
            dict(metavar="DIR", help="root directory containing wav files to index"),
        ),
        (
            "--output",
            dict(default=".", type=str, metavar="DIR", help="output directory of manifest"),
        ),
        (
            "--split",
            dict(
                required=True,
                type=str,
                choices=["train", "valid", "test"],
                help="dataset splits",
            ),
        ),
        (
            "--wav-root",
            dict(
                default=None,
                type=str,
                metavar="DIR",
                help="saved waveform root directory for tsv",
            ),
        ),
        (
            "--iden-split",
            dict(
                required=True,
                type=str,
                help="officially released split for identification",
            ),
        ),
    )

    cli = argparse.ArgumentParser()
    for flag, settings in option_specs:
        cli.add_argument(flag, **settings)
    return cli
def main(args):
    """Write the ``<split>.tsv`` manifest for the requested split.

    The manifest starts with a header line holding the waveform root
    directory, followed by one ``<wav path>\\t<n_frames>\\t<speaker id>``
    line per utterance of the split.

    Args:
        args: parsed command-line namespace providing ``root``, ``output``,
            ``split``, ``wav_root`` and ``iden_split``.

    Raises:
        FileNotFoundError: if ``args.iden_split`` does not exist.
        ValueError: if an utterance is not sampled at 16 kHz.
    """
    if not os.path.exists(args.iden_split):
        logger.error(f"split {args.iden_split} does not exist")
        # Stop here: continuing would only fail later when the file is opened.
        raise FileNotFoundError(f"split {args.iden_split} does not exist")

    dest_dir = args.output
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(dest_dir, exist_ok=True)

    # Without --wav-root, fall back to the indexed root instead of writing
    # the literal string "None" as the manifest header.
    wav_root = args.wav_root if args.wav_root is not None else args.root

    dataset = VoxCeleb1SID(args.root, SPLITS[args.split], args.iden_split)

    tsv_path = os.path.join(dest_dir, f"{args.split}.tsv")
    # The context manager closes the manifest even if a wav fails validation.
    with open(tsv_path, "w", encoding="utf-8") as tsv:
        print(wav_root, file=tsv)
        for n_frames, sr, spk_id, wav_path in tqdm(dataset, desc="tsv/txt/wav"):
            # Explicit raise rather than assert so the check survives `python -O`.
            # No separate existence check is needed: the dataset has already
            # read this file from the same root to obtain n_frames.
            if sr != 16000:
                raise ValueError(f"sampling rate {sr} != 16000")
            print(f"{wav_path}\t{n_frames}\t{spk_id}", file=tsv)
if __name__ == "__main__":
    # Script entry point: parse the command line and generate the manifest.
    main(get_parser().parse_args())