Spaces:
Running
Running
File size: 1,110 Bytes
c2e60bb |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 |
#!/usr/bin/env python3
import os
from datasets import load_dataset, concatenate_datasets, Dataset
# Build a small combined evaluation set: draw a seeded random sample from each
# source dataset, tag every row with the name of the dataset it came from,
# keep only the shared columns, and save the concatenation to app/data/test.

SEED = 42  # fixed shuffle seed so the same rows are drawn on every run
SAMPLE_SIZE = 100  # maximum number of rows drawn from each source dataset

# Columns kept in the combined dataset; every other column is removed.
_KEEP_COLUMNS = ("audio", "ipa", "dataset")

# (label, split) pairs. The label is written into the "dataset" column.
# NOTE(review): ISLE is read from its "train" split while the others use
# "test" -- presumably ISLE publishes no "test" split; confirm.
testsets: list[tuple[str, Dataset]] = [
    ("TIMIT", load_dataset("KoelLabs/TIMIT")["test"]),
    ("EpaDB", load_dataset("KoelLabs/EpaDB")["test"]),
    ("PSST", load_dataset("KoelLabs/PSST")["test"]),
    ("SpeechOcean", load_dataset("KoelLabs/SpeechOceanNoTH")["test"]),
    ("ISLE", load_dataset("KoelLabs/ISLE")["train"]),
]  # type: ignore

all_datasets = []
for name, test_ds in testsets:
    shuffled_ds = test_ds.shuffle(seed=SEED)
    # Clamp the sample size: select() raises IndexError when asked for an
    # index past the end, so a split with fewer than SAMPLE_SIZE rows
    # contributes all of its rows instead of aborting the whole script.
    sample_count = min(SAMPLE_SIZE, len(shuffled_ds))
    sample_ds = shuffled_ds.select(range(sample_count))
    # Record the origin of each row before the per-dataset columns are dropped.
    sample_ds = sample_ds.add_column("dataset", [name] * len(sample_ds))  # type: ignore
    sample_ds = sample_ds.remove_columns(
        [col for col in sample_ds.column_names if col not in _KEEP_COLUMNS]
    )
    all_datasets.append(sample_ds)

combined_ds: Dataset = concatenate_datasets(all_datasets)

# Create the parent directory first, then write the dataset to app/data/test.
output_dir = os.path.join("app", "data")
os.makedirs(output_dir, exist_ok=True)
combined_ds.save_to_disk(os.path.join(output_dir, "test"))
|