| | import os
|
| | import sys
|
| | import soundfile as sf
|
| | from tqdm import tqdm
|
| |
|
def is_significant_audio(file_path, silence_threshold=-40, silence_percent=90):
    """
    Check if an audio file contains significant non-silent parts.

    A frame counts as silent when its instantaneous power (averaged across
    channels for multi-channel audio) is below ``silence_threshold`` decibels
    relative to full scale. ``soundfile.read`` returns float samples scaled
    to [-1.0, 1.0] by default, so full scale corresponds to a power of 1.0.

    Args:
        file_path: Path of the audio file; passed straight to ``sf.read``.
        silence_threshold: Silence threshold in dB relative to full scale.
        silence_percent: Percentage of silent frames at or above which the
            file is considered insignificant.

    Returns:
        True if fewer than ``silence_percent`` percent of the frames are
        silent. False if the file is empty, mostly silent, or unreadable.
    """
    try:
        data, _samplerate = sf.read(file_path)
    except Exception as e:
        # Best effort: an unreadable or corrupt file is reported and treated
        # as not significant instead of aborting the whole filtering run.
        print(f"Error processing {file_path}: {e}")
        return False

    if len(data) == 0:
        return False

    # Convert the dB threshold to linear power once, so the comparison below
    # is between like units (comparing raw power against a negative dB value
    # can never be true, since power is never negative).
    power_threshold = 10 ** (silence_threshold / 10)

    power = data ** 2
    if power.ndim > 1:
        # Multi-channel audio arrives as (frames, channels); collapse the
        # channels so there is exactly one power value per frame.
        power = power.mean(axis=1)

    silence_ratio = (power < power_threshold).sum() / len(power) * 100
    return bool(silence_ratio < silence_percent)
|
| |
|
def filter_manifest(manifest_path, output_path, dataset_dir):
    """
    Read the manifest file, check for silence, and write filtered files.

    The first manifest line is copied to the output unchanged. Every later
    line is expected to start with an audio path (relative to
    ``dataset_dir``), optionally followed by tab-separated fields. A line is
    kept only if ``is_significant_audio`` accepts the file it points at.

    Args:
        manifest_path: Path of the tab-separated manifest to read.
        output_path: Path the filtered manifest is written to.
        dataset_dir: Directory the manifest's audio paths are joined onto.

    Returns:
        None. The result is written to ``output_path``.
    """
    with open(manifest_path, 'r', encoding="utf-8") as f:
        lines = f.readlines()

    if not lines:
        # An empty manifest has no header and nothing to filter; emit an
        # empty output instead of failing on the missing first line.
        with open(output_path, 'w', encoding="utf-8"):
            pass
        return

    filtered_lines = [lines[0]]
    for line in tqdm(lines[1:], desc=f"Processing {manifest_path}"):
        # Strip the line terminator before splitting: on a line without a
        # tab the first field would otherwise keep its trailing newline and
        # never resolve to a real file.
        relative_path = line.rstrip("\r\n").split("\t")[0]
        if not relative_path:
            # Blank line (or empty path field): there is no file to check.
            continue

        file_path = os.path.join(dataset_dir, relative_path)
        if is_significant_audio(file_path):
            filtered_lines.append(line)
        else:
            print(f"Skipping file due to silence: {file_path}")

    with open(output_path, 'w', encoding="utf-8") as f_out:
        f_out.writelines(filtered_lines)
|
| |
|
if __name__ == "__main__":
    # Usage: <script> TRAIN_MANIFEST VALID_MANIFEST OUTPUT_DIR [DATASET_DIR]
    # Fail with a readable usage message instead of a bare IndexError when
    # required arguments are missing.
    if len(sys.argv) < 4:
        sys.exit(
            f"Usage: {sys.argv[0]} TRAIN_MANIFEST VALID_MANIFEST OUTPUT_DIR [DATASET_DIR]"
        )

    train_manifest, valid_manifest, output_dir = sys.argv[1:4]
    # Optional fourth argument overrides the directory the manifest paths
    # are resolved against; the default matches the previous fixed value.
    dataset_dir = sys.argv[4] if len(sys.argv) > 4 else "dataset"

    os.makedirs(output_dir, exist_ok=True)

    filter_manifest(train_manifest, os.path.join(output_dir, "train.tsv"), dataset_dir)
    filter_manifest(valid_manifest, os.path.join(output_dir, "valid.tsv"), dataset_dir)
|
| |
|