Prompt48 commited on
Commit
ff8fcd9
·
verified ·
1 Parent(s): 1d9eebe

Upload edit\Qwen3-TTS-test\.venv\Lib\site-packages\librosa\effects.py with huggingface_hub

Browse files
edit//Qwen3-TTS-test//.venv//Lib//site-packages//librosa//effects.py ADDED
@@ -0,0 +1,1002 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ Effects
5
+ =======
6
+
7
+ Harmonic-percussive source separation
8
+ -------------------------------------
9
+ .. autosummary::
10
+ :toctree: generated/
11
+
12
+ hpss
13
+ harmonic
14
+ percussive
15
+
16
+ Time and frequency
17
+ ------------------
18
+ .. autosummary::
19
+ :toctree: generated/
20
+
21
+ time_stretch
22
+ pitch_shift
23
+
24
+ Miscellaneous
25
+ -------------
26
+ .. autosummary::
27
+ :toctree: generated/
28
+
29
+ remix
30
+ trim
31
+ split
32
+ preemphasis
33
+ deemphasis
34
+ """
35
+
36
+ import numpy as np
37
+ import scipy.signal
38
+
39
+ from . import core
40
+ from . import decompose
41
+ from . import feature
42
+ from . import util
43
+ from .util.exceptions import ParameterError
44
+ from typing import Any, Callable, Iterable, Optional, Tuple, List, Union, overload
45
+ from typing_extensions import Literal
46
+ from numpy.typing import ArrayLike
47
+ from ._typing import (
48
+ _WindowSpec,
49
+ _PadModeSTFT,
50
+ _IntLike_co,
51
+ _FloatLike_co,
52
+ )
53
+
54
# Public API of this module.  Every function indexed in the module docstring
# is exported, including the pre-/de-emphasis filters.
__all__ = [
    "hpss",
    "harmonic",
    "percussive",
    "time_stretch",
    "pitch_shift",
    "remix",
    "trim",
    "split",
    "preemphasis",
    "deemphasis",
]
64
+
65
+
66
def hpss(
    y: np.ndarray,
    *,
    kernel_size: Union[
        _IntLike_co, Tuple[_IntLike_co, _IntLike_co], List[_IntLike_co]
    ] = 31,
    power: float = 2.0,
    mask: bool = False,
    margin: Union[
        _FloatLike_co, Tuple[_FloatLike_co, _FloatLike_co], List[_FloatLike_co]
    ] = 1.0,
    n_fft: int = 2048,
    hop_length: Optional[int] = None,
    win_length: Optional[int] = None,
    window: _WindowSpec = "hann",
    center: bool = True,
    pad_mode: _PadModeSTFT = "constant",
) -> Tuple[np.ndarray, np.ndarray]:
    """Decompose an audio time series into harmonic and percussive components.

    This function automates the STFT->HPSS->ISTFT pipeline, and ensures that
    the output waveforms have equal length to the input waveform ``y``.

    Parameters
    ----------
    y : np.ndarray [shape=(..., n)]
        audio time series. Multi-channel is supported.
    kernel_size
    power
    mask
    margin
        See `librosa.decompose.hpss`
    n_fft
    hop_length
    win_length
    window
    center
    pad_mode
        See `librosa.stft`.  ``window`` is applied to both the forward and
        the inverse transform.

    Returns
    -------
    y_harmonic : np.ndarray [shape=(..., n)]
        audio time series of the harmonic elements
    y_percussive : np.ndarray [shape=(..., n)]
        audio time series of the percussive elements

    See Also
    --------
    harmonic : Extract only the harmonic component
    percussive : Extract only the percussive component
    librosa.decompose.hpss : HPSS on spectrograms

    Examples
    --------
    >>> # Extract harmonic and percussive components
    >>> y, sr = librosa.load(librosa.ex('choice'))
    >>> y_harmonic, y_percussive = librosa.effects.hpss(y)

    >>> # Get a more isolated percussive component by widening its margin
    >>> y_harmonic, y_percussive = librosa.effects.hpss(y, margin=(1.0,5.0))
    """
    # Compute the STFT matrix.  The window must be forwarded explicitly,
    # otherwise a user-supplied window would be silently ignored.
    stft = core.stft(
        y,
        n_fft=n_fft,
        hop_length=hop_length,
        win_length=win_length,
        window=window,
        center=center,
        pad_mode=pad_mode,
    )

    # Decompose into harmonic and percussives
    stft_harm, stft_perc = decompose.hpss(
        stft, kernel_size=kernel_size, power=power, mask=mask, margin=margin
    )

    # Invert the STFTs with the same framing parameters (including the
    # window) and fix the length so the outputs line up with the input.
    n_samples = y.shape[-1]
    y_harm = core.istft(
        stft_harm,
        dtype=y.dtype,
        n_fft=n_fft,
        hop_length=hop_length,
        win_length=win_length,
        window=window,
        center=center,
        length=n_samples,
    )
    y_perc = core.istft(
        stft_perc,
        dtype=y.dtype,
        n_fft=n_fft,
        hop_length=hop_length,
        win_length=win_length,
        window=window,
        center=center,
        length=n_samples,
    )

    return y_harm, y_perc
164
+
165
+
166
def harmonic(
    y: np.ndarray,
    *,
    kernel_size: Union[
        _IntLike_co, Tuple[_IntLike_co, _IntLike_co], List[_IntLike_co]
    ] = 31,
    power: float = 2.0,
    mask: bool = False,
    margin: Union[
        _FloatLike_co, Tuple[_FloatLike_co, _FloatLike_co], List[_FloatLike_co]
    ] = 1.0,
    n_fft: int = 2048,
    hop_length: Optional[int] = None,
    win_length: Optional[int] = None,
    window: _WindowSpec = "hann",
    center: bool = True,
    pad_mode: _PadModeSTFT = "constant",
) -> np.ndarray:
    """Extract harmonic elements from an audio time-series.

    Parameters
    ----------
    y : np.ndarray [shape=(..., n)]
        audio time series. Multi-channel is supported.
    kernel_size
    power
    mask
    margin
        See `librosa.decompose.hpss`
    n_fft
    hop_length
    win_length
    window
    center
    pad_mode
        See `librosa.stft`.  ``window`` is applied to both the forward and
        the inverse transform.

    Returns
    -------
    y_harmonic : np.ndarray [shape=(..., n)]
        audio time series of just the harmonic portion

    See Also
    --------
    hpss : Separate harmonic and percussive components
    percussive : Extract only the percussive component
    librosa.decompose.hpss : HPSS for spectrograms

    Examples
    --------
    >>> # Extract harmonic component
    >>> y, sr = librosa.load(librosa.ex('choice'))
    >>> y_harmonic = librosa.effects.harmonic(y)

    >>> # Use a margin > 1.0 for greater harmonic separation
    >>> y_harmonic = librosa.effects.harmonic(y, margin=3.0)
    """
    # Compute the STFT matrix.  The window must be forwarded explicitly,
    # otherwise a user-supplied window would be silently ignored.
    stft = core.stft(
        y,
        n_fft=n_fft,
        hop_length=hop_length,
        win_length=win_length,
        window=window,
        center=center,
        pad_mode=pad_mode,
    )

    # Remove percussives: keep only the first (harmonic) output
    stft_harm = decompose.hpss(
        stft, kernel_size=kernel_size, power=power, mask=mask, margin=margin
    )[0]

    # Invert the STFT with matching framing parameters; the length is
    # fixed to that of the input signal.
    y_harm = core.istft(
        stft_harm,
        dtype=y.dtype,
        n_fft=n_fft,
        hop_length=hop_length,
        win_length=win_length,
        window=window,
        center=center,
        length=y.shape[-1],
    )

    return y_harm
250
+
251
+
252
def percussive(
    y: np.ndarray,
    *,
    kernel_size: Union[
        _IntLike_co, Tuple[_IntLike_co, _IntLike_co], List[_IntLike_co]
    ] = 31,
    power: float = 2.0,
    mask: bool = False,
    margin: Union[
        _FloatLike_co, Tuple[_FloatLike_co, _FloatLike_co], List[_FloatLike_co]
    ] = 1.0,
    n_fft: int = 2048,
    hop_length: Optional[int] = None,
    win_length: Optional[int] = None,
    window: _WindowSpec = "hann",
    center: bool = True,
    pad_mode: _PadModeSTFT = "constant",
) -> np.ndarray:
    """Extract percussive elements from an audio time-series.

    Parameters
    ----------
    y : np.ndarray [shape=(..., n)]
        audio time series. Multi-channel is supported.
    kernel_size
    power
    mask
    margin
        See `librosa.decompose.hpss`
    n_fft
    hop_length
    win_length
    window
    center
    pad_mode
        See `librosa.stft`.  ``window`` is applied to both the forward and
        the inverse transform.

    Returns
    -------
    y_percussive : np.ndarray [shape=(..., n)]
        audio time series of just the percussive portion

    See Also
    --------
    hpss : Separate harmonic and percussive components
    harmonic : Extract only the harmonic component
    librosa.decompose.hpss : HPSS for spectrograms

    Examples
    --------
    >>> # Extract percussive component
    >>> y, sr = librosa.load(librosa.ex('choice'))
    >>> y_percussive = librosa.effects.percussive(y)

    >>> # Use a margin > 1.0 for greater percussive separation
    >>> y_percussive = librosa.effects.percussive(y, margin=3.0)
    """
    # Compute the STFT matrix.  The window must be forwarded explicitly,
    # otherwise a user-supplied window would be silently ignored.
    stft = core.stft(
        y,
        n_fft=n_fft,
        hop_length=hop_length,
        win_length=win_length,
        window=window,
        center=center,
        pad_mode=pad_mode,
    )

    # Remove harmonics: keep only the second (percussive) output
    stft_perc = decompose.hpss(
        stft, kernel_size=kernel_size, power=power, mask=mask, margin=margin
    )[1]

    # Invert the STFT with matching framing parameters; the length is
    # fixed to that of the input signal.
    y_perc = core.istft(
        stft_perc,
        dtype=y.dtype,
        n_fft=n_fft,
        hop_length=hop_length,
        win_length=win_length,
        window=window,
        center=center,
        length=y.shape[-1],
    )

    return y_perc
336
+
337
+
338
def time_stretch(y: np.ndarray, *, rate: float, **kwargs: Any) -> np.ndarray:
    """Time-stretch an audio series by a fixed rate.

    Parameters
    ----------
    y : np.ndarray [shape=(..., n)]
        audio time series. Multi-channel is supported.
    rate : float > 0 [scalar]
        Stretch factor.  If ``rate > 1``, then the signal is sped up.
        If ``rate < 1``, then the signal is slowed down.
    **kwargs : additional keyword arguments.
        Forwarded to `librosa.stft`; see that function for details.
        Framing arguments (``n_fft``, ``hop_length``, ``win_length``,
        ``window``, ``center``) are re-used for the inverse transform.
        ``pad_mode``, ``dtype`` and ``out`` only affect the forward
        transform.

    Returns
    -------
    y_stretch : np.ndarray [shape=(..., round(n/rate))]
        audio time series stretched by the specified rate

    Raises
    ------
    ParameterError
        If ``rate <= 0``.

    See Also
    --------
    pitch_shift :
        pitch shifting
    librosa.phase_vocoder :
        spectrogram phase vocoder
    pyrubberband.pyrb.time_stretch :
        high-quality time stretching using RubberBand

    Examples
    --------
    Compress to be twice as fast

    >>> y, sr = librosa.load(librosa.ex('choice'))
    >>> y_fast = librosa.effects.time_stretch(y, rate=2.0)

    Or half the original speed

    >>> y_slow = librosa.effects.time_stretch(y, rate=0.5)
    """
    if rate <= 0:
        raise ParameterError("rate must be a positive number")

    # Construct the short-term Fourier transform (STFT)
    stft = core.stft(y, **kwargs)

    # Stretch by phase vocoding
    stft_stretch = core.phase_vocoder(
        stft,
        rate=rate,
        hop_length=kwargs.get("hop_length", None),
        n_fft=kwargs.get("n_fft", None),
    )

    # Predict the length of y_stretch
    len_stretch = int(round(y.shape[-1] / rate))

    # Only forward the options that are meaningful for the inverse transform:
    #   * ``pad_mode`` is not an inverse-STFT argument,
    #   * ``dtype`` is always derived from the input signal below,
    #   * ``out`` would refer to the (complex) forward-transform buffer.
    # Passing any of them through would raise a TypeError or corrupt the output.
    forward_only = ("pad_mode", "dtype", "out")
    istft_kwargs = {k: v for k, v in kwargs.items() if k not in forward_only}

    # Invert the STFT
    y_stretch = core.istft(
        stft_stretch, dtype=y.dtype, length=len_stretch, **istft_kwargs
    )

    return y_stretch
397
+
398
+
399
def pitch_shift(
    y: np.ndarray,
    *,
    sr: float,
    n_steps: float,
    bins_per_octave: int = 12,
    res_type: str = "soxr_hq",
    scale: bool = False,
    **kwargs: Any,
) -> np.ndarray:
    """Shift the pitch of a waveform by ``n_steps`` steps.

    With ``bins_per_octave=12`` one step corresponds to one semitone.
    The shift is realised by time-stretching the signal and then resampling
    it back to the original duration.

    Parameters
    ----------
    y : np.ndarray [shape=(..., n)]
        audio time series. Multi-channel is supported.

    sr : number > 0 [scalar]
        audio sampling rate of ``y``

    n_steps : float [scalar]
        how many (fractional) steps to shift ``y``

    bins_per_octave : int > 0 [scalar]
        how many steps per octave

    res_type : string
        Resample type. By default, 'soxr_hq' is used.

        See `librosa.resample` for more information.

    scale : bool
        Scale the resampled signal so that ``y`` and ``y_hat`` have
        approximately equal total energy.

    **kwargs : additional keyword arguments.
        Passed through to `time_stretch`, which forwards them to
        `librosa.stft`.

    Returns
    -------
    y_shift : np.ndarray [shape=(..., n)]
        The pitch-shifted audio time-series

    Raises
    ------
    ParameterError
        If ``bins_per_octave`` is not a positive integer.

    See Also
    --------
    time_stretch :
        time stretching
    librosa.phase_vocoder :
        spectrogram phase vocoder
    pyrubberband.pyrb.pitch_shift :
        high-quality pitch shifting using RubberBand

    Examples
    --------
    Shift up by a major third (four steps if ``bins_per_octave`` is 12)

    >>> y, sr = librosa.load(librosa.ex('choice'))
    >>> y_third = librosa.effects.pitch_shift(y, sr=sr, n_steps=4)

    Shift down by a tritone (six steps if ``bins_per_octave`` is 12)

    >>> y_tritone = librosa.effects.pitch_shift(y, sr=sr, n_steps=-6)

    Shift up by 3 quarter-tones

    >>> y_three_qt = librosa.effects.pitch_shift(y, sr=sr, n_steps=3,
    ...                                          bins_per_octave=24)
    """
    if not util.is_positive_int(bins_per_octave):
        raise ParameterError(
            f"bins_per_octave={bins_per_octave} must be a positive integer."
        )

    # Stretch factor that compensates for the requested shift
    rate = 2.0 ** (-float(n_steps) / bins_per_octave)

    # Step 1: change the duration without touching the pitch
    stretched = time_stretch(y, rate=rate, **kwargs)

    # Step 2: resample back, which restores the duration and moves the pitch
    shifted = core.resample(
        stretched,
        orig_sr=float(sr) / rate,
        target_sr=sr,
        res_type=res_type,
        scale=scale,
    )

    # Pad or crop so the result has exactly as many samples as the input
    n_samples = y.shape[-1]
    return util.fix_length(shifted, size=n_samples)
487
+
488
+
489
def remix(
    y: np.ndarray, intervals: Iterable[Tuple[int, int]], *, align_zeros: bool = True
) -> np.ndarray:
    """Remix an audio signal by re-ordering time intervals.

    Parameters
    ----------
    y : np.ndarray [shape=(..., t)]
        Audio time series. Multi-channel is supported.
    intervals : iterable of tuples (start, end)
        An iterable (list-like or generator) whose items give the start and
        end (in samples) of a slice of ``y``.  Slices are concatenated in
        iteration order.
    align_zeros : boolean
        If ``True``, interval boundaries are mapped to the closest
        zero-crossing in ``y`` (the end of the signal also counts as a valid
        boundary).  For multi-channel input, zero-crossings are computed on
        the mono down-mix.

    Returns
    -------
    y_remix : np.ndarray [shape=(..., d)]
        ``y`` remixed in the order specified by ``intervals``

    Examples
    --------
    Load in the example track and reverse the beats

    >>> y, sr = librosa.load(librosa.ex('choice'))

    Compute beats

    >>> _, beat_frames = librosa.beat.beat_track(y=y, sr=sr,
    ...                                          hop_length=512)

    Convert from frames to sample indices

    >>> beat_samples = librosa.frames_to_samples(beat_frames)

    Generate intervals from consecutive events

    >>> intervals = librosa.util.frame(beat_samples, frame_length=2,
    ...                                hop_length=1).T

    Reverse the beat intervals

    >>> y_out = librosa.effects.remix(y, intervals[::-1])
    """
    if align_zeros:
        mono = core.to_mono(y)
        # Sample positions of the zero-crossings along the time axis
        zeros = np.nonzero(core.zero_crossings(mono))[-1]
        # Force end-of-signal onto zeros
        zeros = np.append(zeros, [len(mono)])

    segments = []
    for interval in intervals:
        if align_zeros:
            # Snap both boundaries to their nearest admissible position
            interval = zeros[util.match_events(interval, zeros)]

        start, stop = interval[0], interval[1]
        segments.append(y[..., start:stop])

    return np.concatenate(segments, axis=-1)
551
+
552
+
553
def _signal_to_frame_nonsilent(
    y: np.ndarray,
    frame_length: int = 2048,
    hop_length: int = 512,
    top_db: float = 60,
    ref: Union[Callable, float] = np.max,
    aggregate: Callable = np.max,
) -> np.ndarray:
    """Compute a per-frame "is not silent" mask for an audio signal.

    Shared helper for `trim` and `split`.

    Parameters
    ----------
    y : np.ndarray
        Audio signal, mono or multi-channel

    frame_length : int > 0
        The number of samples per frame

    hop_length : int > 0
        The number of samples between frames

    top_db : number
        The threshold (in decibels) below reference to consider as
        silence.  A frame is kept when its level exceeds ``-top_db``.
        A negative ``top_db`` therefore treats anything below
        ``ref + |top_db|`` as silent, which is only meaningful when
        ``ref`` is not `np.max`.

    ref : callable or float
        The reference amplitude used for the decibel conversion

    aggregate : callable [default: np.max]
        Function used to combine dB values across all non-time axes
        (if the dB array has more than one dimension).  It is applied via
        ``np.apply_over_axes``.

    Returns
    -------
    non_silent : np.ndarray, shape=(m,), dtype=bool
        Indicator of non-silent frames
    """
    # Frame-wise RMS energy of the signal
    rms = feature.rms(y=y, frame_length=frame_length, hop_length=hop_length)

    # Drop the singleton feature axis and convert to decibels (no clipping)
    level_db: np.ndarray = core.amplitude_to_db(
        rms[..., 0, :], ref=ref, top_db=None
    )

    if level_db.ndim > 1:
        # Collapse everything except the trailing (time) axis ...
        leading = range(level_db.ndim - 1)
        level_db = np.apply_over_axes(aggregate, level_db, leading)
        # ... and remove the now-singleton leading axes, keeping time intact
        level_db = np.squeeze(level_db, axis=tuple(range(level_db.ndim - 1)))

    threshold = -top_db
    return level_db > threshold
610
+
611
+
612
def trim(
    y: np.ndarray,
    *,
    top_db: float = 60,
    ref: Union[float, Callable] = np.max,
    frame_length: int = 2048,
    hop_length: int = 512,
    aggregate: Callable = np.max,
) -> Tuple[np.ndarray, np.ndarray]:
    """Trim leading and trailing silence from an audio signal.

    Silence is defined as segments of the audio signal that are `top_db`
    decibels (or more) quieter than a reference level, `ref`.
    By default, `ref` is `np.max`, i.e. the loudest frame of the signal
    itself.  Consequently, a signal whose frame-wise level is uniform
    (including a completely silent one) has no frame that is quieter than
    the maximum, and nothing is trimmed.  In such situations an explicit
    numeric `ref` should be supplied instead.

    Parameters
    ----------
    y : np.ndarray, shape=(..., n)
        Audio signal. Multi-channel is supported.
    top_db : number
        The threshold (in decibels) below reference to consider as
        silence.
        You can also use a negative value for `top_db` to treat any value
        below `ref + |top_db|` as silent.  This will only make sense if
        `ref` is not `np.max`.
    ref : number or callable
        The reference amplitude.  By default, it uses `np.max` and compares
        to the peak level in the signal.
    frame_length : int > 0
        The number of samples per analysis frame
    hop_length : int > 0
        The number of samples between analysis frames
    aggregate : callable [default: np.max]
        Function to aggregate across channels (if y.ndim > 1)

    Returns
    -------
    y_trimmed : np.ndarray, shape=(..., m)
        The trimmed signal
    index : np.ndarray, shape=(2,)
        the interval of ``y`` corresponding to the non-silent region:
        ``y_trimmed = y[index[0]:index[1]]`` (for mono) or
        ``y_trimmed = y[:, index[0]:index[1]]`` (for stereo).
        If no frame is above the threshold, the interval is ``[0, 0]`` and
        ``y_trimmed`` is empty along the time axis.

    Examples
    --------
    >>> # Load some audio
    >>> y, sr = librosa.load(librosa.ex('choice'))
    >>> # Trim the beginning and ending silence
    >>> yt, index = librosa.effects.trim(y)
    >>> # Print the durations
    >>> print(librosa.get_duration(y=y, sr=sr), librosa.get_duration(y=yt, sr=sr))
    25.025986394557822 25.007891156462584
    """
    non_silent = _signal_to_frame_nonsilent(
        y,
        frame_length=frame_length,
        hop_length=hop_length,
        ref=ref,
        top_db=top_db,
        aggregate=aggregate,
    )

    # Indices of the frames that are above the silence threshold
    active = np.flatnonzero(non_silent)

    if active.size == 0:
        # Nothing is above the threshold: the entire signal is trimmed
        start, end = 0, 0
    else:
        first_frame, last_frame = active[0], active[-1]
        start = int(core.frames_to_samples(first_frame, hop_length=hop_length))
        # The end goes one frame past the last active frame, but never
        # beyond the end of the signal
        end = min(
            y.shape[-1],
            int(core.frames_to_samples(last_frame + 1, hop_length=hop_length)),
        )

    # Slice the buffer and return the corresponding interval
    return y[..., start:end], np.asarray([start, end])
696
+
697
+
698
def split(
    y: np.ndarray,
    *,
    top_db: float = 60,
    ref: Union[float, Callable] = np.max,
    frame_length: int = 2048,
    hop_length: int = 512,
    aggregate: Callable = np.max,
) -> np.ndarray:
    """Split an audio signal into non-silent intervals.

    Parameters
    ----------
    y : np.ndarray, shape=(..., n)
        An audio signal. Multi-channel is supported.
    top_db : number > 0
        The threshold (in decibels) below reference to consider as
        silence
    ref : number or callable
        The reference amplitude.  By default, it uses `np.max` and compares
        to the peak level in the signal.
    frame_length : int > 0
        The number of samples per analysis frame
    hop_length : int > 0
        The number of samples between analysis frames
    aggregate : callable [default: np.max]
        Function to aggregate across channels (if y.ndim > 1)

    Returns
    -------
    intervals : np.ndarray, shape=(m, 2)
        ``intervals[i] == (start_i, end_i)`` are the start and end time
        (in samples) of non-silent interval ``i``.
    """
    non_silent = _signal_to_frame_nonsilent(
        y,
        frame_length=frame_length,
        hop_length=hop_length,
        ref=ref,
        top_db=top_db,
        aggregate=aggregate,
    )

    # Interval slicing, adapted from
    # https://stackoverflow.com/questions/2619413/efficiently-finding-the-interval-with-non-zeros-in-scipy-numpy-in-python
    #
    # A non-zero difference marks a silent<->non-silent transition; the +1
    # compensates for the element lost by np.diff.
    transitions = np.flatnonzero(np.diff(non_silent.astype(int))) + 1

    boundaries = []
    # A signal that starts non-silent opens its first interval at frame 0
    if non_silent[0]:
        boundaries.append(np.array([0]))
    boundaries.append(transitions)
    # A signal that ends non-silent closes its last interval at the final frame
    if non_silent[-1]:
        boundaries.append(np.array([len(non_silent)]))

    # Convert from frames to samples
    sample_edges = core.frames_to_samples(
        np.concatenate(boundaries), hop_length=hop_length
    )

    # Clip to the signal duration
    sample_edges = np.minimum(sample_edges, y.shape[-1])

    # Consecutive boundaries pair up as (start, end) rows
    intervals: np.ndarray = sample_edges.reshape((-1, 2))
    return intervals
766
+
767
+
768
@overload
def preemphasis(
    y: np.ndarray,
    *,
    coef: float = ...,
    zi: Optional[ArrayLike] = ...,
    return_zf: Literal[False] = ...,
) -> np.ndarray: ...


@overload
def preemphasis(
    y: np.ndarray,
    *,
    coef: float = ...,
    zi: Optional[ArrayLike] = ...,
    return_zf: Literal[True],
) -> Tuple[np.ndarray, np.ndarray]: ...


@overload
def preemphasis(
    y: np.ndarray,
    *,
    coef: float = ...,
    zi: Optional[ArrayLike] = ...,
    return_zf: bool,
) -> Union[np.ndarray, Tuple[np.ndarray, np.ndarray]]: ...


def preemphasis(
    y: np.ndarray,
    *,
    coef: float = 0.97,
    zi: Optional[ArrayLike] = None,
    return_zf: bool = False,
) -> Union[np.ndarray, Tuple[np.ndarray, np.ndarray]]:
    """Pre-emphasize an audio signal with a first-order differencing filter:

        y[n] -> y[n] - coef * y[n-1]

    Parameters
    ----------
    y : np.ndarray [shape=(..., n)]
        Audio signal. Multi-channel is supported.

    coef : positive number
        Pre-emphasis coefficient.  Typical values of ``coef`` are between 0 and 1.

        At the limit ``coef=0``, the signal is unchanged.

        At ``coef=1``, the result is the first-order difference of the signal.

        The default (0.97) matches the pre-emphasis filter used in the HTK
        implementation of MFCCs [#]_.

        .. [#] https://htk.eng.cam.ac.uk/

    zi : number
        Initial filter state.  When making successive calls to non-overlapping
        frames, this can be set to the ``zf`` returned from the previous call.
        (See example below.)

        By default ``zi`` is initialized as ``2*y[0] - y[1]``.

    return_zf : boolean
        If ``True``, return the final filter state.
        If ``False``, only return the pre-emphasized signal.

    Returns
    -------
    y_out : np.ndarray
        pre-emphasized signal
    zf : number
        if ``return_zf=True``, the final filter state is also returned

    Examples
    --------
    Apply a standard pre-emphasis filter

    >>> import matplotlib.pyplot as plt
    >>> y, sr = librosa.load(librosa.ex('trumpet'))
    >>> y_filt = librosa.effects.preemphasis(y)
    >>> # and plot the results for comparison
    >>> S_orig = librosa.amplitude_to_db(np.abs(librosa.stft(y)), ref=np.max, top_db=None)
    >>> S_preemph = librosa.amplitude_to_db(np.abs(librosa.stft(y_filt)), ref=np.max, top_db=None)
    >>> fig, ax = plt.subplots(nrows=2, sharex=True, sharey=True)
    >>> librosa.display.specshow(S_orig, y_axis='log', x_axis='time', ax=ax[0])
    >>> ax[0].set(title='Original signal')
    >>> ax[0].label_outer()
    >>> img = librosa.display.specshow(S_preemph, y_axis='log', x_axis='time', ax=ax[1])
    >>> ax[1].set(title='Pre-emphasized signal')
    >>> fig.colorbar(img, ax=ax, format="%+2.f dB")

    Apply pre-emphasis in pieces for block streaming.  Note that the second block
    initializes ``zi`` with the final state ``zf`` returned by the first call.

    >>> y_filt_1, zf = librosa.effects.preemphasis(y[:1000], return_zf=True)
    >>> y_filt_2, zf = librosa.effects.preemphasis(y[1000:], zi=zf, return_zf=True)
    >>> np.allclose(y_filt, np.concatenate([y_filt_1, y_filt_2]))
    True

    See Also
    --------
    deemphasis
    """
    dtype = y.dtype

    # FIR filter: numerator [1, -coef], trivial denominator
    numerator = np.asarray([1.0, -coef], dtype=dtype)
    denominator = np.asarray([1.0], dtype=dtype)

    if zi is None:
        # Initialize the filter to implement linear extrapolation
        zi = 2 * y[..., :1] - y[..., 1:2]

    initial_state = np.asarray(np.atleast_1d(zi), dtype=dtype)

    filtered: np.ndarray
    final_state: np.ndarray
    filtered, final_state = scipy.signal.lfilter(
        numerator, denominator, y, zi=initial_state
    )

    return (filtered, final_state) if return_zf else filtered
892
+
893
+
894
@overload
def deemphasis(
    y: np.ndarray,
    *,
    coef: float = ...,
    zi: Optional[ArrayLike] = ...,
    return_zf: Literal[False] = ...,
) -> np.ndarray: ...


@overload
def deemphasis(
    y: np.ndarray,
    *,
    coef: float = ...,
    zi: Optional[ArrayLike] = ...,
    return_zf: Literal[True],
) -> Tuple[np.ndarray, np.ndarray]: ...


@overload
def deemphasis(
    y: np.ndarray,
    *,
    coef: float = ...,
    zi: Optional[ArrayLike] = ...,
    return_zf: bool,
) -> Union[np.ndarray, Tuple[np.ndarray, np.ndarray]]: ...


def deemphasis(
    y: np.ndarray,
    *,
    coef: float = 0.97,
    zi: Optional[ArrayLike] = None,
    return_zf: bool = False,
) -> Union[np.ndarray, Tuple[np.ndarray, np.ndarray]]:
    """De-emphasize an audio signal with the inverse operation of preemphasis().

    If ``y = preemphasis(x, coef=coef)``, the de-emphasis is the recursion::

        x[i] = y[i] + coef * x[i-1]

    which is computed by::

        x = deemphasis(y, coef=coef)

    Parameters
    ----------
    y : np.ndarray [shape=(..., n)]
        Audio signal. Multi-channel is supported.

    coef : positive number
        Pre-emphasis coefficient.  Typical values of ``coef`` are between 0 and 1.

        At the limit ``coef=0``, the signal is unchanged.

        At ``coef=1``, the recursion above becomes a running sum, i.e. the
        inverse of first-order differencing.

        The default (0.97) matches the pre-emphasis filter used in the HTK
        implementation of MFCCs [#]_.

        .. [#] https://htk.eng.cam.ac.uk/

    zi : number
        Initial filter state.  The value is handed directly to
        `scipy.signal.lfilter` as the initial condition of the recursive
        filter.  When processing consecutive blocks, pass the ``zf`` returned
        by the previous call.

        By default, the state is initialized as
        ``-((2 - coef) * y[0] - y[1]) / (3 - coef)``.  This undoes the default
        linear-extrapolation initialization of ``zi`` in ``preemphasis()``,
        ``2*x[0] - x[1]``.

        NOTE(review): for an explicit non-zero ``zi`` that was also given to
        ``preemphasis()``, confirm the sign convention before relying on
        exact inversion.

    return_zf : boolean
        If ``True``, return the final filter state.
        If ``False``, only return the de-emphasized signal.

    Returns
    -------
    y_out : np.ndarray
        de-emphasized signal
    zf : number
        if ``return_zf=True``, the final filter state is also returned.
        It is the true state of the filter after the last sample (also when
        the default ``zi`` is used), so it can seed the next block.

    Examples
    --------
    Apply a standard pre-emphasis filter and invert it with de-emphasis

    >>> y, sr = librosa.load(librosa.ex('trumpet'))
    >>> y_filt = librosa.effects.preemphasis(y)
    >>> y_deemph = librosa.effects.deemphasis(y_filt)
    >>> np.allclose(y, y_deemph)
    True

    See Also
    --------
    preemphasis
    """
    # Coefficients of the pre-emphasis FIR filter; swapping numerator and
    # denominator in the lfilter call below yields its inverse (an IIR filter).
    b = np.array([1.0, -coef], dtype=y.dtype)
    a = np.array([1.0], dtype=y.dtype)

    if zi is None:
        # Initial state that accounts for the linear extrapolation used by
        # preemphasis().  Starting the filter from this state (rather than
        # filtering from zero and correcting the output afterwards) keeps
        # the returned final state consistent with the returned signal.
        zi = -((2 - coef) * y[..., 0:1] - y[..., 1:2]) / (3 - coef)

    state = np.asarray(np.atleast_1d(zi), dtype=y.dtype)

    y_out: np.ndarray
    zf: np.ndarray
    y_out, zf = scipy.signal.lfilter(a, b, y, zi=state)

    if return_zf:
        return y_out, zf
    return y_out