Source code for mexca.audio.features

"""Compute audio signal properties to extract voice features.

This module contains classes and methods to compute and store properties of audio signals
that can be used to extract voice features.

There are two main types of classes: *Signal* (inherits from `BaseSignal`) and
*Frames* (inherits from `BaseFrames`). Signals contain data about an entire
signal (e.g., the audio signal itself) whereas Frames contain transformed and
aggregated data about overlapping slices of the signal.

"""

import logging
import math
from copy import copy
from typing import List, Optional, Tuple, Union
import librosa
import numpy as np
from scipy.interpolate import interp1d
from scipy.signal import find_peaks
from scipy.signal.windows import get_window
from mexca.utils import ClassInitMessage


[docs]class BaseSignal:
    """Store a signal.

    Parameters
    ----------
    sig: numpy.ndarray
        Signal.
    sr: int
        Sampling rate.
    """

    _ts: Optional[np.ndarray] = None
    _idx: Optional[np.ndarray] = None

    def __init__(self, sig: np.ndarray, sr: int) -> None:
        self.logger = logging.getLogger("mexca.audio.extraction.BaseSignal")
        self.sig = sig
        self.sr = sr
        self.logger.debug(ClassInitMessage())

    @property
[docs]    def idx(self) -> np.ndarray:
        """Sample indices (read-only)."""
        if self._idx is None:
            self._idx = np.arange(self.sig.shape[0])
        return self._idx

    @property
[docs]    def ts(self) -> np.ndarray:
        """Sample timestamps (read-only)."""
        if self._ts is None:
            self._ts = librosa.samples_to_time(self.idx, sr=self.sr)
        return self._ts


[docs]class AudioSignal(BaseSignal):
    """Load and store an audio signal.

    Parameters
    ----------
    sig: numpy.ndarray
        Audio signal.
    sr: int
        Sampling rate.
    mono: bool, default=True
        Whether the signal has been converted to mono or not.
    filename: str, optional
        Name of the audio file associated with the signal.
    """

    def __init__(
        self,
        sig: np.ndarray,
        sr: int,
        mono: bool = True,
        filename: Optional[str] = None,
    ) -> None:
        self.logger = logging.getLogger("mexca.audio.extraction.AudioSignal")
        self.filename = filename
        self.mono = mono
        super().__init__(sig, sr)
        self.logger.debug(ClassInitMessage())

    @classmethod
[docs]    def from_file(cls, filename: str, sr: Optional[float] = None, mono: bool = True):
        """Load a signal from an audio file.

        Parameters
        ----------
        filename: str
            Name of the audio file.
            File types must be supported by ``soundfile`` or ``audiofile``.
            See :func:`librosa.load`.
        sr: float, optional, default=None
            Sampling rate. If `None`, is detected from the file, otherwise the signal is resampled.
        mono: bool, default=True
            Whether to convert the signal to mono.
        """
        sig, nat_sr = librosa.load(path=filename, sr=sr, mono=mono)
        return cls(sig, nat_sr, mono, filename)


[docs]class BaseFrames:
    """Create and store signal frames.

    A frame is an (overlapping, padded) slice of a signal for which higher-order
    features can be computed.

    Parameters
    ----------
    frames: numpy.ndarray
        Signal frames. The first dimension should be the number of frames.
    sr: int
        Sampling rate.
    frame_len: int
        Number of samples per frame.
    hop_len: int
        Number of samples between frame starting points.
    center: bool, default=True
        Whether the signal has been centered and padded before framing.
    pad_mode: str, default='constant'
        How the signal has been padded before framing. See :func:`numpy.pad`.
        Uses the default value 0 for `'constant'` padding.

    See Also
    --------
    librosa.util.frame

    """

    _ts: Optional[np.ndarray] = None
    _idx: Optional[np.ndarray] = None

    def __init__(
        self,
        frames: np.ndarray,
        sr: int,
        frame_len: int,
        hop_len: int,
        center: bool = True,
        pad_mode: str = "constant",
    ) -> None:
        self.logger = logging.getLogger("mexca.audio.extraction.BaseFrames")
        self.frames = frames
        self.sr = sr
        self.frame_len = frame_len
        self.hop_len = hop_len
        self.center = center
        self.pad_mode = pad_mode
        self.logger.debug(ClassInitMessage())

    @property
[docs]    def idx(self) -> np.ndarray:
        """Frame indices (read-only)."""
        if self._idx is None:
            self._idx = np.arange(self.frames.shape[0])
        return self._idx

    @property
[docs]    def ts(self) -> np.ndarray:
        """Frame timestamps (read-only)."""
        if self._ts is None:
            self._ts = librosa.frames_to_time(
                self.idx, sr=self.sr, hop_length=self.hop_len
            )
        return self._ts

    @classmethod
[docs]    def from_signal(
        cls,
        sig_obj: BaseSignal,
        frame_len: int,
        hop_len: Optional[int] = None,
        center: bool = True,
        pad_mode: str = "constant",
    ):
        """Create frames from a signal.

        Parameters
        ----------
        sig_obj: BaseSignal
            Signal object.
        frame_len: int
            Number of samples per frame.
        hop_len: int, optional, default=None
            Number of samples between frame starting points. If `None`, uses `frame_len // 4`.
        center: bool, default=True
            Whether to center the frames and apply padding.
        pad_mode: str, default='constant'
            How the signal is padded before framing. See :func:`numpy.pad`.
            Uses the default value 0 for `'constant'` padding. Ignored if `center=False`.
        """
        if hop_len is None:
            hop_len = frame_len // 4
        sig = sig_obj.sig
        # Pad signal if centering
        if center:
            padding = [(0, 0) for _ in sig_obj.sig.shape]
            padding[-1] = (frame_len // 2, frame_len // 2)
            sig = np.pad(sig, padding, mode=pad_mode)

        frames = librosa.util.frame(
            sig, frame_length=frame_len, hop_length=hop_len, axis=0
        )

        return cls(frames, sig_obj.sr, frame_len, hop_len, center, pad_mode)


[docs]class PitchFrames(BaseFrames):
    """Estimate and store pitch frames.

    Estimate and store the voice pitch measured as the fundamental frequency F0 in Hz.

    Parameters
    ----------
    frames: numpy.ndarray
        Voice pitch frames in Hz with shape (num_frames,).
    flag: numpy.ndarray
        Boolean flags indicating which frames are voiced with shape (num_frames,).
    prob: numpy.ndarray
        Probabilities for frames being voiced with shape (num_frames,).
    lower: float
        Lower limit used for pitch estimation (in Hz).
    upper: float
        Upper limit used for pitch estimation (in Hz).
    method: str
        Method used for estimating voice pitch.

    See Also
    --------
    librosa.pyin
    librosa.yin

    """

    def __init__(
        self,
        frames: np.ndarray,
        flag: np.ndarray,
        prob: np.ndarray,
        sr: int,
        lower: float,
        upper: float,
        frame_len: int,
        hop_len: int,
        method: str,
        center: bool = True,
        pad_mode: str = "constant",
    ):
        self.logger = logging.getLogger("mexca.audio.extraction.PitchFrames")
        self.flag = flag
        self.probs = prob
        self.lower = lower
        self.upper = upper
        self.method = method
        super().__init__(frames, sr, frame_len, hop_len, center, pad_mode)
        self.logger.debug(ClassInitMessage())

    @classmethod
[docs]    def from_signal(
        cls,
        sig_obj: BaseSignal,
        frame_len: int,
        hop_len: Optional[int] = None,
        center: bool = True,
        pad_mode: str = "constant",
        lower: float = 75.0,
        upper: float = 600.0,
        method: str = "pyin",
    ):
        """Estimate the voice pitch frames from a signal.

        Currently, voice pitch can only be extracted with the *pYIN* method.

        Parameters
        ----------
        sig_obj: BaseSignal
            Signal object.
        frame_len: int
            Number of samples per frame.
        hop_len: int, optional, default=None
            Number of samples between frame starting points. If `None`, uses `frame_len // 4`.
        center: bool, default=True
            Whether to center the frames and apply padding.
        pad_mode: str, default='constant'
            How the signal is padded before framing. See :func:`numpy.pad`.
            Uses the default value 0 for `'constant'` padding. Ignored if `center=False`.
        lower: float, default = 75.0
            Lower limit for pitch estimation (in Hz).
        upper: float, default = 600.0
            Upper limit for pitch estimation (in Hz).
        method: str, default = 'pyin'
            Method for estimating voice pitch. Only `'pyin'` is currently available.

        Raises
        ------
        NotImplementedError
            If a method other than `'pyin'` is given.

        """
        if method == "pyin":
            pitch_f0, flag, prob = librosa.pyin(
                sig_obj.sig,
                fmin=lower,
                fmax=upper,
                sr=sig_obj.sr,
                frame_length=frame_len,
                hop_length=hop_len,
                center=center,
                pad_mode=pad_mode,
            )
        else:
            raise NotImplementedError('Only the "pyin" method is currently available')

        return cls(
            frames=pitch_f0,
            flag=flag,
            prob=prob,
            sr=sig_obj.sr,
            lower=lower,
            upper=upper,
            frame_len=frame_len,
            hop_len=hop_len,
            method=method,
        )


[docs]class SpecFrames(BaseFrames):
    """Create and store spectrogram frames.

    Computes a spectrogram of a signal using the short-time Fourier transform (STFT).

    Parameters
    ----------
    frames: np.ndarray
        Spectrogram frames.
    window: str
        The window that was applied before the STFT.

    Notes
    -----
    Frames contain complex arrays `x` where ``np.abs(x)`` is the magnitude and
    ``np.angle(x)`` is the phase of the signal for different frequency bins.

    See Also
    --------
    librosa.stft

    """

    def __init__(
        self,
        frames: np.ndarray,
        sr: int,
        window: str,
        frame_len: int,
        hop_len: int,
        center: bool = True,
        pad_mode: str = "constant",
    ) -> None:
        self.logger = logging.getLogger("mexca.audio.extraction.SpecFrames")
        self.window = window
        super().__init__(frames, sr, frame_len, hop_len, center, pad_mode)
        self.logger.debug(ClassInitMessage())

    @classmethod
[docs]    def from_signal(
        cls,
        sig_obj: BaseSignal,
        frame_len: int,
        hop_len: Optional[int] = None,
        center: bool = True,
        pad_mode: str = "constant",
        window: Union[str, float, Tuple] = "hamming",
    ):
        """Transform a signal into spectrogram frames.

        Parameters
        ----------
        sig_obj: BaseSignal
            Signal object.
        frame_len: int
            Number of samples per frame.
        hop_len: int, optional, default=None
            Number of samples between frame starting points. If `None`, uses `frame_len // 4`.
        center: bool, default=True
            Whether to center the frames and apply padding.
        pad_mode: str, default='constant'
            How the signal is padded before framing. See :func:`numpy.pad`.
            Uses the default value 0 for `'constant'` padding. Ignored if `center=False`.
        window: str
            The window that is applied before the STFT.
        """
        spec_frames = librosa.stft(
            sig_obj.sig,
            n_fft=frame_len,
            hop_length=hop_len,
            window=window,
            center=center,
            pad_mode=pad_mode,
        )
        return cls(
            np.swapaxes(spec_frames, 0, 1),
            sig_obj.sr,
            window,
            frame_len,
            hop_len,
            center,
            pad_mode,
        )


[docs]class FormantFrames(BaseFrames):
    """Estimate and store formant frames.

    Parameters
    ----------
    frames: list
        Formant frames. Each frame contains a list of tuples for each formant, where the first item
        is the central frequency and the second the bandwidth.
    max_formants: int, default=5
        The maximum number of formants that were extracted.
    lower: float, default=50.0
        Lower limit for formant frequencies (in Hz).
    upper: float, default=5450.0
        Upper limit for formant frequencies (in Hz).
    preemphasis_from: float, default=50.0
        Starting value for the applied preemphasis function.
    window: str
        Window function that was applied before formant estimation.

    Notes
    -----
    Estimate formants of the signal in each frame:

    1. Apply a preemphasis function with the coefficient
       ``math.exp(-2 * math.pi * preemphasis_from * (1 / sr))``
       to the signal.
    2. Apply a window function to the signal. By default, the same Gaussian window as in
       Praat is used: ``(np.exp(-48.0 * (n - ((N + 1)/2)**2 / (N + 1)**2) - np.exp(-12.0)) / (1.0 - np.exp(-12.0))``,
       where `N` is the length of the window and `n` the index of each sample.
    3. Calculate linear predictive coefficients using :func:`librosa.lpc`
       with order ``2 * max_formants``.
    4. Find the roots of the coefficients.
    5. Compute the formant central frequencies as
       ``np.abs(np.arctan2(np.imag(roots), np.real(roots))) * sr / (2 * math.pi)``.
    6. Compute the formant bandwidth as
       ``np.sqrt(np.abs(np.real(roots) ** 2) + np.abs(np.imag(roots) ** 2)) * sr / (2 * math.pi)``.
    7. Filter out formants outside the lower and upper limits.

    """

    def __init__(
        self,
        frames: List,
        sr: int,
        frame_len: int,
        hop_len: int,
        center: bool = True,
        pad_mode: str = "constant",
        max_formants: int = 5,
        lower: float = 50.0,
        upper: float = 5450.0,
        preemphasis_from: Optional[float] = 50.0,
        window: Optional[Union[str, float, Tuple]] = "praat_gaussian"
    ) -> None:
        self.logger = logging.getLogger("mexca.audio.extraction.FormantFrames")
        self.max_formants = max_formants
        self.lower = lower
        self.upper = upper
        self.preemphasis_from = preemphasis_from
        self.window = window
        super().__init__(frames, sr, frame_len, hop_len, center, pad_mode)
        self.logger.debug(ClassInitMessage())

    @property
[docs]    def idx(self) -> np.ndarray:
        if self._idx is None:
            self._idx = np.arange(len(self.frames))
        return self._idx
    

    @staticmethod
    def _praat_gauss_window(frame_len: int):
        # This is the Gaussian window that is used in Praat for formant estimation
        # See: https://github.com/YannickJadoul/Parselmouth/blob/master/praat/fon/Sound_to_Formant.cpp
        sample_idx = np.arange(frame_len)+1
        idx_mid = 0.5 * (frame_len + 1)
        edge = np.exp(-12.0)
        return (np.exp(-48.0 * (sample_idx - idx_mid)**2 / (frame_len)**2) - edge) / (1.0 - edge)


    @classmethod
[docs]    def from_frames(
        cls,
        sig_frames_obj: BaseFrames,
        max_formants: int = 5,
        lower: float = 50.0,
        upper: float = 5450.0,
        preemphasis_from: Optional[float] = 50.0,
        window: Optional[Union[str, float, Tuple]] = "praat_gaussian"
    ):
        """Extract formants from signal frames.

        Parameters
        ----------
        sig_frames_obj: BaseFrames
            Signal frames object.
        max_formants: int, default=5
            The maximum number of formants that were extracted.
        lower: float, default=50.0
            Lower limit for formant frequencies (in Hz).
        upper: float, default=5450.0
            Upper limit for formant frequencies (in Hz).
        preemphasis_from: float, default=50.0
            Starting value for the preemphasis function.
        window: str
            Window function.

        """
        frames = sig_frames_obj.frames

        if preemphasis_from is not None:
            pre_coef = math.exp(
                -2 * math.pi * preemphasis_from * (1 / sig_frames_obj.sr)
            )
            frames = librosa.effects.preemphasis(sig_frames_obj.frames, coef=pre_coef)
        if window is not None:
            if window == "praat_gaussian":
                win = cls._praat_gauss_window(sig_frames_obj.frame_len)
            else:
                win = get_window(window, sig_frames_obj.frame_len, fftbins=False)
            frames = frames * win

        # Calc linear predictive coefficients
        coefs = librosa.lpc(frames, order=max_formants * 2)
        # Transform LPCs to formants
        formants = [
            cls._calc_formants(coef, sig_frames_obj.sr, lower, upper) for coef in coefs
        ]

        return cls(
            formants,
            sig_frames_obj.sr,
            sig_frames_obj.frame_len,
            sig_frames_obj.hop_len,
            sig_frames_obj.center,
            sig_frames_obj.pad_mode,
            max_formants,
            lower,
            upper,
            preemphasis_from,
            window,
        )

    @staticmethod
    def _calc_formants(
        coefs: np.ndarray, sr: int, lower: float = 50, upper: float = 5450
    ) -> List:
        # Function to compute complex norm
        def complex_norm(x):
            return np.sqrt(np.abs(np.real(x) ** 2) + np.abs(np.imag(x) ** 2))

        nf_pi = sr / (2 * math.pi)  # sr/2 = Nyquist freq
        # Find roots of linear coefficients
        roots = np.roots(coefs)
        # Select roots with positive imag part
        mask = np.imag(roots) > 0
        roots = roots[mask]
        # Calc angular frequency
        ang_freq = np.abs(np.arctan2(np.imag(roots), np.real(roots)))
        # Calc formant centre freq
        formant_freqs = ang_freq * nf_pi
        # Calc formant bandwidth
        formant_bws = -np.log(np.apply_along_axis(complex_norm, 0, roots)) * nf_pi
        # Select formants within boundaries
        in_bounds = np.logical_and(formant_freqs > lower, formant_freqs < upper)
        formants_sorted = sorted(
            list(zip(formant_freqs[in_bounds], formant_bws[in_bounds]))
        )
        return formants_sorted

    def select_formant_attr(self, formant_idx: int, attr_idx: int) -> np.ndarray:
        return np.array(
            [
                f[formant_idx][attr_idx] if len(f) > formant_idx else np.nan
                for f in self.frames
            ]
        )


[docs]class PitchHarmonicsFrames(BaseFrames):
    """Estimate and store voice pitch harmonics.

    Compute the energy of the signal at harmonics (`nF0` for any integer n) of
    the fundamental frequency.

    Parameters
    ----------
    frames: numpy.ndarray
        Harmonics frames with the shape (num_frames, n_harmonics)
    n_harmonics: int, default=100
        Number of estimated harmonics.

    See Also
    --------
    librosa.f0_harmonics

    """

    def __init__(
        self,
        frames: np.ndarray,
        sr: int,
        frame_len: int,
        hop_len: int,
        center: bool = True,
        pad_mode: str = "constant",
        n_harmonics: int = 100,
    ):
        self.logger = logging.getLogger("mexca.audio.extraction.PitchHarmonicsFrames")
        self.n_harmonics = n_harmonics
        super().__init__(frames, sr, frame_len, hop_len, center, pad_mode)
        self.logger.debug(ClassInitMessage())

    @classmethod
[docs]    def from_spec_and_pitch_frames(
        cls,
        spec_frames_obj: SpecFrames,
        pitch_frames_obj: PitchFrames,
        n_harmonics: int = 100,
    ):
        """Estimate voice pitch harmonics from spectrogram frames and voice pitch frames.

        Parameters
        ----------
        spec_frames_obj: SpecFrames
            Spectrogram frames object.
        pitch_frames_obj: PitchFrames
            Pitch frames object.
        n_harmonics: int, default=100
            Number of harmonics to estimate.

        """
        freqs = librosa.fft_frequencies(
            sr=spec_frames_obj.sr, n_fft=spec_frames_obj.frame_len
        )

        # harmonics = librosa.f0_harmonics(
        #     np.abs(spec_frames_obj.frames),
        #     freqs=freqs,
        #     f0=pitch_frames_obj.frames,
        #     harmonics=np.arange(n_harmonics) + 1,  # Shift one up
        #     axis=-1,
        # )

        harmonics = cls._calc_f0_harmonics(
            spec_frames_obj.frames, freqs, pitch_frames_obj.frames, n_harmonics
        )

        return cls(
            harmonics,
            spec_frames_obj.sr,
            spec_frames_obj.frame_len,
            spec_frames_obj.hop_len,
            spec_frames_obj.center,
            spec_frames_obj.pad_mode,
            n_harmonics,
        )

    @staticmethod
    def _calc_f0_harmonics(
        spec_frames: np.ndarray,
        freqs: np.ndarray,
        f0_frames: np.ndarray,
        n_harmonics: int,
    ) -> np.ndarray:
        # Adapted from librosa.f0_harmonics, see:
        # https://librosa.org/doc/latest/generated/librosa.f0_harmonics.html#librosa.f0_harmonics
        is_valid = np.isfinite(freqs)

        def mag_interp_fun(spec_frames, f0_harmonic_freqs):
            interp = interp1d(
                freqs[is_valid],
                spec_frames[is_valid],
                axis=0,
                copy=False,
                assume_sorted=False,
                bounds_error=False,
                fill_value=0,
            )
            return interp(f0_harmonic_freqs)

        xfunc = np.vectorize(mag_interp_fun, signature="(f),(h)->(h)")
        harmonics_frames = xfunc(
            np.abs(spec_frames),
            np.multiply.outer(f0_frames, np.arange(n_harmonics) + 1),  # Shift one up
        )

        return harmonics_frames


[docs]class FormantAmplitudeFrames(BaseFrames):
    """Estimate and store formant amplitudes.

    Parameters
    ----------
    frames: np.ndarray
        Formant amplitude frames of shape (num_frames, max_formants) in dB.
    lower: float
        Lower boundary for peak amplitude search interval.
    upper: float
        Upper boundary for peak amplitude search interval.
    rel_f0: bool
        Whether the amplitude is relative to the fundamental frequency amplitude.

    Notes
    -----
    Estimate the formant amplitude as the maximum amplitude of harmonics of the
    fundamental frequency within an interval ``[lower*f, upper*f]`` where `f` is the
    central frequency of the formant in each frame. If ``rel=True``, divide the amplitude by the amplitude of
    the fundamental frequency.

    """

    def __init__(
        self,
        frames: np.ndarray,
        sr: int,
        frame_len: int,
        hop_len: int,
        center: bool,
        pad_mode: str,
        lower: float,
        upper: float,
        rel_f0: bool,
    ):
        self.lower = lower
        self.upper = upper
        self.rel_f0 = rel_f0
        super().__init__(frames, sr, frame_len, hop_len, center, pad_mode)

    @property
[docs]    def idx(self) -> np.ndarray:
        if self._idx is None:
            self._idx = np.arange(len(self.frames))
        return self._idx

    @classmethod
[docs]    def from_formant_harmonics_and_pitch_frames(  # pylint: disable=too-many-locals
        cls,
        formant_frames_obj: FormantFrames,
        harmonics_frames_obj: PitchHarmonicsFrames,
        pitch_frames_obj: PitchFrames,
        lower: float = 0.8,
        upper: float = 1.2,
        rel_f0: bool = True,
    ):
        """Estimate formant amplitudes from formant, pitch harmonics, and pitch frames.

        Parameters
        ----------
        formant_frames_obj: FormantFrames
            Formant frames object.
        harmonics_frames_obj: PitchHarmonicsFrames
        pitch_frames_obj: PitchFrames
        lower: float, optional, default=0.8
            Lower boundary for peak amplitude search interval.
        upper: float, optional, default=1.2
            Upper boundary for peak amplitude search interval.
        rel_f0: bool, optional, default=True
            Whether the amplitude is divide by the fundamental frequency amplitude.
        """
        amp_frames = []

        for i in range(formant_frames_obj.max_formants):
            freqs = formant_frames_obj.select_formant_attr(i, 0)
            harmonic_freqs = (
                pitch_frames_obj.frames[:, None]
                * (np.arange(harmonics_frames_obj.n_harmonics) + 1)[None, :]
            )
            f0_amp = harmonics_frames_obj.frames[:, 0]
            freqs_lower = lower * freqs
            freqs_upper = upper * freqs
            freq_in_bounds = np.logical_and(
                harmonic_freqs > freqs_lower[:, None],
                harmonic_freqs < freqs_upper[:, None],
            )
            harmonics_amp = copy(harmonics_frames_obj.frames)
            harmonics_amp[~freq_in_bounds] = np.nan
            harmonic_peaks = np.nanmax(harmonics_amp, axis=1)
            harmonic_peaks_db = 20 * np.log10(harmonic_peaks)

            if rel_f0:
                harmonic_peaks_db = harmonic_peaks_db - 20 * np.log10(f0_amp)

            amp_frames.append(harmonic_peaks_db)

        return cls(
            np.array(amp_frames).T,
            formant_frames_obj.sr,
            formant_frames_obj.frame_len,
            formant_frames_obj.hop_len,
            formant_frames_obj.center,
            formant_frames_obj.pad_mode,
            lower,
            upper,
            rel_f0,
        )


[docs]class PitchPulseFrames(BaseFrames):
    """Extract and store glottal pulse frames.

    Glottal pulses are peaks in the signal corresponding to the fundamental frequency F0.

    Parameters
    ----------
    frames: list
        Pulse frames. Each frame contains a list of pulses or an empty list if no pulses are detected.
        Pulses are stored as tuples (pulse timestamp, T0, amplitude).

    Notes
    -----
    Extract glottal pulses with these steps:

    1. Interpolate the fundamental frequency at the timestamps of the framed (padded) signal.
    2. Start at the mid point `m` of each frame and create an interval [start, stop],
       where ``start=m-T0/2`` and
       ``stop=m+T0/2`` and `T0` is the fundamental period (1/F0).
    3. Detect pulses in the interval by:
        a. Find the maximum amplitude in an interval within the frame.
        b. Compute the fundamental period `T0_new` at the timestamp of the maximum `m_new`.
    4. Shift the interval recursively to the right or left until the edges of the frame are reached:
        a. When shifting to the left, set ``start_new=m_new-1.25*T0_new``
           and ``stop_new=m_new-0.8*T0_new``.
        b. When shifting to the right, set ``start_new=m_new+0.8*T0_new``
           and ``stop_new=m_new+1.25*T0_new``.
    5. Filter out duplicate pulses.

    """

    def __init__(
        self,
        frames: List[Tuple],
        sr: int,
        frame_len: int,
        hop_len: int,
        center: bool = True,
        pad_mode: str = "constant",
    ) -> None:
        self.logger = logging.getLogger("mexca.audio.extraction.PitchPulseFrames")
        super().__init__(frames, sr, frame_len, hop_len, center, pad_mode)
        self.logger.debug(ClassInitMessage())

    @property
[docs]    def idx(self) -> np.ndarray:
        if self._idx is None:
            self._idx = np.arange(len(self.frames))
        return self._idx

    @classmethod
[docs]    def from_signal_and_pitch_frames(
        cls, sig_obj: BaseSignal, pitch_frames_obj: PitchFrames
    ):
        """Extract glottal pulse frames from a signal and voice pitch frames.

        Parameters
        ----------
        sig_obj: BaseSignal
            Signal object.
        pitch_frames_obj: PitchFrames
            Voice pitch frames object.

        """
        # Access to padded signal required so we transform it here again! Could go into separate private method perhaps
        padding = [(0, 0) for _ in sig_obj.sig.shape]
        padding[-1] = (pitch_frames_obj.frame_len // 2, pitch_frames_obj.frame_len // 2)
        sig_padded = np.pad(sig_obj.sig, padding, mode=pitch_frames_obj.pad_mode)
        # Create ts for padded signal
        sig_padded_ts = librosa.samples_to_time(
            np.arange(sig_padded.shape[0]), sr=sig_obj.sr
        )

        # Frame padded signal
        sig_frames_obj = BaseFrames.from_signal(
            BaseSignal(sig_padded, sig_obj.sr),
            pitch_frames_obj.frame_len,
            pitch_frames_obj.hop_len,
            center=False,
        )

        # Frame ts of padded signal
        sig_ts_frames_obj = BaseFrames.from_signal(
            BaseSignal(sig_padded_ts, sig_obj.sr),
            pitch_frames_obj.frame_len,
            pitch_frames_obj.hop_len,
            center=False,
        )

        # Interpolate pitch F0 at padded signal ts
        interp_f0 = np.interp(
            sig_padded_ts,
            pitch_frames_obj.ts[pitch_frames_obj.flag],
            pitch_frames_obj.frames[pitch_frames_obj.flag],
        )

        # Frame F0 interpolated signal
        pitch_interp_frames_obj = BaseFrames.from_signal(
            BaseSignal(interp_f0, sig_obj.sr),
            pitch_frames_obj.frame_len,
            pitch_frames_obj.hop_len,
            center=False,
        )

        # Detect pulses in each frame; objects are passed instead of arrays bcs some attributes are needed
        pulses = [
            cls._detect_pulses_in_frame(
                i,
                sig_frames_obj,
                sig_ts_frames_obj,
                pitch_frames_obj,
                pitch_interp_frames_obj,
            )
            for i in pitch_frames_obj.idx
        ]

        return cls(
            pulses,
            pitch_frames_obj.sr,
            pitch_frames_obj.frame_len,
            pitch_frames_obj.hop_len,
            pitch_frames_obj.center,
            pitch_frames_obj.pad_mode,
        )

    @classmethod
    def _get_next_pulse(
        cls,
        sig: np.ndarray,
        ts: np.ndarray,
        t0_interp: np.ndarray,
        start: float,
        stop: float,
        left: bool = True,
        pulses: Optional[List] = None,
    ):
        # Init pulses as list if first iter of recurrence and default
        if pulses is None:
            pulses = []

        # If interval [start, stop] reaches end of frame, exit recurrence
        if (
            (left and start <= ts.min())
            or (not left and stop >= ts.max())
            or any(np.isnan((start, stop)))
        ):
            return pulses

        # Get closest ts to boundaries start, stop
        start_idx = np.argmin(np.abs(ts - start))
        stop_idx = np.argmin(np.abs(ts - stop))
        interval = sig[start_idx:stop_idx]

        # Find max peak in interval [start, stop]
        peak_idx = np.nanargmax(interval)

        # Set new mid point to idx of max peak
        new_ts_mid = ts[start_idx:stop_idx][peak_idx]

        # Add pulse to output
        new_t0_interp_mid = t0_interp[start_idx:stop_idx][peak_idx]
        pulses.append((new_ts_mid, new_t0_interp_mid, interval[peak_idx]))

        # self.logger.debug('%s - %s - %s', start, stop, pulses)

        if left:  # Move interval to left
            start = new_ts_mid - 1.25 * new_t0_interp_mid
            stop = new_ts_mid - 0.8 * new_t0_interp_mid
        else:  # Move interval to right
            stop = new_ts_mid + 1.25 * new_t0_interp_mid
            start = new_ts_mid + 0.8 * new_t0_interp_mid

        # Find next pulse in new interval
        return cls._get_next_pulse(sig, ts, t0_interp, start, stop, left, pulses)

    @classmethod
    def _detect_pulses_in_frame(
        cls,
        frame_idx: int,
        sig_frames_obj: BaseFrames,
        sig_ts_frames_obj: BaseFrames,
        pitch_obj: PitchFrames,
        pitch_interp_obj: BaseFrames,
    ) -> List[Tuple]:
        # Get period of frame
        t0_mid = 1 / pitch_obj.frames[frame_idx]
        # Get ts of frame
        ts_mid = pitch_obj.ts[frame_idx]
        # Get frame signal
        sig_frame = sig_frames_obj.frames[frame_idx, :]
        # Get ts of frame signal
        ts_sig_frame = sig_ts_frames_obj.frames[frame_idx, :]
        # Get interpolated period of frame
        t0 = 1 / pitch_interp_obj.frames[frame_idx, :]

        pulses = []

        # Return empty list if frame is unvoiced (no F0)
        if np.all(np.isnan(t0)) or np.isnan(t0_mid):
            return pulses

        # Set start interval
        start = ts_mid - t0_mid / 2
        stop = ts_mid + t0_mid / 2

        # Get pulses to the left
        cls._get_next_pulse(sig_frame, ts_sig_frame, t0, start, stop, True, pulses)

        # Get pulses to the right
        cls._get_next_pulse(sig_frame, ts_sig_frame, t0, start, stop, False, pulses)

        return list(sorted(set(pulses)))


[docs]class PitchPeriodFrames(BaseFrames):
    def __init__(
        self,
        frames: np.ndarray,
        sr: int,
        frame_len: int,
        hop_len: int,
        center: bool,
        pad_mode: str,
        lower: float,
        upper: float,
    ):
        self.logger = logging.getLogger("mexca.audio.extraction.PitchPeriodFrames")
        self.lower = lower
        self.upper = upper
        super().__init__(frames, sr, frame_len, hop_len, center, pad_mode)
        self.logger.debug(ClassInitMessage())

    @staticmethod
    def _calc_period_length(
        pulses: List[Tuple], lower: float, upper: float
    ) -> Tuple[List, np.ndarray]:
        # Calc period length as first order diff of pulse ts
        periods = np.diff(np.array([puls[0] for puls in pulses]))

        # Filter out too short and long periods
        mask = np.logical_and(periods > lower, periods < upper)

        # Split periods according to mask and remove masked periods
        periods = np.array_split(periods[mask], np.where(~mask)[0])

        return periods, mask


[docs]class JitterFrames(PitchPeriodFrames):
    """Extract and store voice jitter frames.

    Parameters
    ----------
    frames: numpy.ndarray
        Voice jitter frames of shape (num_frames,).
    rel: bool
        Whether the voice jitter is relative to the average period length.
    lower: float
        Lower limit for periods between glottal pulses.
    upper: float
        Upper limit for periods between glottal pulses.
    max_period_ratio: float
        Maximum ratio between consecutive periods used for jitter extraction.

    Notes
    -----
    Compute jitter as the average absolute difference between consecutive fundamental periods with a ratio
    below `max_period_ratio` for each frame. If ``rel=True``, jitter is divided by the average fundamental period
    of each frame. Fundamental periods are calculated as the first-order temporal difference between consecutive
    glottal pulses.

    """

    def __init__(
        self,
        frames: np.ndarray,
        sr: int,
        frame_len: int,
        hop_len: int,
        center: bool,
        pad_mode: str,
        rel: bool,
        lower: float,
        upper: float,
        max_period_ratio: float,
    ):
        self.logger = logging.getLogger("mexca.audio.extraction.JitterFrames")
        self.rel = rel
        self.max_period_ratio = max_period_ratio
        super().__init__(frames, sr, frame_len, hop_len, center, pad_mode, lower, upper)
        self.logger.debug(ClassInitMessage())

    @classmethod
[docs]    def from_pitch_pulse_frames(
        cls,
        pitch_pulse_frames_obj: PitchPulseFrames,
        rel: bool = True,
        lower: float = 0.0001,
        upper: float = 0.02,
        max_period_ratio: float = 1.3,
    ):
        """Extract voice jitter frames from glottal pulse frames.

        Parameters
        ----------
        pitch_pulse_frames_obj: PitchPulseFrames
            Glottal pulse frames object.
        rel: bool, optional, default=True
            Divide jitter by the average pitch period.
        lower: float, optional, default=0.0001
            Lower limit for periods between glottal pulses.
        upper: float, optional, default=0.02
            Upper limit for periods between glottal pulses.
        max_period_ratio: float, optional, default=1.3
            Maximum ratio between consecutive periods for jitter extraction.
        """
        jitter_frames = np.array(
            [
                cls._calc_jitter_frame(pulses, rel, lower, upper, max_period_ratio)
                for pulses in pitch_pulse_frames_obj.frames
            ]
        )

        return cls(
            jitter_frames,
            pitch_pulse_frames_obj.sr,
            pitch_pulse_frames_obj.frame_len,
            pitch_pulse_frames_obj.hop_len,
            pitch_pulse_frames_obj.center,
            pitch_pulse_frames_obj.pad_mode,
            rel,
            lower,
            upper,
            max_period_ratio,
        )

    @classmethod
    def _calc_jitter_frame(
        cls,
        pulses: List[Tuple],
        rel: bool,
        lower: float,
        upper: float,
        max_period_ratio: float,
    ):
        if len(pulses) > 0:
            # Calc period length as first order diff of pulse ts
            periods, _ = cls._calc_period_length(pulses, lower, upper)

            # Calc avg of first order diff in period length
            # only consider period pairs where ratio is < max_period_ratio
            avg_period_diff = np.nanmean(
                np.array(
                    [
                        np.mean(
                            np.abs(
                                np.diff(period)[
                                    (period[:-1] / period[1:]) < max_period_ratio
                                ]
                            )
                        )
                        for period in periods
                        if len(period) > 0
                    ]
                )
            )

            if rel:  # Relative to mean period length
                avg_period_len = np.nanmean(
                    np.array([np.mean(period) for period in periods if len(period) > 0])
                )
                return avg_period_diff / avg_period_len
            return avg_period_diff
        return np.nan


[docs]class ShimmerFrames(PitchPeriodFrames):
    """Extract and store voice shimmer frames.

    Parameters
    ----------
    frames: numpy.ndarray
        Voice shimmer frames of shape (num_frames,).
    rel: bool
        Whether the voice shimmer is relative to the average period length.
    lower: float
        Lower limit for periods between glottal pulses.
    upper: float
        Upper limit for periods between glottal pulses.
    max_period_ratio: float
        Maximum ratio between consecutive periods used for shimmer extraction.
    max_amp_factor: float
        Maximum ratio between consecutive amplitudes used for shimmer extraction.

    Notes
    -----
    Compute shimmer as the average absolute difference between consecutive pitch amplitudes with a
    fundamental period ratio below `max_period_ratio` and amplitude ratio below `max_amp_factor`
    for each frame. If ``rel=True``, shimmer is divided by the average amplitude
    of each frame. Fundamental periods are calculated as the first-order temporal difference
    between consecutive glottal pulses. Amplitudes are signal amplitudes at the glottal pulses.
    """

    def __init__(
        self,
        frames: List[Tuple],
        sr: int,
        frame_len: int,
        hop_len: int,
        center: bool,
        pad_mode: str,
        rel: bool,
        lower: float,
        upper: float,
        max_period_ratio: float,
        max_amp_factor: float,
    ):
        self.logger = logging.getLogger("mexca.audio.extraction.ShimmerFrames")
        self.rel = rel
        self.max_period_ratio = max_period_ratio
        self.max_amp_factor = max_amp_factor
        super().__init__(frames, sr, frame_len, hop_len, center, pad_mode, lower, upper)
        self.logger.debug(ClassInitMessage())

    @classmethod
[docs]    def from_pitch_pulse_frames(
        cls,
        pitch_pulse_frames_obj: PitchPulseFrames,
        rel: bool = True,
        lower: float = 0.0001,
        upper: float = 0.02,
        max_period_ratio: float = 1.3,
        max_amp_factor: float = 1.6,
    ):
        """Extract voice shimmer frames from glottal pulse frames.

        Parameters
        ----------
        pitch_pulse_frames_obj: PitchPulseFrames
            Glottal pulse frames object.
        rel: bool, optional, default=True
            Divide shimmer by the average pitch period.
        lower: float, optional, default=0.0001
            Lower limit for periods between glottal pulses.
        upper: float, optional, default=0.02
            Upper limit for periods between glottal pulses.
        max_period_ratio: float, optional, default=1.3
            Maximum ratio between consecutive periods for shimmer extraction.
        max_amp_factor: float, optional, default=1.6
            Maximum ratio between consecutive amplitudes used for shimmer extraction.
        """
        shimmer_frames = np.array(
            [
                cls._calc_shimmer_frame(
                    pulses, rel, lower, upper, max_period_ratio, max_amp_factor
                )
                for pulses in pitch_pulse_frames_obj.frames
            ]
        )

        return cls(
            shimmer_frames,
            pitch_pulse_frames_obj.sr,
            pitch_pulse_frames_obj.frame_len,
            pitch_pulse_frames_obj.hop_len,
            pitch_pulse_frames_obj.center,
            pitch_pulse_frames_obj.pad_mode,
            rel,
            lower,
            upper,
            max_period_ratio,
            max_amp_factor,
        )

    @classmethod
    def _calc_shimmer_frame(
        cls,
        pulses: List[Tuple],
        rel: bool,
        lower: float,
        upper: float,
        max_period_ratio: float,
        max_amp_factor: float,
    ) -> float:
        if len(pulses) > 0:
            # Calc period length as first order diff of pulse ts
            periods, mask = cls._calc_period_length(pulses, lower, upper)
            amps = cls._get_amplitude(pulses, mask)

            # Calc avg of first order diff in amplitude
            # only consider period pairs where period ratio is < max_period_ratio and
            # where amplitude ratio is < max_amp_factor
            avg_amp_diff = np.nanmean(
                np.array(
                    [
                        np.mean(
                            np.abs(
                                np.diff(amp)[
                                    np.logical_and(
                                        (period[:-1] / period[1:]) < max_period_ratio,
                                        (amp[:-1] / amp[1:]) < max_amp_factor,
                                    )
                                ]
                            )
                        )
                        for amp, period in zip(amps, periods)
                        if len(period) > 0
                    ]
                )
            )

            if rel:  # Relative to mean amplitude
                avg_amp = np.nanmean(
                    np.array([np.mean(amp) for amp in amps if len(amp) > 0])
                )
                return avg_amp_diff / avg_amp
            return avg_amp_diff
        return np.nan

    @staticmethod
    def _get_amplitude(pulses: List[Tuple], mask: np.ndarray) -> List:
        # Get amplitudes
        amps = np.array([puls[2] for puls in pulses])[
            1:
        ]  # Skip first amplitude to align with periods

        # Split periods according to mask and remove masked periods
        amps = np.array_split(amps[mask], np.where(~mask)[0])

        return amps


[docs]class HnrFrames(BaseFrames):
    """Estimate and store harmonics-to-noise ratios (HNRs).

    Parameters
    ----------
    frames: numpy.ndarray
        HNR frames in dB with shape (num_frames,).
    lower: float
        Lower fundamental frequency limit for choosing pitch candidates.
    rel_silence_threshold: float
        Relative threshold for treating signal frames as silent.

    Notes
    -----
    Estimate the HNR for each signal frame with ``np.max(np.abs(frames), axis=1) > rel_silence_threshold*np.max(np.abs(frames))`` by:

    1. Compute the autocorrelation function (ACF) using the short-term Fourier transform (STFT).
    2. Find the lags of peaks in the ACF excluding the zero-th lag.
    3. Filter out peaks that correspond to pitch candidates below `lower` and above the Nyquist frequency.
    4. Compute the harmonic component `R0` as the highest of the remaining peaks divided by the ACF at lag zero.
    5. Compute the HNR as `R0/(1-R0)` and convert to dB.

    """

    def __init__(
        self,
        frames: np.ndarray,
        sr: int,
        frame_len: int,
        hop_len: int,
        center: bool,
        pad_mode: str,
        lower: float,
        rel_silence_threshold,
    ):
        self.logger = logging.getLogger("mexca.audio.extraction.HnrFrames")
        self.lower = lower
        self.rel_silence_threshold = rel_silence_threshold
        super().__init__(frames, sr, frame_len, hop_len, center, pad_mode)
        self.logger.debug(ClassInitMessage())

    @classmethod
[docs]    def from_frames(
        cls,
        sig_frames_obj: BaseFrames,
        lower: float = 75.0,
        rel_silence_threshold: float = 0.1,
    ):
        """Estimate the HNR from signal frames.

        Parameters
        ----------
        sig_frames_obj: BaseFrames
            Signal frames object.
        lower: float, default = 75.0
            Lower fundamental frequency limit for choosing pitch candidates.
        rel_silence_threshold: float, default = 0.1
            Relative threshold for treating signal frames as silent.

        """
        auto_cor = librosa.autocorrelate(sig_frames_obj.frames)
        harmonic_strength = np.apply_along_axis(
            cls._find_max_peak, 1, auto_cor[:, 1:], sr=sig_frames_obj.sr, lower=lower
        )
        harmonic_comp = harmonic_strength / auto_cor[:, 0]
        hnr = harmonic_comp / (1 - harmonic_comp)
        silence_mask = np.max(
            np.abs(sig_frames_obj.frames), axis=1
        ) > rel_silence_threshold * np.max(np.abs(sig_frames_obj.frames))
        hnr[~silence_mask] = np.nan
        hnr_db = 10 * np.log10(hnr)
        return cls(
            hnr_db,
            sig_frames_obj.sr,
            sig_frames_obj.frame_len,
            sig_frames_obj.hop_len,
            sig_frames_obj.center,
            sig_frames_obj.pad_mode,
            lower,
            rel_silence_threshold,
        )

    @staticmethod
    def _find_max_peak(auto_cor: np.ndarray, sr: int, lower: float) -> float:
        if np.all(np.isnan(auto_cor)):
            return np.nan
        
        auto_cor_peak_lags = find_peaks(auto_cor)[0]
        auto_cor_peaks = auto_cor[auto_cor_peak_lags]
        auto_cor_peak_periods = 1 / auto_cor_peak_lags * sr
        auto_cor_peaks_voiced = auto_cor_peaks[
            np.logical_and(
                auto_cor_peak_periods > lower, auto_cor_peak_periods < sr / 2
            )
        ]

        if len(auto_cor_peaks_voiced) == 0:
            return np.nan
        
        auto_cor_max_peak_lag = np.argmax(
            auto_cor_peaks_voiced
        )
        return auto_cor[auto_cor_max_peak_lag]