Source code for mexca.audio.identification

"""Speech segment and speaker identification.
"""

import argparse
import logging
import os
from typing import Optional, Union
from pyannote.audio import Pipeline
from mexca.data import SpeakerAnnotation
from mexca.utils import ClassInitMessage, bool_or_str, optional_int


class AuthenticationError(Exception):
    """Signal that authenticating to the HuggingFace Hub did not succeed.

    Parameters
    ----------
    msg : str
        Error message.

    """

    def __init__(self, msg: str):
        # Hand the message to the base exception so it ends up in ``args``.
        super(AuthenticationError, self).__init__(msg)
class SpeakerIdentifier:
    """Identify speech segments and cluster speakers using speaker diarization.

    Wrapper class for ``pyannote.audio.SpeakerDiarization``.

    Parameters
    ----------
    num_speakers : int, optional
        Number of speakers to which speech segments will be assigned during the
        clustering (oracle speakers). If `None`, the number of speakers is
        estimated from the audio signal.
    use_auth_token : bool or str, default=True
        Whether to use the HuggingFace authentication token stored on the machine
        (if bool) or a HuggingFace authentication token with access to the models
        ``pyannote/speaker-diarization`` and ``pyannote/segmentation`` (if str).

    Notes
    -----
    This class requires pretrained models for speaker diarization and segmentation
    from HuggingFace. To download the models accept the user conditions on
    `<hf.co/pyannote/speaker-diarization>`_ and `<hf.co/pyannote/segmentation>`_.
    Then generate an authentication token on `<hf.co/settings/tokens>`_.

    """

    def __init__(
        self,
        num_speakers: Optional[int] = None,
        use_auth_token: Union[bool, str] = True,
    ):
        self.logger = logging.getLogger("mexca.audio.identification.SpeakerIdentifier")
        self.num_speakers = num_speakers
        self.use_auth_token = use_auth_token
        # Lazy initialization: the pretrained pipeline is only loaded on first
        # access of the `pipeline` property.
        self._pipeline = None
        self.logger.debug(ClassInitMessage())

    # Initialize pretrained models only when needed
    @property
    def pipeline(self) -> Pipeline:
        """The pretrained speaker diarization pipeline.

        Loaded from ``pyannote/speaker-diarization`` on first access and cached
        until it is deleted. See `pyannote.audio.SpeakerDiarization
        <https://github.com/pyannote/pyannote-audio/blob/develop/pyannote/audio/pipelines/speaker_diarization.py#L56>`_
        for details.

        Raises
        ------
        EnvironmentError
            Re-raised (after logging) if loading the pretrained pipeline raises it.
        AuthenticationError
            If loading the pretrained pipeline yields `None`, which this class
            treats as a failed authentication.

        """
        # Explicit `is None` check: only a missing pipeline triggers loading,
        # independent of how the pipeline object evaluates as a boolean.
        if self._pipeline is None:
            try:
                self._pipeline = Pipeline.from_pretrained(
                    "pyannote/speaker-diarization",
                    use_auth_token=self.use_auth_token,
                )
            except EnvironmentError as exc:
                self.logger.exception("EnvironmentError: %s", exc)
                raise

            if self._pipeline is None:
                auth_error = AuthenticationError(
                    'Could not download pretrained "pyannote/speaker-diarization" pipeline; please provide a valid authentication token'
                )
                # No exception is being handled here, so log at error level
                # directly instead of raising and immediately catching.
                self.logger.error("Error: %s", auth_error)
                raise auth_error

            self.logger.debug("Initialized speaker diarization pipeline")

        return self._pipeline

    # Delete pretrained models when not needed anymore
    @pipeline.deleter
    def pipeline(self):
        self._pipeline = None
        self.logger.debug("Removed speaker diarization pipeline")

    def apply(self, filepath: str) -> SpeakerAnnotation:
        """Identify speech segments and speakers.

        The pretrained pipeline is released after the call, whether or not the
        diarization succeeded.

        Parameters
        ----------
        filepath : str
            Path to the audio file.

        Returns
        -------
        SpeakerAnnotation
            A data class object that contains detected speech segments and speakers.

        """
        try:
            annotation = self.pipeline(filepath, num_speakers=self.num_speakers)
        finally:
            # Release the pretrained models even when diarization fails; skip
            # the deleter if loading failed and there is nothing to release.
            if self._pipeline is not None:
                del self.pipeline

        self.logger.info("Detected %s speakers", len(annotation.labels()))
        self.logger.debug("Detected speaker chart: %s", annotation.chart())

        return SpeakerAnnotation.from_pyannote(
            annotation.rename_labels(generator="int")
        )
def cli():
    """Command line interface for identifying speech segments and speakers.

    Parses the command line arguments, runs a ``SpeakerIdentifier`` on the given
    audio file, and writes the result with ``write_rttm`` to
    ``<outdir>/<audio file stem>_audio_annotation.rttm``.
    See `identify-speakers -h` for details.

    """
    arg_parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    arg_parser.add_argument("-f", "--filepath", type=str, required=True)
    arg_parser.add_argument("-o", "--outdir", type=str, required=True)
    arg_parser.add_argument(
        "--num-speakers", type=optional_int, default=None, dest="num_speakers"
    )
    arg_parser.add_argument(
        "--use-auth-token", type=bool_or_str, default=True, dest="use_auth_token"
    )

    options = vars(arg_parser.parse_args())
    audio_path = options["filepath"]

    speaker_identifier = SpeakerIdentifier(
        num_speakers=options["num_speakers"],
        use_auth_token=options["use_auth_token"],
    )
    result = speaker_identifier.apply(audio_path)

    # Name the output after the input file, without its directory and extension.
    stem, _extension = os.path.splitext(os.path.basename(audio_path))
    rttm_path = os.path.join(options["outdir"], stem + "_audio_annotation.rttm")
    result.write_rttm(rttm_path)


if __name__ == "__main__":
    cli()