Source code for mexca.pipeline

"""Build a pipeline to extract emotion expression features from a video file.
"""

import logging
import logging.config
import os
from typing import Optional, Tuple, Union
from moviepy.editor import VideoFileClip
from mexca.data import Multimodal
from mexca.utils import ClassInitMessage


[docs]class Pipeline:
    """Build a pipeline to extract emotion expression features from a video file.

    Takes either component objects or container component objects (or a mix of both) as input.

    Parameters
    ----------
    face_extractor : FaceExtractor or FaceExtractorContainer, optional, default=None
        Component for detecting and identifying faces as well as extracting facial features.
    speaker_identifier : SpeakerIdentifier or SpeakerIdentifierContainer, optional, default=None
        Component for identifying speech segments and speakers.
    voice_extractor : VoiceExtractor or VoiceExtractorContainer, optional, default=None
        Component for extracting voice features.
    audio_transcriber : AudioTranscriber or AudioTranscriberContainer, optional, default=None
        Component for transcribing speech segments to text.
    sentiment_extractor : SentimentExtractor or SentimentExtractorContainer, optional, default=None
        Component for extracting sentiment from text.

    Examples
    --------
    Create a pipeline with standard components.

    >>> from mexca import Pipeline
    >>> from mexca.audio import SpeakerIdentifier, VoiceExtractor
    >>> from mexca.text import AudioTranscriber, SentimentExtractor
    >>> from mexca.video import FaceExtractor
    >>> num_faces = 2
    >>> num_speaker = 2
    >>> pipeline = Pipeline(
    ...     face_extractor=FaceExtractor(num_faces=num_faces),
    ...     speaker_identifier=SpeakerIdentifier(
    ...         num_speakers=num_speakers
    ...     ),
    ...     voice_extractor=VoiceExtractor(),
    ...     audio_transcriber=AudioTranscriber(),
    ...     sentiment_extractor=SentimentExtractor()
    ... )

    Create a pipeline with container components.

    >>> from mexca import Pipeline
    >>> from mexca.container import AudioTranscriberContainer, FaceExtractorContainer,
    >>>     SentimentExtractorContainer, SpeakerIdentifierContainer, VoiceExtractorContainer
    >>> num_faces = 2
    >>> num_speaker = 2
    >>> pipeline = Pipeline(
    ...     face_extractor=FaceExtractorContainer(num_faces=num_faces),
    ...     speaker_identifier=SpeakerIdentifierContainer(
    ...         num_speakers=num_speakers
    ...     ),
    ...     voice_extractor=VoiceExtractorContainer(),
    ...     audio_transcriber=AudioTranscriberContainer(),
    ...     sentiment_extractor=SentimentExtractorContainer()
    ... )

    Create a pipeline with standard *and* container components.

    >>> from mexca import Pipeline
    >>> from mexca.audio import SpeakerIdentifier, VoiceExtractor
    >>> from mexca.container import AudioTranscriberContainer, FaceExtractorContainer,
    >>>     SentimentExtractorContainer
    >>> num_faces = 2
    >>> num_speaker = 2
    >>> pipeline = Pipeline(
    ...     face_extractor=FaceExtractorContainer(num_faces=num_faces),
    ...     speaker_identifier=SpeakerIdentifier( # standard
    ...         num_speakers=num_speakers
    ...     ),
    ...     voice_extractor=VoiceExtractor(), # standard
    ...     audio_transcriber=AudioTranscriberContainer(),
    ...     sentiment_extractor=SentimentExtractorContainer()
    ... )

    """
    def __init__(self,
        face_extractor: Optional[Union['FaceExtractor', 'FaceExtractorContainer']] = None,
        speaker_identifier: Optional[Union['SpeakerIdentifier', 'SpeakerIdentifierContainer']] = None,
        voice_extractor: Optional[Union['VoiceExtractor', 'VoiceExtractorContainer']] = None,
        audio_transcriber: Optional[Union['AudioTranscriber', 'AudioTranscriberContainer']] = None,
        sentiment_extractor: Optional[Union['SentimentExtractor', 'SentimentExtractorContainer']] = None
    ):
        self.logger = logging.getLogger('mexca.pipeline.Pipeline')
        self.face_extractor = face_extractor
        self.speaker_identifier = speaker_identifier
        self.voice_extractor = voice_extractor
        self.audio_transcriber = audio_transcriber
        self.sentiment_extractor = sentiment_extractor
        self.logger.debug(ClassInitMessage())


[docs]    def apply(self, # pylint: disable=too-many-locals
        filepath: str,
        frame_batch_size: int = 1,
        skip_frames: int = 1,
        process_subclip: Tuple[Optional[float]] = (0, None),
        language: Optional[str] = None,
        keep_audiofile: bool = False,
        show_progress: bool = True
    ) -> 'Multimodal':
        """
        Extract emotion expression features from a video file.

        This is the main function to apply the complete mexca pipeline to a video file.

        Parameters
        ----------
        filepath: str
            Path to the video file.
        frame_batch_size: int, default=1
            Size of the batch of video frames that are loaded and processed at the same time.
        skip_frames: int, default=1
            Only process every nth frame, starting at 0.
        process_subclip: tuple, default=(0, None)
            Process only a part of the video clip. Must be the start and end of the subclip in seconds.
            `None` indicates the end of the video.
        language: str, optional, default=None
            The language of the speech that is transcribed.
            If `None`, the language is detected for each speech segment.
        keep_audiofile: bool, default=False
            Keeps the audio file after processing. If False, the audio file is only stored temporarily.
        show_progress: bool, default=True
            Enables progress bars and printing info logging messages to the console.
            The logging is overriden when a custom logger is explicitly created.

        Returns
        -------
        Multimodal
            A data class object that contains the extracted merged features in the `features` attribute.
            See the `Output <https://mexca.readthedocs.io/en/latest/output.html>`_ section for details.

        See Also
        --------
        mexca.data.Multimodal

        Examples
        --------
        >>> import pandas as pd
        >>> filepath = 'path/to/video'
        >>> output = pipeline.apply(filepath)
        >>> assert isinstance(output.features, pd.DataFrame)
        True

        """
        if show_progress:
            logging.getLogger(__name__).setLevel(logging.INFO)
            

        self.logger.info('Starting MEXCA pipeline')
        output = Multimodal(filename=filepath)

        with VideoFileClip(filepath) as clip:
            audio_path = os.path.splitext(filepath)[0] + '.wav'
            subclip = clip.subclip(
                process_subclip[0],
                process_subclip[1]
            )
            self.logger.debug('Reading video file from %s to %s', subclip.start, subclip.end)
            output.duration = subclip.duration
            output.fps = subclip.fps
            output.fps_adjusted = subclip.fps / skip_frames
            time_step = 1 / (subclip.fps / skip_frames)

            if self.speaker_identifier or self.voice_extractor:
                self.logger.debug('Writing audio file')
                subclip.audio.write_audiofile(audio_path, logger=None, fps=16000, ffmpeg_params=["-ac", "1"])
                self.logger.info('Wrote audio file to %s', audio_path)

        if self.face_extractor:
            self.logger.info('Processing video frames')
            video_annotation = self.face_extractor.apply(
                filepath,
                batch_size=frame_batch_size,
                skip_frames=skip_frames,
                process_subclip=process_subclip,
                show_progress=show_progress
            )
            output.video_annotation = video_annotation

        if self.speaker_identifier:
            self.logger.info('Identifying speakers')
            audio_annotation = self.speaker_identifier.apply(audio_path)

            output.audio_annotation = audio_annotation

            if self.audio_transcriber:
                self.logger.info('Transcribing speech segments to text')
                transcription = self.audio_transcriber.apply(
                    audio_path,
                    audio_annotation,
                    language=language,
                    show_progress=show_progress
                )

                output.transcription = transcription

                if self.sentiment_extractor:
                    self.logger.info('Extracting sentiment from transcribed text')
                    sentiment = self.sentiment_extractor.apply(
                        transcription=transcription,
                        show_progress=show_progress
                    )

                    output.sentiment = sentiment

        if self.voice_extractor:
            self.logger.info('Extracting voice features')
            voice_features = self.voice_extractor.apply(
                audio_path,
                time_step=time_step,
                skip_frames=skip_frames
            )

            output.voice_features = voice_features

        output.merge_features()

        if not keep_audiofile and os.path.exists(audio_path):
            self.logger.info('Removing audio file at %s', audio_path)
            os.remove(audio_path)

        self.logger.info('MEXCA pipeline finished')
        return output