sintonia/analyzer/libretime_analyzer/pipeline/analyze_metadata.py

import datetime
import hashlib
import os
import wave
from typing import Any, Dict

import magic
import mutagen
from loguru import logger


def analyze_metadata(filename: str, metadata: Dict[str, Any]) -> Dict[str, Any]:
    """Extract audio metadata from tags embedded in the file (eg. ID3 tags)

    The file size, MD5 checksum and MIME type are always recorded. WAVE files
    are then handed over to `_analyze_wave`; every other file is parsed with
    mutagen to collect stream information (length, sample rate, bit rate,
    channels) and the tags listed in the mutagen-to-Airtime mapping below.

    Keyword arguments:
        filename: The path to the audio file to extract metadata from.
        metadata: A dictionary that the extracted metadata will be added to.

    Returns:
        The same `metadata` dictionary, updated in place.

    Raises:
        TypeError: If `filename` is not a string or `metadata` is not a dict.
        FileNotFoundError: If `filename` does not exist.
    """
    if not isinstance(filename, str):
        raise TypeError(
            "filename must be string. Was of type " + type(filename).__name__
        )
    if not isinstance(metadata, dict):
        raise TypeError(
            "metadata must be a dict. Was of type " + type(metadata).__name__
        )
    if not os.path.exists(filename):
        raise FileNotFoundError(f"audio file not found: {filename}")

    # Airtime <= 2.5.x nonsense:
    metadata["ftype"] = "audioclip"
    # Other fields we'll want to set for Airtime:
    metadata["hidden"] = False

    # Get file size and md5 hash of the file
    metadata["filesize"] = os.path.getsize(filename)

    # Hash the file in small chunks so large files are never loaded in memory.
    md5 = hashlib.md5()
    with open(filename, "rb") as fh:
        for chunk in iter(lambda: fh.read(8192), b""):
            md5.update(chunk)
    metadata["md5"] = md5.hexdigest()

    # Mutagen doesn't handle WAVE files so we use a different package
    ms = magic.open(magic.MIME_TYPE)
    try:
        ms.load()
        with open(filename, "rb") as fh:
            # NOTE(review): 2014 looks like a typo for 2048; kept as-is because
            # it only bounds how many leading bytes libmagic inspects.
            mime_check = ms.buffer(fh.read(2014))
    finally:
        # Release the libmagic handle even if loading or sniffing fails.
        ms.close()
    metadata["mime"] = mime_check
    if mime_check == "audio/x-wav":
        return _analyze_wave(filename, metadata)

    # Extract metadata from an audio file using mutagen
    audio_file = mutagen.File(filename, easy=True)

    # Bail if the file couldn't be parsed. The title should stay as the filename
    # inside Airtime.
    # Don't use "if not" here. It is wrong due to mutagen's design: audio_file
    # can equal {} if the file is valid but there's no metadata tags, and we can
    # still try to grab the info variables below.
    if audio_file is None:
        return metadata

    # Grab other file information that isn't encoded in a tag, but instead usually
    # in the file header. Mutagen breaks that out into a separate "info" object:
    info = audio_file.info
    if hasattr(info, "sample_rate"):  # Mutagen is annoying and inconsistent
        metadata["sample_rate"] = info.sample_rate
    if hasattr(info, "length"):
        metadata["length_seconds"] = info.length
        # Converting the length in seconds (float) to a formatted time string
        metadata["length"] = str(datetime.timedelta(seconds=info.length))
        # Other fields for Airtime
        metadata["cueout"] = metadata["length"]

    # Set a default cue in time in seconds
    metadata["cuein"] = 0.0

    if hasattr(info, "bitrate"):
        metadata["bit_rate"] = info.bitrate

    # Use the mutagen to get the MIME type, if it has one. This is more reliable and
    # consistent for certain types of MP3s or MPEG files than the MIMEs returned by magic.
    if audio_file.mime:
        metadata["mime"] = audio_file.mime[0]

    # Try to get the number of channels if mutagen can...
    try:
        # Special handling for getting the # of channels from MP3s. It's in the "mode" field
        # which is 0=Stereo, 1=Joint Stereo, 2=Dual Channel, 3=Mono. Part of the ID3 spec...
        if metadata["mime"] in ["audio/mpeg", "audio/mp3"]:
            metadata["channels"] = 1 if info.mode == 3 else 2
        else:
            metadata["channels"] = info.channels
    except (AttributeError, KeyError):
        # If mutagen can't figure out the number of channels, we'll just leave it out...
        pass

    # Try to extract the track number and the number of tracks on the album
    # (the "track total"), which are usually stored together as "3/12" or "3-12".
    try:
        track_number = audio_file["tracknumber"]
        if isinstance(track_number, list):  # Sometimes tracknumber is a list, ugh
            track_number = track_number[0]

        # Only report a track total when a separator is actually present.
        # Indexing into an unsplit string would wrongly turn "12" into a
        # track total of "2".
        track_total = None
        if "/" in track_number:
            track_number, track_total = track_number.split("/")[:2]
        elif "-" in track_number:
            track_number, track_total = track_number.split("-")[:2]

        metadata["track_number"] = track_number
        if track_total is not None:
            metadata["track_total"] = track_total
    except (AttributeError, KeyError, IndexError):
        # If we couldn't figure out the track_number or track_total, just ignore it...
        pass

    # We normalize the mutagen tags slightly here, so that if mutagen ever
    # changes its tag names, this mapping is the only place to update.
    mutagen_to_airtime_mapping = {
        "title": "track_title",
        "artist": "artist_name",
        "album": "album_title",
        "bpm": "bpm",
        "composer": "composer",
        "conductor": "conductor",
        "copyright": "copyright",
        "comment": "comment",
        "encoded_by": "encoder",
        "genre": "genre",
        "isrc": "isrc",
        "label": "label",
        "organization": "label",
        #'length':       'length',
        "language": "language",
        "last_modified": "last_modified",
        "mood": "mood",
        "bit_rate": "bit_rate",
        "replay_gain": "replaygain",
        #'tracknumber':  'track_number',
        #'track_total':  'track_total',
        "website": "website",
        "date": "year",
        #'mime_type':    'mime',
    }

    for mutagen_tag, airtime_tag in mutagen_to_airtime_mapping.items():
        try:
            value = audio_file[mutagen_tag]
        except KeyError:
            continue

        # Some tags are returned as lists because there could be multiple values.
        # This is unusual so we're going to always just take the first item in the
        # list, and fall back to an empty string for empty lists.
        if isinstance(value, list):
            value = value[0] if value else ""
        metadata[airtime_tag] = value

    return metadata


def _analyze_wave(filename, metadata):
    """Extract stream information from a WAVE file using the `wave` module.

    Sets the "channels", "sample_rate", "length", "length_seconds" and
    "cueout" keys on `metadata`.

    Keyword arguments:
        filename: The path to the WAVE file to inspect.
        metadata: A dictionary that the extracted metadata will be added to.

    Returns:
        The same `metadata` dictionary, updated in place.

    Raises:
        wave.Error: If the file is not a valid WAVE file. The error is logged
            before being re-raised.
    """
    try:
        # The context manager closes the reader (and the file it opened) on
        # both the success and the error path.
        with wave.open(filename, "rb") as reader:
            metadata["channels"] = reader.getnchannels()
            metadata["sample_rate"] = reader.getframerate()
            length_seconds = float(reader.getnframes()) / float(
                metadata["sample_rate"]
            )
        # Converting the length in seconds (float) to a formatted time string
        metadata["length"] = str(datetime.timedelta(seconds=length_seconds))
        metadata["length_seconds"] = length_seconds
        metadata["cueout"] = metadata["length"]
    except wave.Error as ex:
        logger.error(f"Invalid WAVE file: {str(ex)}")
        raise
    return metadata