sintonia/analyzer/airtime_analyzer/metadata_analyzer.py

196 lines
7.7 KiB
Python

import datetime
import hashlib
import logging
import os
import time
import wave
import magic
import mutagen
from .analyzer import Analyzer
class MetadataAnalyzer(Analyzer):
@staticmethod
def analyze(filename, metadata):
"""Extract audio metadata from tags embedded in the file (eg. ID3 tags)
Keyword arguments:
filename: The path to the audio file to extract metadata from.
metadata: A dictionary that the extracted metadata will be added to.
"""
if not isinstance(filename, str):
raise TypeError(
"filename must be string. Was of type " + type(filename).__name__
)
if not isinstance(metadata, dict):
raise TypeError(
"metadata must be a dict. Was of type " + type(metadata).__name__
)
if not os.path.exists(filename):
raise FileNotFoundError("audio file not found: {}".format(filename))
# Airtime <= 2.5.x nonsense:
metadata["ftype"] = "audioclip"
# Other fields we'll want to set for Airtime:
metadata["hidden"] = False
# Get file size and md5 hash of the file
metadata["filesize"] = os.path.getsize(filename)
with open(filename, "rb") as fh:
m = hashlib.md5()
while True:
data = fh.read(8192)
if not data:
break
m.update(data)
metadata["md5"] = m.hexdigest()
# Mutagen doesn't handle WAVE files so we use a different package
ms = magic.open(magic.MIME_TYPE)
ms.load()
with open(filename, "rb") as fh:
mime_check = ms.buffer(fh.read(2014))
metadata["mime"] = mime_check
if mime_check == "audio/x-wav":
return MetadataAnalyzer._analyze_wave(filename, metadata)
# Extract metadata from an audio file using mutagen
audio_file = mutagen.File(filename, easy=True)
# Bail if the file couldn't be parsed. The title should stay as the filename
# inside Airtime.
if (
audio_file == None
): # Don't use "if not" here. It is wrong due to mutagen's design.
return metadata
# Note that audio_file can equal {} if the file is valid but there's no metadata tags.
# We can still try to grab the info variables below.
# Grab other file information that isn't encoded in a tag, but instead usually
# in the file header. Mutagen breaks that out into a separate "info" object:
info = audio_file.info
if hasattr(info, "sample_rate"): # Mutagen is annoying and inconsistent
metadata["sample_rate"] = info.sample_rate
if hasattr(info, "length"):
metadata["length_seconds"] = info.length
# Converting the length in seconds (float) to a formatted time string
track_length = datetime.timedelta(seconds=info.length)
metadata["length"] = str(
track_length
) # time.strftime("%H:%M:%S.%f", track_length)
# Other fields for Airtime
metadata["cueout"] = metadata["length"]
# Set a default cue in time in seconds
metadata["cuein"] = 0.0
if hasattr(info, "bitrate"):
metadata["bit_rate"] = info.bitrate
# Use the mutagen to get the MIME type, if it has one. This is more reliable and
# consistent for certain types of MP3s or MPEG files than the MIMEs returned by magic.
if audio_file.mime:
metadata["mime"] = audio_file.mime[0]
# Try to get the number of channels if mutagen can...
try:
# Special handling for getting the # of channels from MP3s. It's in the "mode" field
# which is 0=Stereo, 1=Joint Stereo, 2=Dual Channel, 3=Mono. Part of the ID3 spec...
if metadata["mime"] in ["audio/mpeg", "audio/mp3"]:
if info.mode == 3:
metadata["channels"] = 1
else:
metadata["channels"] = 2
else:
metadata["channels"] = info.channels
except (AttributeError, KeyError):
# If mutagen can't figure out the number of channels, we'll just leave it out...
pass
# Try to extract the number of tracks on the album if we can (the "track total")
try:
track_number = audio_file["tracknumber"]
if isinstance(track_number, list): # Sometimes tracknumber is a list, ugh
track_number = track_number[0]
track_number_tokens = track_number
if "/" in track_number:
track_number_tokens = track_number.split("/")
track_number = track_number_tokens[0]
elif "-" in track_number:
track_number_tokens = track_number.split("-")
track_number = track_number_tokens[0]
metadata["track_number"] = track_number
track_total = track_number_tokens[1]
metadata["track_total"] = track_total
except (AttributeError, KeyError, IndexError):
# If we couldn't figure out the track_number or track_total, just ignore it...
pass
# We normalize the mutagen tags slightly here, so in case mutagen changes,
# we find the
mutagen_to_airtime_mapping = {
"title": "track_title",
"artist": "artist_name",
"album": "album_title",
"bpm": "bpm",
"composer": "composer",
"conductor": "conductor",
"copyright": "copyright",
"comment": "comment",
"encoded_by": "encoder",
"genre": "genre",
"isrc": "isrc",
"label": "label",
"organization": "label",
#'length': 'length',
"language": "language",
"last_modified": "last_modified",
"mood": "mood",
"bit_rate": "bit_rate",
"replay_gain": "replaygain",
#'tracknumber': 'track_number',
#'track_total': 'track_total',
"website": "website",
"date": "year",
#'mime_type': 'mime',
}
for mutagen_tag, airtime_tag in mutagen_to_airtime_mapping.items():
try:
metadata[airtime_tag] = audio_file[mutagen_tag]
# Some tags are returned as lists because there could be multiple values.
# This is unusual so we're going to always just take the first item in the list.
if isinstance(metadata[airtime_tag], list):
if metadata[airtime_tag]:
metadata[airtime_tag] = metadata[airtime_tag][0]
else: # Handle empty lists
metadata[airtime_tag] = ""
except KeyError:
continue
return metadata
@staticmethod
def _analyze_wave(filename, metadata):
try:
reader = wave.open(filename, "rb")
metadata["channels"] = reader.getnchannels()
metadata["sample_rate"] = reader.getframerate()
length_seconds = float(reader.getnframes()) / float(metadata["sample_rate"])
# Converting the length in seconds (float) to a formatted time string
track_length = datetime.timedelta(seconds=length_seconds)
metadata["length"] = str(
track_length
) # time.strftime("%H:%M:%S.%f", track_length)
metadata["length_seconds"] = length_seconds
metadata["cueout"] = metadata["length"]
except wave.Error as ex:
logging.error("Invalid WAVE file: {}".format(str(ex)))
raise
return metadata