feat(analyzer): rework analyze_metadata step

- Upgrade mutagen to 1.45.1
- Remove custom wave data extraction
- Add .wav and ogg without extension fixtures
- Move md5 sum and mime type into their own function
- Clean up comments
- Let analyze_cuepoints handle cuein and cueout metadata
- Remove python magic mime guessing
2022-02-14 21:35:15 +01:00 · 2022-02-14 21:35:15 +01:00 · 88dcd13fc8
parent 0106b4c6cb
commit 88dcd13fc8
5 changed files with 164 additions and 155 deletions
--- a/analyzer/libretime_analyzer/pipeline/analyze_metadata.py
+++ b/analyzer/libretime_analyzer/pipeline/analyze_metadata.py
@ -1,116 +1,61 @@
 import datetime
 import hashlib
-import os
+from datetime import timedelta
-import wave
+from pathlib import Path
 from typing import Any, Dict
 import magic
 import mutagen
 from loguru import logger
-def analyze_metadata(filename: str, metadata: Dict[str, Any]):
+def analyze_metadata(filepath_: str, metadata: Dict[str, Any]):
    """Extract audio metadata from tags embedded in the file (eg. ID3 tags)
    Keyword arguments:
        filename: The path to the audio file to extract metadata from.
        metadata: A dictionary that the extracted metadata will be added to.
    """
-    if not isinstance(filename, str):
+    Extract audio metadata from tags embedded in the file using mutagen.
-        raise TypeError(
+    """
-            "filename must be string. Was of type " + type(filename).__name__
+    filepath = Path(filepath_)
        )
    if not isinstance(metadata, dict):
        raise TypeError(
            "metadata must be a dict. Was of type " + type(metadata).__name__
        )
    if not os.path.exists(filename):
        raise FileNotFoundError(f"audio file not found: {filename}")
-    # Airtime <= 2.5.x nonsense:
+    # Airtime <= 2.5.x required fields
    metadata["ftype"] = "audioclip"
    # Other fields we'll want to set for Airtime:
    metadata["hidden"] = False
-    # Get file size and md5 hash of the file
+    # Get file properties
-    metadata["filesize"] = os.path.getsize(filename)
+    metadata["filesize"] = filepath.stat().st_size
    metadata["md5"] = compute_md5(filepath)
-    with open(filename, "rb") as fh:
+    # Get audio file metadata
-        m = hashlib.md5()
+    extracted = mutagen.File(filepath, easy=True)
-        while True:
+    if extracted is None:
-            data = fh.read(8192)
+        logger.warning(f"no metadata were extracted for {filepath}")
            if not data:
                break
            m.update(data)
        metadata["md5"] = m.hexdigest()
    # Mutagen doesn't handle WAVE files so we use a different package
    ms = magic.open(magic.MIME_TYPE)
    ms.load()
    with open(filename, "rb") as fh:
        mime_check = ms.buffer(fh.read(2014))
    metadata["mime"] = mime_check
    if mime_check == "audio/x-wav":
        return _analyze_wave(filename, metadata)
    # Extract metadata from an audio file using mutagen
    audio_file = mutagen.File(filename, easy=True)
    # Bail if the file couldn't be parsed. The title should stay as the filename
    # inside Airtime.
    if (
        audio_file == None
    ):  # Don't use "if not" here. It is wrong due to mutagen's design.
        return metadata
    # Note that audio_file can equal {} if the file is valid but there's no metadata tags.
    # We can still try to grab the info variables below.
-    # Grab other file information that isn't encoded in a tag, but instead usually
+    metadata["mime"] = extracted.mime[0]
-    # in the file header. Mutagen breaks that out into a separate "info" object:
+
-    info = audio_file.info
+    info = extracted.info
-    if hasattr(info, "sample_rate"):  # Mutagen is annoying and inconsistent
+    if hasattr(info, "sample_rate"):
        metadata["sample_rate"] = info.sample_rate
    if hasattr(info, "length"):
        metadata["length_seconds"] = info.length
        # Converting the length in seconds (float) to a formatted time string
        track_length = datetime.timedelta(seconds=info.length)
        metadata["length"] = str(
            track_length
        )  # time.strftime("%H:%M:%S.%f", track_length)
        # Other fields for Airtime
        metadata["cueout"] = metadata["length"]
    # Set a default cue in time in seconds
    metadata["cuein"] = 0.0
    if hasattr(info, "bitrate"):
        metadata["bit_rate"] = info.bitrate
-    # Use the mutagen to get the MIME type, if it has one. This is more reliable and
+    if hasattr(info, "length"):
-    # consistent for certain types of MP3s or MPEG files than the MIMEs returned by magic.
+        metadata["length_seconds"] = info.length
-    if audio_file.mime:
+        metadata["length"] = str(timedelta(seconds=info.length))
        metadata["mime"] = audio_file.mime[0]
    # Try to get the number of channels if mutagen can...
    try:
-        # Special handling for getting the # of channels from MP3s. It's in the "mode" field
+        # Special handling for the number of channels in mp3 files.
-        # which is 0=Stereo, 1=Joint Stereo, 2=Dual Channel, 3=Mono. Part of the ID3 spec...
+        # 0=stereo, 1=joint stereo, 2=dual channel, 3=mono
-        if metadata["mime"] in ["audio/mpeg", "audio/mp3"]:
+        if metadata["mime"] in ("audio/mpeg", "audio/mp3"):
-            if info.mode == 3:
+            metadata["channels"] = 1 if info.mode == 3 else 2
                metadata["channels"] = 1
            else:
                metadata["channels"] = 2
        else:
            metadata["channels"] = info.channels
    except (AttributeError, KeyError):
        # If mutagen can't figure out the number of channels, we'll just leave it out...
        pass
    # Try to extract the number of tracks on the album if we can (the "track total")
    try:
-        track_number = audio_file["tracknumber"]
+        track_number = extracted["tracknumber"]
-        if isinstance(track_number, list):  # Sometimes tracknumber is a list, ugh
+
        if isinstance(track_number, list):
            track_number = track_number[0]
        track_number_tokens = track_number
        if "/" in track_number:
            track_number_tokens = track_number.split("/")
@ -122,12 +67,9 @@ def analyze_metadata(filename: str, metadata: Dict[str, Any]):
        track_total = track_number_tokens[1]
        metadata["track_total"] = track_total
    except (AttributeError, KeyError, IndexError):
        # If we couldn't figure out the track_number or track_total, just ignore it...
        pass
-    # We normalize the mutagen tags slightly here, so in case mutagen changes,
+    extracted_tags_mapping = {
    # we find the
    mutagen_to_airtime_mapping = {
        "title": "track_title",
        "artist": "artist_name",
        "album": "album_title",
@ -141,51 +83,43 @@ def analyze_metadata(filename: str, metadata: Dict[str, Any]):
        "isrc": "isrc",
        "label": "label",
        "organization": "label",
-        #'length':       'length',
+        # "length": "length",
        "language": "language",
        "last_modified": "last_modified",
        "mood": "mood",
        "bit_rate": "bit_rate",
        "replay_gain": "replaygain",
-        #'tracknumber':  'track_number',
+        # "tracknumber": "track_number",
-        #'track_total':  'track_total',
+        # "track_total": "track_total",
        "website": "website",
        "date": "year",
-        #'mime_type':    'mime',
+        # "mime_type": "mime",
    }
-    for mutagen_tag, airtime_tag in mutagen_to_airtime_mapping.items():
+    for extracted_key, metadata_key in extracted_tags_mapping.items():
        try:
-            metadata[airtime_tag] = audio_file[mutagen_tag]
+            metadata[metadata_key] = extracted[extracted_key]
-
+            if isinstance(metadata[metadata_key], list):
-            # Some tags are returned as lists because there could be multiple values.
+                if len(metadata[metadata_key]):
-            # This is unusual so we're going to always just take the first item in the list.
+                    metadata[metadata_key] = metadata[metadata_key][0]
-            if isinstance(metadata[airtime_tag], list):
+                else:
-                if metadata[airtime_tag]:
+                    metadata[metadata_key] = ""
                    metadata[airtime_tag] = metadata[airtime_tag][0]
                else:  # Handle empty lists
                    metadata[airtime_tag] = ""
        except KeyError:
            continue
    return metadata
-def _analyze_wave(filename, metadata):
+def compute_md5(filepath: Path) -> str:
-    try:
+    """
-        reader = wave.open(filename, "rb")
+    Compute a file md5sum.
-        metadata["channels"] = reader.getnchannels()
+    """
-        metadata["sample_rate"] = reader.getframerate()
+    with filepath.open("rb") as file:
-        length_seconds = float(reader.getnframes()) / float(metadata["sample_rate"])
+        buffer = hashlib.md5()  # nosec
-        # Converting the length in seconds (float) to a formatted time string
+        while True:
-        track_length = datetime.timedelta(seconds=length_seconds)
+            blob = file.read(8192)
-        metadata["length"] = str(
+            if not blob:
-            track_length
+                break
-        )  # time.strftime("%H:%M:%S.%f", track_length)
+            buffer.update(blob)
-        metadata["length_seconds"] = length_seconds
+
-        metadata["cueout"] = metadata["length"]
+        return buffer.hexdigest()
    except wave.Error as ex:
        logger.error(f"Invalid WAVE file: {str(ex)}")
        raise
    return metadata
--- a/analyzer/setup.py
+++ b/analyzer/setup.py
@ -30,9 +30,8 @@ setup(
    },
    python_requires=">=3.6",
    install_requires=[
-        "mutagen>=1.31.0",
+        "mutagen>=1.45.1",
        "pika>=1.0.0",
        "file-magic",
        "requests>=2.7.0",
        "typing_extensions",
    ],
--- a/analyzer/tests/fixtures/init.py
+++ b/analyzer/tests/fixtures/init.py
@ -40,6 +40,9 @@ Fixture(here / "s1-mono.m4a",           15.0,   6.0,    13.0,   -4.5    ),
 Fixture(here / "s1-stereo.m4a",         15.0,   6.0,    13.0,   -5.8    ),
 Fixture(here / "s1-mono.ogg",           15.0,   6.0,    13.0,   -4.9    ),
 Fixture(here / "s1-stereo.ogg",         15.0,   6.0,    13.0,   -5.7    ),
 Fixture(here / "s1-stereo",             15.0,   6.0,    13.0,   -5.7    ),
 Fixture(here / "s1-mono.wav",           15.0,   6.0,    13.0,   -2.3    ),
 Fixture(here / "s1-stereo.wav",         15.0,   6.0,    13.0,   -6.0    ),
 # sample 2
 # 0s   -> 1.8s: silence
 # 1.8s        : noise
@ -73,13 +76,14 @@ FixtureMeta = namedtuple(
 )
 meta = {
    "cuein": 0.0,
    "sample_rate": 48000,
    "length": str(timedelta(seconds=15)),
    "length_seconds": approx(15.0, abs=0.1),
    "ftype": "audioclip",
    "hidden": False,
-    # Tags
+}
 tags = {
    "album_title": "Test Album",
    "artist_name": "Test Artist",
    "track_title": "Test Title",
@ -95,6 +99,7 @@ FILES_TAGGED = [
        here / "s1-jointstereo-tagged.mp3",
        {
            **meta,
            **tags,
            "bit_rate": approx(128000, abs=1e2),
            "channels": 2,
            "mime": "audio/mp3",
@ -104,6 +109,7 @@ FILES_TAGGED = [
        here / "s1-mono-tagged.mp3",
        {
            **meta,
            **tags,
            "bit_rate": approx(64000, abs=1e2),
            "channels": 1,
            "mime": "audio/mp3",
@ -113,6 +119,7 @@ FILES_TAGGED = [
        here / "s1-stereo-tagged.mp3",
        {
            **meta,
            **tags,
            "bit_rate": approx(128000, abs=1e2),
            "channels": 2,
            "mime": "audio/mp3",
@ -122,6 +129,7 @@ FILES_TAGGED = [
        here / "s1-mono-tagged.flac",
        {
            **meta,
            **tags,
            "bit_rate": approx(452802, abs=1e2),
            "channels": 1,
            "mime": "audio/flac",
@ -131,6 +139,7 @@ FILES_TAGGED = [
        here / "s1-stereo-tagged.flac",
        {
            **meta,
            **tags,
            "bit_rate": approx(938593, abs=1e3),
            "channels": 2,
            "mime": "audio/flac",
@ -140,6 +149,7 @@ FILES_TAGGED = [
        here / "s1-mono-tagged.m4a",
        {
            **meta,
            **tags,
            "bit_rate": approx(65000, abs=5e4),
            "channels": 2,  # Weird
            "mime": "audio/mp4",
@ -149,6 +159,7 @@ FILES_TAGGED = [
        here / "s1-stereo-tagged.m4a",
        {
            **meta,
            **tags,
            "bit_rate": approx(128000, abs=1e5),
            "channels": 2,
            "mime": "audio/mp4",
@ -158,6 +169,7 @@ FILES_TAGGED = [
        here / "s1-mono-tagged.ogg",
        {
            **meta,
            **tags,
            "bit_rate": approx(80000, abs=1e2),
            "channels": 1,
            "mime": "audio/vorbis",
@ -167,15 +179,43 @@ FILES_TAGGED = [
        here / "s1-stereo-tagged.ogg",
        {
            **meta,
            **tags,
            "bit_rate": approx(112000, abs=1e2),
            "channels": 2,
            "mime": "audio/vorbis",
        },
    ),
    FixtureMeta(
        here / "s1-stereo-tagged",
        {
            **meta,
            **tags,
            "bit_rate": approx(112000, abs=1e2),
            "channels": 2,
            "mime": "audio/vorbis",
        },
    ),
    FixtureMeta(
        here / "s1-mono-tagged.wav",
        {
            **meta,
            "bit_rate": approx(96000, abs=1e2),
            "channels": 1,
            "mime": "audio/wav",
        },
    ),
    FixtureMeta(
        here / "s1-stereo-tagged.wav",
        {
            **meta,
            "bit_rate": approx(384000, abs=1e2),
            "channels": 2,
            "mime": "audio/wav",
        },
    ),
 ]
-meta = {
+tags = {
    **meta,
    "album_title": "Ä ä Ü ü ß",
    "artist_name": "てすと",
    "track_title": "ｱｲｳｴｵｶｷｸｹｺｻｼｽｾｿﾀﾁﾂﾃ",
@ -191,6 +231,7 @@ FILES_TAGGED += [
        here / "s1-jointstereo-tagged-utf8.mp3",
        {
            **meta,
            **tags,
            "bit_rate": approx(128000, abs=1e2),
            "channels": 2,
            "mime": "audio/mp3",
@ -200,6 +241,7 @@ FILES_TAGGED += [
        here / "s1-mono-tagged-utf8.mp3",
        {
            **meta,
            **tags,
            "bit_rate": approx(64000, abs=1e2),
            "channels": 1,
            "mime": "audio/mp3",
@ -209,6 +251,7 @@ FILES_TAGGED += [
        here / "s1-stereo-tagged-utf8.mp3",
        {
            **meta,
            **tags,
            "bit_rate": approx(128000, abs=1e2),
            "channels": 2,
            "mime": "audio/mp3",
@ -218,6 +261,7 @@ FILES_TAGGED += [
        here / "s1-mono-tagged-utf8.flac",
        {
            **meta,
            **tags,
            "bit_rate": approx(452802, abs=1e2),
            "channels": 1,
            "mime": "audio/flac",
@ -227,6 +271,7 @@ FILES_TAGGED += [
        here / "s1-stereo-tagged-utf8.flac",
        {
            **meta,
            **tags,
            "bit_rate": approx(938593, abs=1e2),
            "channels": 2,
            "mime": "audio/flac",
@ -236,6 +281,7 @@ FILES_TAGGED += [
        here / "s1-mono-tagged-utf8.m4a",
        {
            **meta,
            **tags,
            "bit_rate": approx(65000, abs=5e4),
            "channels": 2,  # Weird
            "mime": "audio/mp4",
@ -245,6 +291,7 @@ FILES_TAGGED += [
        here / "s1-stereo-tagged-utf8.m4a",
        {
            **meta,
            **tags,
            "bit_rate": approx(128000, abs=1e5),
            "channels": 2,
            "mime": "audio/mp4",
@ -254,6 +301,7 @@ FILES_TAGGED += [
        here / "s1-mono-tagged-utf8.ogg",
        {
            **meta,
            **tags,
            "bit_rate": approx(80000, abs=1e2),
            "channels": 1,
            "mime": "audio/vorbis",
@ -263,9 +311,38 @@ FILES_TAGGED += [
        here / "s1-stereo-tagged-utf8.ogg",
        {
            **meta,
            **tags,
            "bit_rate": approx(112000, abs=1e2),
            "channels": 2,
            "mime": "audio/vorbis",
        },
    ),
    FixtureMeta(
        here / "s1-stereo-tagged-utf8",
        {
            **meta,
            **tags,
            "bit_rate": approx(112000, abs=1e2),
            "channels": 2,
            "mime": "audio/vorbis",
        },
    ),
    FixtureMeta(
        here / "s1-mono-tagged-utf8.wav",
        {
            **meta,
            "bit_rate": approx(96000, abs=1e2),
            "channels": 1,
            "mime": "audio/wav",
        },
    ),
    FixtureMeta(
        here / "s1-stereo-tagged-utf8.wav",
        {
            **meta,
            "bit_rate": approx(384000, abs=1e2),
            "channels": 2,
            "mime": "audio/wav",
        },
    ),
 ]
--- a/analyzer/tests/fixtures/generate.sh
+++ b/analyzer/tests/fixtures/generate.sh
@ -38,10 +38,12 @@ generate() {
 # Generate sample 1
 generate  s1.flac s1-mono.flac         -ac 1   -acodec flac
 generate  s1.flac s1-mono.wav          -ac 1
 generate  s1.flac s1-mono.m4a          -ac 1   -acodec aac
 generate  s1.flac s1-mono.mp3          -ac 1   -acodec libmp3lame
 generate  s1.flac s1-mono.ogg          -ac 1   -acodec libvorbis
 generate  s1.flac s1-stereo.flac       -ac 2   -acodec flac
 generate  s1.flac s1-stereo.wav        -ac 2
 generate  s1.flac s1-stereo.m4a        -ac 2   -acodec aac
 generate  s1.flac s1-stereo.mp3        -ac 2   -acodec libmp3lame
 generate  s1.flac s1-stereo.ogg        -ac 2   -acodec libvorbis
@ -77,10 +79,12 @@ generate  s3.flac s3-stereo.ogg        -ac 2   -acodec libvorbis
 # Tag sample 1
 tag metadata.txt  s1-mono.flac         s1-mono-tagged.flac
 tag metadata.txt  s1-mono.wav          s1-mono-tagged.wav
 tag metadata.txt  s1-mono.m4a          s1-mono-tagged.m4a
 tag metadata.txt  s1-mono.mp3          s1-mono-tagged.mp3
 tag metadata.txt  s1-mono.ogg          s1-mono-tagged.ogg
 tag metadata.txt  s1-stereo.flac       s1-stereo-tagged.flac
 tag metadata.txt  s1-stereo.wav        s1-stereo-tagged.wav
 tag metadata.txt  s1-stereo.m4a        s1-stereo-tagged.m4a
 tag metadata.txt  s1-stereo.mp3        s1-stereo-tagged.mp3
 tag metadata.txt  s1-stereo.ogg        s1-stereo-tagged.ogg
@ -88,11 +92,18 @@ tag metadata.txt  s1-jointstereo.mp3   s1-jointstereo-tagged.mp3
 # Tag utf8 sample 1
 tag metadata-utf8.txt   s1-mono.flac       s1-mono-tagged-utf8.flac
 tag metadata-utf8.txt   s1-mono.wav        s1-mono-tagged-utf8.wav
 tag metadata-utf8.txt   s1-mono.m4a        s1-mono-tagged-utf8.m4a
 tag metadata-utf8.txt   s1-mono.mp3        s1-mono-tagged-utf8.mp3
 tag metadata-utf8.txt   s1-mono.ogg        s1-mono-tagged-utf8.ogg
 tag metadata-utf8.txt   s1-stereo.flac     s1-stereo-tagged-utf8.flac
 tag metadata-utf8.txt   s1-stereo.wav      s1-stereo-tagged-utf8.wav
 tag metadata-utf8.txt   s1-stereo.m4a      s1-stereo-tagged-utf8.m4a
 tag metadata-utf8.txt   s1-stereo.mp3      s1-stereo-tagged-utf8.mp3
 tag metadata-utf8.txt   s1-stereo.ogg      s1-stereo-tagged-utf8.ogg
 tag metadata-utf8.txt   s1-jointstereo.mp3 s1-jointstereo-tagged-utf8.mp3
 # Extension less files
 cp s1-stereo.ogg s1-stereo
 cp s1-stereo-tagged.ogg s1-stereo-tagged
 cp s1-stereo-tagged-utf8.ogg s1-stereo-tagged-utf8
--- a/analyzer/tests/pipeline/analyze_metadata_test.py
+++ b/analyzer/tests/pipeline/analyze_metadata_test.py
@ -1,66 +1,54 @@
 from pathlib import Path
 import pytest
-from libretime_analyzer.pipeline.analyze_metadata import analyze_metadata
+from libretime_analyzer.pipeline.analyze_metadata import analyze_metadata, compute_md5
 from ..fixtures import FILE_INVALID_DRM, FILE_INVALID_TXT, FILES_TAGGED
@pytest.mark.parametrize(
    "params,exception",
    [
        ((42, dict()), TypeError),
        (("foo", 3), TypeError),
    ],
 )
 def test_analyze_metadata_wrong_params(params, exception):
    with pytest.raises(exception):
        analyze_metadata(*params)
@pytest.mark.parametrize(
    "filepath,metadata",
-    map(lambda i: (str(i.path), i.metadata), FILES_TAGGED),
+    map(lambda i: (i.path, i.metadata), FILES_TAGGED),
 )
-def test_analyze_metadata(filepath: str, metadata: dict):
+def test_analyze_metadata(filepath: Path, metadata: dict):
-    found = analyze_metadata(filepath, dict())
+    found = analyze_metadata(str(filepath), {})
    # Mutagen does not support wav files yet
    if filepath.endswith("wav"):
        return
    assert len(found["md5"]) == 32
    del found["md5"]
    # Handle filesize
-    assert found["filesize"] < 2e6  # ~2Mb
+    assert found["filesize"] < 3e6  # ~3Mb
    assert found["filesize"] > 1e5  # 100Kb
    del found["filesize"]
-    # Handle track formatted length/cueout
+    # Handle track formatted length
    assert metadata["length"] in found["length"]
    assert metadata["length"] in found["cueout"]
    del metadata["length"]
    del found["length"]
    del found["cueout"]
    # mp3,ogg,flac files does not support comments yet
-    if not filepath.endswith("m4a"):
+    if not filepath.suffix == ".m4a":
-        del metadata["comment"]
+        if "comment" in metadata:
            del metadata["comment"]
    assert found == metadata
 def test_analyze_metadata_invalid_wma():
-    metadata = analyze_metadata(str(FILE_INVALID_DRM), dict())
+    metadata = analyze_metadata(str(FILE_INVALID_DRM), {})
    assert metadata["mime"] == "audio/x-ms-wma"
 def test_analyze_metadata_unparsable_file():
-    metadata = analyze_metadata(str(FILE_INVALID_TXT), dict())
+    metadata = analyze_metadata(str(FILE_INVALID_TXT), {})
    assert metadata == {
        "filesize": 10,
        "ftype": "audioclip",
        "hidden": False,
        "md5": "4d5e4b1c8e8febbd31fa9ce7f088beae",
        "mime": "text/plain",
    }
 def test_compute_md5():
    assert compute_md5(FILE_INVALID_TXT) == "4d5e4b1c8e8febbd31fa9ce7f088beae"