feat(analyzer): rework analyze_metadata step

- Upgrade mutagen to 1.45.1 - Remove custom wave data extraction - Add .wav and extension-less ogg fixtures - Move md5 sum and mime type into their own function - Cleanup comments - Let analyze_cuepoints handle cuein and cueout metadata - Remove python magic mime guessing
2022-02-14 21:35:15 +01:00 · 2022-02-14 21:35:15 +01:00 · 88dcd13fc8
commit 88dcd13fc8
parent 0106b4c6cb
5 changed files with 164 additions and 155 deletions
--- a/analyzer/libretime_analyzer/pipeline/analyze_metadata.py
+++ b/analyzer/libretime_analyzer/pipeline/analyze_metadata.py
@ -1,116 +1,61 @@
-import datetime
 import hashlib
-import os
-import wave
+from datetime import timedelta
+from pathlib import Path
 from typing import Any, Dict

-import magic
 import mutagen
 from loguru import logger


-def analyze_metadata(filename: str, metadata: Dict[str, Any]):
-    """Extract audio metadata from tags embedded in the file (eg. ID3 tags)
-
-    Keyword arguments:
-        filename: The path to the audio file to extract metadata from.
-        metadata: A dictionary that the extracted metadata will be added to.
+def analyze_metadata(filepath_: str, metadata: Dict[str, Any]):
    """
-    if not isinstance(filename, str):
-        raise TypeError(
-            "filename must be string. Was of type " + type(filename).__name__
-        )
-    if not isinstance(metadata, dict):
-        raise TypeError(
-            "metadata must be a dict. Was of type " + type(metadata).__name__
-        )
-    if not os.path.exists(filename):
-        raise FileNotFoundError(f"audio file not found: {filename}")
+    Extract audio metadata from tags embedded in the file using mutagen.
+    """
+    filepath = Path(filepath_)

-    # Airtime <= 2.5.x nonsense:
+    # Airtime <= 2.5.x required fields
    metadata["ftype"] = "audioclip"
-    # Other fields we'll want to set for Airtime:
    metadata["hidden"] = False

-    # Get file size and md5 hash of the file
-    metadata["filesize"] = os.path.getsize(filename)
+    # Get file properties
+    metadata["filesize"] = filepath.stat().st_size
+    metadata["md5"] = compute_md5(filepath)

-    with open(filename, "rb") as fh:
-        m = hashlib.md5()
-        while True:
-            data = fh.read(8192)
-            if not data:
-                break
-            m.update(data)
-        metadata["md5"] = m.hexdigest()
-
-    # Mutagen doesn't handle WAVE files so we use a different package
-    ms = magic.open(magic.MIME_TYPE)
-    ms.load()
-    with open(filename, "rb") as fh:
-        mime_check = ms.buffer(fh.read(2014))
-    metadata["mime"] = mime_check
-    if mime_check == "audio/x-wav":
-        return _analyze_wave(filename, metadata)
-
-    # Extract metadata from an audio file using mutagen
-    audio_file = mutagen.File(filename, easy=True)
-
-    # Bail if the file couldn't be parsed. The title should stay as the filename
-    # inside Airtime.
-    if (
-        audio_file == None
-    ):  # Don't use "if not" here. It is wrong due to mutagen's design.
+    # Get audio file metadata
+    extracted = mutagen.File(filepath, easy=True)
+    if extracted is None:
+        logger.warning(f"no metadata were extracted for {filepath}")
        return metadata
-    # Note that audio_file can equal {} if the file is valid but there's no metadata tags.
-    # We can still try to grab the info variables below.

-    # Grab other file information that isn't encoded in a tag, but instead usually
-    # in the file header. Mutagen breaks that out into a separate "info" object:
-    info = audio_file.info
-    if hasattr(info, "sample_rate"):  # Mutagen is annoying and inconsistent
+    metadata["mime"] = extracted.mime[0]
+
+    info = extracted.info
+    if hasattr(info, "sample_rate"):
        metadata["sample_rate"] = info.sample_rate
-    if hasattr(info, "length"):
-        metadata["length_seconds"] = info.length
-        # Converting the length in seconds (float) to a formatted time string
-        track_length = datetime.timedelta(seconds=info.length)
-        metadata["length"] = str(
-            track_length
-        )  # time.strftime("%H:%M:%S.%f", track_length)
-        # Other fields for Airtime
-        metadata["cueout"] = metadata["length"]
-
-    # Set a default cue in time in seconds
-    metadata["cuein"] = 0.0

    if hasattr(info, "bitrate"):
        metadata["bit_rate"] = info.bitrate

-    # Use the mutagen to get the MIME type, if it has one. This is more reliable and
-    # consistent for certain types of MP3s or MPEG files than the MIMEs returned by magic.
-    if audio_file.mime:
-        metadata["mime"] = audio_file.mime[0]
+    if hasattr(info, "length"):
+        metadata["length_seconds"] = info.length
+        metadata["length"] = str(timedelta(seconds=info.length))

-    # Try to get the number of channels if mutagen can...
    try:
-        # Special handling for getting the # of channels from MP3s. It's in the "mode" field
-        # which is 0=Stereo, 1=Joint Stereo, 2=Dual Channel, 3=Mono. Part of the ID3 spec...
-        if metadata["mime"] in ["audio/mpeg", "audio/mp3"]:
-            if info.mode == 3:
-                metadata["channels"] = 1
-            else:
-                metadata["channels"] = 2
+        # Special handling for the number of channels in mp3 files.
+        # 0=stereo, 1=joint stereo, 2=dual channel, 3=mono
+        if metadata["mime"] in ("audio/mpeg", "audio/mp3"):
+            metadata["channels"] = 1 if info.mode == 3 else 2
        else:
            metadata["channels"] = info.channels
    except (AttributeError, KeyError):
-        # If mutagen can't figure out the number of channels, we'll just leave it out...
        pass

-    # Try to extract the number of tracks on the album if we can (the "track total")
    try:
-        track_number = audio_file["tracknumber"]
-        if isinstance(track_number, list):  # Sometimes tracknumber is a list, ugh
+        track_number = extracted["tracknumber"]
+
+        if isinstance(track_number, list):
            track_number = track_number[0]
+
        track_number_tokens = track_number
        if "/" in track_number:
            track_number_tokens = track_number.split("/")
@ -122,12 +67,9 @@ def analyze_metadata(filename: str, metadata: Dict[str, Any]):
        track_total = track_number_tokens[1]
        metadata["track_total"] = track_total
    except (AttributeError, KeyError, IndexError):
-        # If we couldn't figure out the track_number or track_total, just ignore it...
        pass

-    # We normalize the mutagen tags slightly here, so in case mutagen changes,
-    # we find the
-    mutagen_to_airtime_mapping = {
+    extracted_tags_mapping = {
        "title": "track_title",
        "artist": "artist_name",
        "album": "album_title",
@ -141,51 +83,43 @@ def analyze_metadata(filename: str, metadata: Dict[str, Any]):
        "isrc": "isrc",
        "label": "label",
        "organization": "label",
-        #'length':       'length',
+        # "length": "length",
        "language": "language",
        "last_modified": "last_modified",
        "mood": "mood",
        "bit_rate": "bit_rate",
        "replay_gain": "replaygain",
-        #'tracknumber':  'track_number',
-        #'track_total':  'track_total',
+        # "tracknumber": "track_number",
+        # "track_total": "track_total",
        "website": "website",
        "date": "year",
-        #'mime_type':    'mime',
+        # "mime_type": "mime",
    }

-    for mutagen_tag, airtime_tag in mutagen_to_airtime_mapping.items():
+    for extracted_key, metadata_key in extracted_tags_mapping.items():
        try:
-            metadata[airtime_tag] = audio_file[mutagen_tag]
-
-            # Some tags are returned as lists because there could be multiple values.
-            # This is unusual so we're going to always just take the first item in the list.
-            if isinstance(metadata[airtime_tag], list):
-                if metadata[airtime_tag]:
-                    metadata[airtime_tag] = metadata[airtime_tag][0]
-                else:  # Handle empty lists
-                    metadata[airtime_tag] = ""
-
+            metadata[metadata_key] = extracted[extracted_key]
+            if isinstance(metadata[metadata_key], list):
+                if len(metadata[metadata_key]):
+                    metadata[metadata_key] = metadata[metadata_key][0]
+                else:
+                    metadata[metadata_key] = ""
        except KeyError:
            continue

    return metadata


-def _analyze_wave(filename, metadata):
-    try:
-        reader = wave.open(filename, "rb")
-        metadata["channels"] = reader.getnchannels()
-        metadata["sample_rate"] = reader.getframerate()
-        length_seconds = float(reader.getnframes()) / float(metadata["sample_rate"])
-        # Converting the length in seconds (float) to a formatted time string
-        track_length = datetime.timedelta(seconds=length_seconds)
-        metadata["length"] = str(
-            track_length
-        )  # time.strftime("%H:%M:%S.%f", track_length)
-        metadata["length_seconds"] = length_seconds
-        metadata["cueout"] = metadata["length"]
-    except wave.Error as ex:
-        logger.error(f"Invalid WAVE file: {str(ex)}")
-        raise
-    return metadata
+def compute_md5(filepath: Path) -> str:
+    """
+    Compute a file md5sum.
+    """
+    with filepath.open("rb") as file:
+        buffer = hashlib.md5()  # nosec
+        while True:
+            blob = file.read(8192)
+            if not blob:
+                break
+            buffer.update(blob)
+
+        return buffer.hexdigest()
--- a/analyzer/setup.py
+++ b/analyzer/setup.py
@ -30,9 +30,8 @@ setup(
    },
    python_requires=">=3.6",
    install_requires=[
-        "mutagen>=1.31.0",
+        "mutagen>=1.45.1",
        "pika>=1.0.0",
-        "file-magic",
        "requests>=2.7.0",
        "typing_extensions",
    ],
--- a/analyzer/tests/fixtures/init.py
+++ b/analyzer/tests/fixtures/init.py
@ -40,6 +40,9 @@ Fixture(here / "s1-mono.m4a",           15.0,   6.0,    13.0,   -4.5    ),
 Fixture(here / "s1-stereo.m4a",         15.0,   6.0,    13.0,   -5.8    ),
 Fixture(here / "s1-mono.ogg",           15.0,   6.0,    13.0,   -4.9    ),
 Fixture(here / "s1-stereo.ogg",         15.0,   6.0,    13.0,   -5.7    ),
+Fixture(here / "s1-stereo",             15.0,   6.0,    13.0,   -5.7    ),
+Fixture(here / "s1-mono.wav",           15.0,   6.0,    13.0,   -2.3    ),
+Fixture(here / "s1-stereo.wav",         15.0,   6.0,    13.0,   -6.0    ),
 # sample 2
 # 0s   -> 1.8s: silence
 # 1.8s        : noise
@ -73,13 +76,14 @@ FixtureMeta = namedtuple(
 )

 meta = {
-    "cuein": 0.0,
    "sample_rate": 48000,
    "length": str(timedelta(seconds=15)),
    "length_seconds": approx(15.0, abs=0.1),
    "ftype": "audioclip",
    "hidden": False,
-    # Tags
+}
+
+tags = {
    "album_title": "Test Album",
    "artist_name": "Test Artist",
    "track_title": "Test Title",
@ -95,6 +99,7 @@ FILES_TAGGED = [
        here / "s1-jointstereo-tagged.mp3",
        {
            **meta,
+            **tags,
            "bit_rate": approx(128000, abs=1e2),
            "channels": 2,
            "mime": "audio/mp3",
@ -104,6 +109,7 @@ FILES_TAGGED = [
        here / "s1-mono-tagged.mp3",
        {
            **meta,
+            **tags,
            "bit_rate": approx(64000, abs=1e2),
            "channels": 1,
            "mime": "audio/mp3",
@ -113,6 +119,7 @@ FILES_TAGGED = [
        here / "s1-stereo-tagged.mp3",
        {
            **meta,
+            **tags,
            "bit_rate": approx(128000, abs=1e2),
            "channels": 2,
            "mime": "audio/mp3",
@ -122,6 +129,7 @@ FILES_TAGGED = [
        here / "s1-mono-tagged.flac",
        {
            **meta,
+            **tags,
            "bit_rate": approx(452802, abs=1e2),
            "channels": 1,
            "mime": "audio/flac",
@ -131,6 +139,7 @@ FILES_TAGGED = [
        here / "s1-stereo-tagged.flac",
        {
            **meta,
+            **tags,
            "bit_rate": approx(938593, abs=1e3),
            "channels": 2,
            "mime": "audio/flac",
@ -140,6 +149,7 @@ FILES_TAGGED = [
        here / "s1-mono-tagged.m4a",
        {
            **meta,
+            **tags,
            "bit_rate": approx(65000, abs=5e4),
            "channels": 2,  # Weird
            "mime": "audio/mp4",
@ -149,6 +159,7 @@ FILES_TAGGED = [
        here / "s1-stereo-tagged.m4a",
        {
            **meta,
+            **tags,
            "bit_rate": approx(128000, abs=1e5),
            "channels": 2,
            "mime": "audio/mp4",
@ -158,6 +169,7 @@ FILES_TAGGED = [
        here / "s1-mono-tagged.ogg",
        {
            **meta,
+            **tags,
            "bit_rate": approx(80000, abs=1e2),
            "channels": 1,
            "mime": "audio/vorbis",
@ -167,15 +179,43 @@ FILES_TAGGED = [
        here / "s1-stereo-tagged.ogg",
        {
            **meta,
+            **tags,
            "bit_rate": approx(112000, abs=1e2),
            "channels": 2,
            "mime": "audio/vorbis",
        },
    ),
+    FixtureMeta(
+        here / "s1-stereo-tagged",
+        {
+            **meta,
+            **tags,
+            "bit_rate": approx(112000, abs=1e2),
+            "channels": 2,
+            "mime": "audio/vorbis",
+        },
+    ),
+    FixtureMeta(
+        here / "s1-mono-tagged.wav",
+        {
+            **meta,
+            "bit_rate": approx(96000, abs=1e2),
+            "channels": 1,
+            "mime": "audio/wav",
+        },
+    ),
+    FixtureMeta(
+        here / "s1-stereo-tagged.wav",
+        {
+            **meta,
+            "bit_rate": approx(384000, abs=1e2),
+            "channels": 2,
+            "mime": "audio/wav",
+        },
+    ),
 ]

-meta = {
-    **meta,
+tags = {
    "album_title": "Ä ä Ü ü ß",
    "artist_name": "てすと",
    "track_title": "ｱｲｳｴｵｶｷｸｹｺｻｼｽｾｿﾀﾁﾂﾃ",
@ -191,6 +231,7 @@ FILES_TAGGED += [
        here / "s1-jointstereo-tagged-utf8.mp3",
        {
            **meta,
+            **tags,
            "bit_rate": approx(128000, abs=1e2),
            "channels": 2,
            "mime": "audio/mp3",
@ -200,6 +241,7 @@ FILES_TAGGED += [
        here / "s1-mono-tagged-utf8.mp3",
        {
            **meta,
+            **tags,
            "bit_rate": approx(64000, abs=1e2),
            "channels": 1,
            "mime": "audio/mp3",
@ -209,6 +251,7 @@ FILES_TAGGED += [
        here / "s1-stereo-tagged-utf8.mp3",
        {
            **meta,
+            **tags,
            "bit_rate": approx(128000, abs=1e2),
            "channels": 2,
            "mime": "audio/mp3",
@ -218,6 +261,7 @@ FILES_TAGGED += [
        here / "s1-mono-tagged-utf8.flac",
        {
            **meta,
+            **tags,
            "bit_rate": approx(452802, abs=1e2),
            "channels": 1,
            "mime": "audio/flac",
@ -227,6 +271,7 @@ FILES_TAGGED += [
        here / "s1-stereo-tagged-utf8.flac",
        {
            **meta,
+            **tags,
            "bit_rate": approx(938593, abs=1e2),
            "channels": 2,
            "mime": "audio/flac",
@ -236,6 +281,7 @@ FILES_TAGGED += [
        here / "s1-mono-tagged-utf8.m4a",
        {
            **meta,
+            **tags,
            "bit_rate": approx(65000, abs=5e4),
            "channels": 2,  # Weird
            "mime": "audio/mp4",
@ -245,6 +291,7 @@ FILES_TAGGED += [
        here / "s1-stereo-tagged-utf8.m4a",
        {
            **meta,
+            **tags,
            "bit_rate": approx(128000, abs=1e5),
            "channels": 2,
            "mime": "audio/mp4",
@ -254,6 +301,7 @@ FILES_TAGGED += [
        here / "s1-mono-tagged-utf8.ogg",
        {
            **meta,
+            **tags,
            "bit_rate": approx(80000, abs=1e2),
            "channels": 1,
            "mime": "audio/vorbis",
@ -263,9 +311,38 @@ FILES_TAGGED += [
        here / "s1-stereo-tagged-utf8.ogg",
        {
            **meta,
+            **tags,
            "bit_rate": approx(112000, abs=1e2),
            "channels": 2,
            "mime": "audio/vorbis",
        },
    ),
+    FixtureMeta(
+        here / "s1-stereo-tagged-utf8",
+        {
+            **meta,
+            **tags,
+            "bit_rate": approx(112000, abs=1e2),
+            "channels": 2,
+            "mime": "audio/vorbis",
+        },
+    ),
+    FixtureMeta(
+        here / "s1-mono-tagged-utf8.wav",
+        {
+            **meta,
+            "bit_rate": approx(96000, abs=1e2),
+            "channels": 1,
+            "mime": "audio/wav",
+        },
+    ),
+    FixtureMeta(
+        here / "s1-stereo-tagged-utf8.wav",
+        {
+            **meta,
+            "bit_rate": approx(384000, abs=1e2),
+            "channels": 2,
+            "mime": "audio/wav",
+        },
+    ),
 ]
--- a/analyzer/tests/fixtures/generate.sh
+++ b/analyzer/tests/fixtures/generate.sh
@ -38,10 +38,12 @@ generate() {

 # Generate sample 1
 generate  s1.flac s1-mono.flac         -ac 1   -acodec flac
+generate  s1.flac s1-mono.wav          -ac 1
 generate  s1.flac s1-mono.m4a          -ac 1   -acodec aac
 generate  s1.flac s1-mono.mp3          -ac 1   -acodec libmp3lame
 generate  s1.flac s1-mono.ogg          -ac 1   -acodec libvorbis
 generate  s1.flac s1-stereo.flac       -ac 2   -acodec flac
+generate  s1.flac s1-stereo.wav        -ac 2
 generate  s1.flac s1-stereo.m4a        -ac 2   -acodec aac
 generate  s1.flac s1-stereo.mp3        -ac 2   -acodec libmp3lame
 generate  s1.flac s1-stereo.ogg        -ac 2   -acodec libvorbis
@ -77,10 +79,12 @@ generate  s3.flac s3-stereo.ogg        -ac 2   -acodec libvorbis

 # Tag sample 1
 tag metadata.txt  s1-mono.flac         s1-mono-tagged.flac
+tag metadata.txt  s1-mono.wav          s1-mono-tagged.wav
 tag metadata.txt  s1-mono.m4a          s1-mono-tagged.m4a
 tag metadata.txt  s1-mono.mp3          s1-mono-tagged.mp3
 tag metadata.txt  s1-mono.ogg          s1-mono-tagged.ogg
 tag metadata.txt  s1-stereo.flac       s1-stereo-tagged.flac
+tag metadata.txt  s1-stereo.wav        s1-stereo-tagged.wav
 tag metadata.txt  s1-stereo.m4a        s1-stereo-tagged.m4a
 tag metadata.txt  s1-stereo.mp3        s1-stereo-tagged.mp3
 tag metadata.txt  s1-stereo.ogg        s1-stereo-tagged.ogg
@ -88,11 +92,18 @@ tag metadata.txt  s1-jointstereo.mp3   s1-jointstereo-tagged.mp3

 # Tag utf8 sample 1
 tag metadata-utf8.txt   s1-mono.flac       s1-mono-tagged-utf8.flac
+tag metadata-utf8.txt   s1-mono.wav        s1-mono-tagged-utf8.wav
 tag metadata-utf8.txt   s1-mono.m4a        s1-mono-tagged-utf8.m4a
 tag metadata-utf8.txt   s1-mono.mp3        s1-mono-tagged-utf8.mp3
 tag metadata-utf8.txt   s1-mono.ogg        s1-mono-tagged-utf8.ogg
 tag metadata-utf8.txt   s1-stereo.flac     s1-stereo-tagged-utf8.flac
+tag metadata-utf8.txt   s1-stereo.wav      s1-stereo-tagged-utf8.wav
 tag metadata-utf8.txt   s1-stereo.m4a      s1-stereo-tagged-utf8.m4a
 tag metadata-utf8.txt   s1-stereo.mp3      s1-stereo-tagged-utf8.mp3
 tag metadata-utf8.txt   s1-stereo.ogg      s1-stereo-tagged-utf8.ogg
 tag metadata-utf8.txt   s1-jointstereo.mp3 s1-jointstereo-tagged-utf8.mp3
+
+# Extension less files
+cp s1-stereo.ogg s1-stereo
+cp s1-stereo-tagged.ogg s1-stereo-tagged
+cp s1-stereo-tagged-utf8.ogg s1-stereo-tagged-utf8
--- a/analyzer/tests/pipeline/analyze_metadata_test.py
+++ b/analyzer/tests/pipeline/analyze_metadata_test.py
@ -1,66 +1,54 @@
+from pathlib import Path
+
 import pytest

-from libretime_analyzer.pipeline.analyze_metadata import analyze_metadata
+from libretime_analyzer.pipeline.analyze_metadata import analyze_metadata, compute_md5

 from ..fixtures import FILE_INVALID_DRM, FILE_INVALID_TXT, FILES_TAGGED


-@pytest.mark.parametrize(
-    "params,exception",
-    [
-        ((42, dict()), TypeError),
-        (("foo", 3), TypeError),
-    ],
-)
-def test_analyze_metadata_wrong_params(params, exception):
-    with pytest.raises(exception):
-        analyze_metadata(*params)
-
-
@pytest.mark.parametrize(
    "filepath,metadata",
-    map(lambda i: (str(i.path), i.metadata), FILES_TAGGED),
+    map(lambda i: (i.path, i.metadata), FILES_TAGGED),
 )
-def test_analyze_metadata(filepath: str, metadata: dict):
-    found = analyze_metadata(filepath, dict())
-
-    # Mutagen does not support wav files yet
-    if filepath.endswith("wav"):
-        return
+def test_analyze_metadata(filepath: Path, metadata: dict):
+    found = analyze_metadata(str(filepath), {})

    assert len(found["md5"]) == 32
    del found["md5"]

    # Handle filesize
-    assert found["filesize"] < 2e6  # ~2Mb
+    assert found["filesize"] < 3e6  # ~3Mb
    assert found["filesize"] > 1e5  # 100Kb
    del found["filesize"]

-    # Handle track formatted length/cueout
+    # Handle track formatted length
    assert metadata["length"] in found["length"]
-    assert metadata["length"] in found["cueout"]
    del metadata["length"]
    del found["length"]
-    del found["cueout"]

    # mp3,ogg,flac files does not support comments yet
-    if not filepath.endswith("m4a"):
-        del metadata["comment"]
+    if not filepath.suffix == ".m4a":
+        if "comment" in metadata:
+            del metadata["comment"]

    assert found == metadata


 def test_analyze_metadata_invalid_wma():
-    metadata = analyze_metadata(str(FILE_INVALID_DRM), dict())
+    metadata = analyze_metadata(str(FILE_INVALID_DRM), {})
    assert metadata["mime"] == "audio/x-ms-wma"


 def test_analyze_metadata_unparsable_file():
-    metadata = analyze_metadata(str(FILE_INVALID_TXT), dict())
+    metadata = analyze_metadata(str(FILE_INVALID_TXT), {})
    assert metadata == {
        "filesize": 10,
        "ftype": "audioclip",
        "hidden": False,
        "md5": "4d5e4b1c8e8febbd31fa9ce7f088beae",
-        "mime": "text/plain",
    }
+
+
+def test_compute_md5():
+    assert compute_md5(FILE_INVALID_TXT) == "4d5e4b1c8e8febbd31fa9ce7f088beae"