diff --git a/analyzer/libretime_analyzer/pipeline/analyze_metadata.py b/analyzer/libretime_analyzer/pipeline/analyze_metadata.py index f61ce154f..648b57ffa 100644 --- a/analyzer/libretime_analyzer/pipeline/analyze_metadata.py +++ b/analyzer/libretime_analyzer/pipeline/analyze_metadata.py @@ -1,116 +1,61 @@ -import datetime import hashlib -import os -import wave +from datetime import timedelta +from pathlib import Path from typing import Any, Dict -import magic import mutagen from loguru import logger -def analyze_metadata(filename: str, metadata: Dict[str, Any]): - """Extract audio metadata from tags embedded in the file (eg. ID3 tags) - - Keyword arguments: - filename: The path to the audio file to extract metadata from. - metadata: A dictionary that the extracted metadata will be added to. +def analyze_metadata(filepath_: str, metadata: Dict[str, Any]): """ - if not isinstance(filename, str): - raise TypeError( - "filename must be string. Was of type " + type(filename).__name__ - ) - if not isinstance(metadata, dict): - raise TypeError( - "metadata must be a dict. Was of type " + type(metadata).__name__ - ) - if not os.path.exists(filename): - raise FileNotFoundError(f"audio file not found: {filename}") + Extract audio metadata from tags embedded in the file using mutagen. + """ + filepath = Path(filepath_) - # Airtime <= 2.5.x nonsense: + # Airtime <= 2.5.x required fields metadata["ftype"] = "audioclip" - # Other fields we'll want to set for Airtime: metadata["hidden"] = False - # Get file size and md5 hash of the file - metadata["filesize"] = os.path.getsize(filename) + # Get file properties + metadata["filesize"] = filepath.stat().st_size + metadata["md5"] = compute_md5(filepath) - with open(filename, "rb") as fh: - m = hashlib.md5() - while True: - data = fh.read(8192) - if not data: - break - m.update(data) - metadata["md5"] = m.hexdigest() - - # Mutagen doesn't handle WAVE files so we use a different package - ms = magic.open(magic.MIME_TYPE) - ms.load() - with open(filename, "rb") as fh: - mime_check = ms.buffer(fh.read(2014)) - metadata["mime"] = mime_check - if mime_check == "audio/x-wav": - return _analyze_wave(filename, metadata) - - # Extract metadata from an audio file using mutagen - audio_file = mutagen.File(filename, easy=True) - - # Bail if the file couldn't be parsed. The title should stay as the filename - # inside Airtime. - if ( - audio_file == None - ): # Don't use "if not" here. It is wrong due to mutagen's design. + # Get audio file metadata + extracted = mutagen.File(filepath, easy=True) + if extracted is None: + logger.warning(f"no metadata were extracted for {filepath}") return metadata - # Note that audio_file can equal {} if the file is valid but there's no metadata tags. - # We can still try to grab the info variables below. - # Grab other file information that isn't encoded in a tag, but instead usually - # in the file header. Mutagen breaks that out into a separate "info" object: - info = audio_file.info - if hasattr(info, "sample_rate"): # Mutagen is annoying and inconsistent + metadata["mime"] = extracted.mime[0] + + info = extracted.info + if hasattr(info, "sample_rate"): metadata["sample_rate"] = info.sample_rate - if hasattr(info, "length"): - metadata["length_seconds"] = info.length - # Converting the length in seconds (float) to a formatted time string - track_length = datetime.timedelta(seconds=info.length) - metadata["length"] = str( - track_length - ) # time.strftime("%H:%M:%S.%f", track_length) - # Other fields for Airtime - metadata["cueout"] = metadata["length"] - - # Set a default cue in time in seconds - metadata["cuein"] = 0.0 if hasattr(info, "bitrate"): metadata["bit_rate"] = info.bitrate - # Use the mutagen to get the MIME type, if it has one. This is more reliable and - # consistent for certain types of MP3s or MPEG files than the MIMEs returned by magic. - if audio_file.mime: - metadata["mime"] = audio_file.mime[0] + if hasattr(info, "length"): + metadata["length_seconds"] = info.length + metadata["length"] = str(timedelta(seconds=info.length)) - # Try to get the number of channels if mutagen can... try: - # Special handling for getting the # of channels from MP3s. It's in the "mode" field - # which is 0=Stereo, 1=Joint Stereo, 2=Dual Channel, 3=Mono. Part of the ID3 spec... - if metadata["mime"] in ["audio/mpeg", "audio/mp3"]: - if info.mode == 3: - metadata["channels"] = 1 - else: - metadata["channels"] = 2 + # Special handling for the number of channels in mp3 files. + # 0=stereo, 1=joint stereo, 2=dual channel, 3=mono + if metadata["mime"] in ("audio/mpeg", "audio/mp3"): + metadata["channels"] = 1 if info.mode == 3 else 2 else: metadata["channels"] = info.channels except (AttributeError, KeyError): - # If mutagen can't figure out the number of channels, we'll just leave it out... pass - # Try to extract the number of tracks on the album if we can (the "track total") try: - track_number = audio_file["tracknumber"] - if isinstance(track_number, list): # Sometimes tracknumber is a list, ugh + track_number = extracted["tracknumber"] + + if isinstance(track_number, list): track_number = track_number[0] + track_number_tokens = track_number if "/" in track_number: track_number_tokens = track_number.split("/") @@ -122,12 +67,9 @@ def analyze_metadata(filename: str, metadata: Dict[str, Any]): track_total = track_number_tokens[1] metadata["track_total"] = track_total except (AttributeError, KeyError, IndexError): - # If we couldn't figure out the track_number or track_total, just ignore it... pass - # We normalize the mutagen tags slightly here, so in case mutagen changes, - # we find the - mutagen_to_airtime_mapping = { + extracted_tags_mapping = { "title": "track_title", "artist": "artist_name", "album": "album_title", @@ -141,51 +83,43 @@ def analyze_metadata(filename: str, metadata: Dict[str, Any]): "isrc": "isrc", "label": "label", "organization": "label", - #'length': 'length', + # "length": "length", "language": "language", "last_modified": "last_modified", "mood": "mood", "bit_rate": "bit_rate", "replay_gain": "replaygain", - #'tracknumber': 'track_number', - #'track_total': 'track_total', + # "tracknumber": "track_number", + # "track_total": "track_total", "website": "website", "date": "year", - #'mime_type': 'mime', + # "mime_type": "mime", } - for mutagen_tag, airtime_tag in mutagen_to_airtime_mapping.items(): + for extracted_key, metadata_key in extracted_tags_mapping.items(): try: - metadata[airtime_tag] = audio_file[mutagen_tag] - - # Some tags are returned as lists because there could be multiple values. - # This is unusual so we're going to always just take the first item in the list. - if isinstance(metadata[airtime_tag], list): - if metadata[airtime_tag]: - metadata[airtime_tag] = metadata[airtime_tag][0] - else: # Handle empty lists - metadata[airtime_tag] = "" - + metadata[metadata_key] = extracted[extracted_key] + if isinstance(metadata[metadata_key], list): + if len(metadata[metadata_key]): + metadata[metadata_key] = metadata[metadata_key][0] + else: + metadata[metadata_key] = "" except KeyError: continue return metadata -def _analyze_wave(filename, metadata): - try: - reader = wave.open(filename, "rb") - metadata["channels"] = reader.getnchannels() - metadata["sample_rate"] = reader.getframerate() - length_seconds = float(reader.getnframes()) / float(metadata["sample_rate"]) - # Converting the length in seconds (float) to a formatted time string - track_length = datetime.timedelta(seconds=length_seconds) - metadata["length"] = str( - track_length - ) # time.strftime("%H:%M:%S.%f", track_length) - metadata["length_seconds"] = length_seconds - metadata["cueout"] = metadata["length"] - except wave.Error as ex: - logger.error(f"Invalid WAVE file: {str(ex)}") - raise - return metadata +def compute_md5(filepath: Path) -> str: + """ + Compute a file md5sum. + """ + with filepath.open("rb") as file: + buffer = hashlib.md5() # nosec + while True: + blob = file.read(8192) + if not blob: + break + buffer.update(blob) + + return buffer.hexdigest() diff --git a/analyzer/setup.py b/analyzer/setup.py index f7f7e2d43..4d4ef6973 100644 --- a/analyzer/setup.py +++ b/analyzer/setup.py @@ -30,9 +30,8 @@ setup( }, python_requires=">=3.6", install_requires=[ - "mutagen>=1.31.0", + "mutagen>=1.45.1", "pika>=1.0.0", - "file-magic", "requests>=2.7.0", "typing_extensions", ], diff --git a/analyzer/tests/fixtures/__init__.py b/analyzer/tests/fixtures/__init__.py index 01bfc2de1..4014be827 100644 --- a/analyzer/tests/fixtures/__init__.py +++ b/analyzer/tests/fixtures/__init__.py @@ -40,6 +40,9 @@ Fixture(here / "s1-mono.m4a", 15.0, 6.0, 13.0, -4.5 ), Fixture(here / "s1-stereo.m4a", 15.0, 6.0, 13.0, -5.8 ), Fixture(here / "s1-mono.ogg", 15.0, 6.0, 13.0, -4.9 ), Fixture(here / "s1-stereo.ogg", 15.0, 6.0, 13.0, -5.7 ), +Fixture(here / "s1-stereo", 15.0, 6.0, 13.0, -5.7 ), +Fixture(here / "s1-mono.wav", 15.0, 6.0, 13.0, -2.3 ), +Fixture(here / "s1-stereo.wav", 15.0, 6.0, 13.0, -6.0 ), # sample 2 # 0s -> 1.8s: silence # 1.8s : noise @@ -73,13 +76,14 @@ FixtureMeta = namedtuple( ) meta = { - "cuein": 0.0, "sample_rate": 48000, "length": str(timedelta(seconds=15)), "length_seconds": approx(15.0, abs=0.1), "ftype": "audioclip", "hidden": False, - # Tags +} + +tags = { "album_title": "Test Album", "artist_name": "Test Artist", "track_title": "Test Title", @@ -95,6 +99,7 @@ FILES_TAGGED = [ here / "s1-jointstereo-tagged.mp3", { **meta, + **tags, "bit_rate": approx(128000, abs=1e2), "channels": 2, "mime": "audio/mp3", @@ -104,6 +109,7 @@ FILES_TAGGED = [ here / "s1-mono-tagged.mp3", { **meta, + **tags, "bit_rate": approx(64000, abs=1e2), "channels": 1, "mime": "audio/mp3", @@ -113,6 +119,7 @@ FILES_TAGGED = [ here / "s1-stereo-tagged.mp3", { **meta, + **tags, "bit_rate": approx(128000, abs=1e2), "channels": 2, "mime": "audio/mp3", @@ -122,6 +129,7 @@ FILES_TAGGED = [ here / "s1-mono-tagged.flac", { **meta, + **tags, "bit_rate": approx(452802, abs=1e2), "channels": 1, "mime": "audio/flac", @@ -131,6 +139,7 @@ FILES_TAGGED = [ here / "s1-stereo-tagged.flac", { **meta, + **tags, "bit_rate": approx(938593, abs=1e3), "channels": 2, "mime": "audio/flac", @@ -140,6 +149,7 @@ FILES_TAGGED = [ here / "s1-mono-tagged.m4a", { **meta, + **tags, "bit_rate": approx(65000, abs=5e4), "channels": 2, # Weird "mime": "audio/mp4", @@ -149,6 +159,7 @@ FILES_TAGGED = [ here / "s1-stereo-tagged.m4a", { **meta, + **tags, "bit_rate": approx(128000, abs=1e5), "channels": 2, "mime": "audio/mp4", @@ -158,6 +169,7 @@ FILES_TAGGED = [ here / "s1-mono-tagged.ogg", { **meta, + **tags, "bit_rate": approx(80000, abs=1e2), "channels": 1, "mime": "audio/vorbis", @@ -167,15 +179,43 @@ FILES_TAGGED = [ here / "s1-stereo-tagged.ogg", { **meta, + **tags, "bit_rate": approx(112000, abs=1e2), "channels": 2, "mime": "audio/vorbis", }, ), + FixtureMeta( + here / "s1-stereo-tagged", + { + **meta, + **tags, + "bit_rate": approx(112000, abs=1e2), + "channels": 2, + "mime": "audio/vorbis", + }, + ), + FixtureMeta( + here / "s1-mono-tagged.wav", + { + **meta, + "bit_rate": approx(96000, abs=1e2), + "channels": 1, + "mime": "audio/wav", + }, + ), + FixtureMeta( + here / "s1-stereo-tagged.wav", + { + **meta, + "bit_rate": approx(384000, abs=1e2), + "channels": 2, + "mime": "audio/wav", + }, + ), ] -meta = { - **meta, +tags = { "album_title": "Ä ä Ü ü ß", "artist_name": "てすと", "track_title": "アイウエオカキクケコサシスセソタチツテ", @@ -191,6 +231,7 @@ FILES_TAGGED += [ here / "s1-jointstereo-tagged-utf8.mp3", { **meta, + **tags, "bit_rate": approx(128000, abs=1e2), "channels": 2, "mime": "audio/mp3", @@ -200,6 +241,7 @@ FILES_TAGGED += [ here / "s1-mono-tagged-utf8.mp3", { **meta, + **tags, "bit_rate": approx(64000, abs=1e2), "channels": 1, "mime": "audio/mp3", @@ -209,6 +251,7 @@ FILES_TAGGED += [ here / "s1-stereo-tagged-utf8.mp3", { **meta, + **tags, "bit_rate": approx(128000, abs=1e2), "channels": 2, "mime": "audio/mp3", @@ -218,6 +261,7 @@ FILES_TAGGED += [ here / "s1-mono-tagged-utf8.flac", { **meta, + **tags, "bit_rate": approx(452802, abs=1e2), "channels": 1, "mime": "audio/flac", @@ -227,6 +271,7 @@ FILES_TAGGED += [ here / "s1-stereo-tagged-utf8.flac", { **meta, + **tags, "bit_rate": approx(938593, abs=1e2), "channels": 2, "mime": "audio/flac", @@ -236,6 +281,7 @@ FILES_TAGGED += [ here / "s1-mono-tagged-utf8.m4a", { **meta, + **tags, "bit_rate": approx(65000, abs=5e4), "channels": 2, # Weird "mime": "audio/mp4", @@ -245,6 +291,7 @@ FILES_TAGGED += [ here / "s1-stereo-tagged-utf8.m4a", { **meta, + **tags, "bit_rate": approx(128000, abs=1e5), "channels": 2, "mime": "audio/mp4", @@ -254,6 +301,7 @@ FILES_TAGGED += [ here / "s1-mono-tagged-utf8.ogg", { **meta, + **tags, "bit_rate": approx(80000, abs=1e2), "channels": 1, "mime": "audio/vorbis", @@ -263,9 +311,38 @@ FILES_TAGGED += [ here / "s1-stereo-tagged-utf8.ogg", { **meta, + **tags, "bit_rate": approx(112000, abs=1e2), "channels": 2, "mime": "audio/vorbis", }, ), + FixtureMeta( + here / "s1-stereo-tagged-utf8", + { + **meta, + **tags, + "bit_rate": approx(112000, abs=1e2), + "channels": 2, + "mime": "audio/vorbis", + }, + ), + FixtureMeta( + here / "s1-mono-tagged-utf8.wav", + { + **meta, + "bit_rate": approx(96000, abs=1e2), + "channels": 1, + "mime": "audio/wav", + }, + ), + FixtureMeta( + here / "s1-stereo-tagged-utf8.wav", + { + **meta, + "bit_rate": approx(384000, abs=1e2), + "channels": 2, + "mime": "audio/wav", + }, + ), ] diff --git a/analyzer/tests/fixtures/generate.sh b/analyzer/tests/fixtures/generate.sh index 113195349..e735add8c 100755 --- a/analyzer/tests/fixtures/generate.sh +++ b/analyzer/tests/fixtures/generate.sh @@ -38,10 +38,12 @@ generate() { # Generate sample 1 generate s1.flac s1-mono.flac -ac 1 -acodec flac +generate s1.flac s1-mono.wav -ac 1 generate s1.flac s1-mono.m4a -ac 1 -acodec aac generate s1.flac s1-mono.mp3 -ac 1 -acodec libmp3lame generate s1.flac s1-mono.ogg -ac 1 -acodec libvorbis generate s1.flac s1-stereo.flac -ac 2 -acodec flac +generate s1.flac s1-stereo.wav -ac 2 generate s1.flac s1-stereo.m4a -ac 2 -acodec aac generate s1.flac s1-stereo.mp3 -ac 2 -acodec libmp3lame generate s1.flac s1-stereo.ogg -ac 2 -acodec libvorbis @@ -77,10 +79,12 @@ generate s3.flac s3-stereo.ogg -ac 2 -acodec libvorbis # Tag sample 1 tag metadata.txt s1-mono.flac s1-mono-tagged.flac +tag metadata.txt s1-mono.wav s1-mono-tagged.wav tag metadata.txt s1-mono.m4a s1-mono-tagged.m4a tag metadata.txt s1-mono.mp3 s1-mono-tagged.mp3 tag metadata.txt s1-mono.ogg s1-mono-tagged.ogg tag metadata.txt s1-stereo.flac s1-stereo-tagged.flac +tag metadata.txt s1-stereo.wav s1-stereo-tagged.wav tag metadata.txt s1-stereo.m4a s1-stereo-tagged.m4a tag metadata.txt s1-stereo.mp3 s1-stereo-tagged.mp3 tag metadata.txt s1-stereo.ogg s1-stereo-tagged.ogg @@ -88,11 +92,18 @@ tag metadata.txt s1-jointstereo.mp3 s1-jointstereo-tagged.mp3 # Tag utf8 sample 1 tag metadata-utf8.txt s1-mono.flac s1-mono-tagged-utf8.flac +tag metadata-utf8.txt s1-mono.wav s1-mono-tagged-utf8.wav tag metadata-utf8.txt s1-mono.m4a s1-mono-tagged-utf8.m4a tag metadata-utf8.txt s1-mono.mp3 s1-mono-tagged-utf8.mp3 tag metadata-utf8.txt s1-mono.ogg s1-mono-tagged-utf8.ogg tag metadata-utf8.txt s1-stereo.flac s1-stereo-tagged-utf8.flac +tag metadata-utf8.txt s1-stereo.wav s1-stereo-tagged-utf8.wav tag metadata-utf8.txt s1-stereo.m4a s1-stereo-tagged-utf8.m4a tag metadata-utf8.txt s1-stereo.mp3 s1-stereo-tagged-utf8.mp3 tag metadata-utf8.txt s1-stereo.ogg s1-stereo-tagged-utf8.ogg tag metadata-utf8.txt s1-jointstereo.mp3 s1-jointstereo-tagged-utf8.mp3 + +# Extension less files +cp s1-stereo.ogg s1-stereo +cp s1-stereo-tagged.ogg s1-stereo-tagged +cp s1-stereo-tagged-utf8.ogg s1-stereo-tagged-utf8 diff --git a/analyzer/tests/pipeline/analyze_metadata_test.py b/analyzer/tests/pipeline/analyze_metadata_test.py index a5646434f..b1ac6c4ca 100644 --- a/analyzer/tests/pipeline/analyze_metadata_test.py +++ b/analyzer/tests/pipeline/analyze_metadata_test.py @@ -1,66 +1,54 @@ +from pathlib import Path + import pytest -from libretime_analyzer.pipeline.analyze_metadata import analyze_metadata +from libretime_analyzer.pipeline.analyze_metadata import analyze_metadata, compute_md5 from ..fixtures import FILE_INVALID_DRM, FILE_INVALID_TXT, FILES_TAGGED -@pytest.mark.parametrize( - "params,exception", - [ - ((42, dict()), TypeError), - (("foo", 3), TypeError), - ], -) -def test_analyze_metadata_wrong_params(params, exception): - with pytest.raises(exception): - analyze_metadata(*params) - - @pytest.mark.parametrize( "filepath,metadata", - map(lambda i: (str(i.path), i.metadata), FILES_TAGGED), + map(lambda i: (i.path, i.metadata), FILES_TAGGED), ) -def test_analyze_metadata(filepath: str, metadata: dict): - found = analyze_metadata(filepath, dict()) - - # Mutagen does not support wav files yet - if filepath.endswith("wav"): - return +def test_analyze_metadata(filepath: Path, metadata: dict): + found = analyze_metadata(str(filepath), {}) assert len(found["md5"]) == 32 del found["md5"] # Handle filesize - assert found["filesize"] < 2e6 # ~2Mb + assert found["filesize"] < 3e6 # ~3Mb assert found["filesize"] > 1e5 # 100Kb del found["filesize"] - # Handle track formatted length/cueout + # Handle track formatted length assert metadata["length"] in found["length"] - assert metadata["length"] in found["cueout"] del metadata["length"] del found["length"] - del found["cueout"] # mp3,ogg,flac files does not support comments yet - if not filepath.endswith("m4a"): - del metadata["comment"] + if not filepath.suffix == ".m4a": + if "comment" in metadata: + del metadata["comment"] assert found == metadata def test_analyze_metadata_invalid_wma(): - metadata = analyze_metadata(str(FILE_INVALID_DRM), dict()) + metadata = analyze_metadata(str(FILE_INVALID_DRM), {}) assert metadata["mime"] == "audio/x-ms-wma" def test_analyze_metadata_unparsable_file(): - metadata = analyze_metadata(str(FILE_INVALID_TXT), dict()) + metadata = analyze_metadata(str(FILE_INVALID_TXT), {}) assert metadata == { "filesize": 10, "ftype": "audioclip", "hidden": False, "md5": "4d5e4b1c8e8febbd31fa9ce7f088beae", - "mime": "text/plain", } + + +def test_compute_md5(): + assert compute_md5(FILE_INVALID_TXT) == "4d5e4b1c8e8febbd31fa9ce7f088beae"