fix(worker): rewrite podcast download task

- Fixes bad exception handling when facing an invalid podcast episode file. Fix #2083
This commit is contained in:
jo 2022-09-09 13:08:52 +02:00 committed by Kyle Robbertze
parent 2afb766b41
commit 08a44186aa
9 changed files with 141 additions and 96 deletions

View File

@ -181,11 +181,11 @@ class Application_Service_PodcastEpisodeService extends Application_Service_Thir
private function _download($id, $url, $title, $album_override, $track_title = null) private function _download($id, $url, $title, $album_override, $track_title = null)
{ {
$data = [ $data = [
'id' => $id, 'episode_id' => $id,
'url' => $url, 'episode_url' => $url,
'episode_title' => $track_title,
'podcast_name' => $title, 'podcast_name' => $title,
'album_override' => $album_override, 'override_album' => $album_override,
'track_title' => $track_title,
]; ];
$task = $this->_executeTask(static::$_CELERY_TASKS[self::DOWNLOAD], $data); $task = $this->_executeTask(static::$_CELERY_TASKS[self::DOWNLOAD], $data);
// Get the created ThirdPartyTaskReference and set the episode ID so // Get the created ThirdPartyTaskReference and set the episode ID so

View File

@ -1,18 +1,16 @@
import json import json
import shutil
import tempfile
import traceback
from cgi import parse_header from cgi import parse_header
from contextlib import closing
from pathlib import Path from pathlib import Path
from typing import Optional from tempfile import NamedTemporaryFile
from typing import Any, Dict, Optional
from urllib.parse import urlsplit from urllib.parse import urlsplit
import mutagen import mutagen
import requests import requests
from celery import Celery from celery import Celery
from celery.utils.log import get_task_logger from celery.utils.log import get_task_logger
from requests import Response from mutagen import MutagenError
from requests import RequestException, Response
from .config import config from .config import config
@ -22,108 +20,95 @@ logger = get_task_logger(__name__)
@worker.task(name="podcast-download", acks_late=True) @worker.task(name="podcast-download", acks_late=True)
def podcast_download( def podcast_download(
id: int, episode_id: int,
url: str, episode_url: str,
episode_title: Optional[str],
podcast_name: str, podcast_name: str,
album_override: bool, override_album: bool,
track_title: Optional[str],
): ):
""" """
Download a podcast episode. Download a podcast episode.
Args: Args:
id: Episode ID. episode_id: Episode ID.
url: Episode download url. episode_url: Episode download url.
episode_title: Episode title to override the title metadata.
podcast_name: Podcast name to save to the metadata. podcast_name: Podcast name to save to the metadata.
album_override: Whether to override the album metadata. override_album: Whether to override the album metadata.
track_title: Episode title to override the title metadata.
Returns: Returns:
Status of the podcast download as JSON string. Status of the podcast download as JSON string.
""" """
# Object to store file IDs, episode IDs, and download status result: Dict[str, Any] = {"episodeid": episode_id}
# (important if there's an error before the file is posted)
obj = {"episodeid": id}
try: try:
re = None # Download podcast episode file
with closing(requests.get(url, stream=True)) as r:
filename = extract_filename(r)
with tempfile.NamedTemporaryFile(mode="wb+", delete=False) as audiofile:
r.raw.decode_content = True
shutil.copyfileobj(r.raw, audiofile)
# mutagen should be able to guess the write file type
metadata_audiofile = mutagen.File(audiofile.name, easy=True)
# if for some reason this should fail lets try it as a mp3 specific code
if metadata_audiofile == None:
# if this happens then mutagen couldn't guess what type of file it is
mp3suffix = ("mp3", "MP3", "Mp3", "mP3")
# so we treat it like a mp3 if it has a mp3 file extension and hope for the best
if filename.endswith(mp3suffix):
metadata_audiofile = mutagen.mp3.MP3(
audiofile.name, ID3=mutagen.easyid3.EasyID3
)
# replace track metadata as indicated by album_override setting
# replace album title as needed
metadata_audiofile = podcast_override_metadata(
metadata_audiofile, podcast_name, album_override, track_title
)
metadata_audiofile.save()
filetypeinfo = metadata_audiofile.pprint()
logger.info(
"filetypeinfo is {}".format(filetypeinfo.encode("ascii", "ignore"))
)
callback_url = f"{config.general.public_url}/rest/media"
callback_api_key = config.general.api_key
re = requests.post(
callback_url,
files={"file": (filename, open(audiofile.name, "rb"))},
auth=requests.auth.HTTPBasicAuth(callback_api_key, ""),
)
re.raise_for_status()
try: try:
response = re.content.decode() with requests.get(episode_url, stream=True, timeout=30) as resp:
except (UnicodeDecodeError, AttributeError): resp.raise_for_status()
response = re.content
f = json.loads(
response
) # Read the response from the media API to get the file id
obj["fileid"] = f["id"]
obj["status"] = 1
except Exception as e:
obj["error"] = e.message
logger.info(f"Error during file download: {e}")
logger.debug("Original Traceback: %s" % (traceback.format_exc(e)))
obj["status"] = 0
return json.dumps(obj)
filename = extract_filename(resp)
def podcast_override_metadata(m, podcast_name, override, track_title): # The filename extension helps to determine the file type using mutagen
""" with NamedTemporaryFile(suffix=filename, delete=False) as tmp_file:
Override m['album'] if empty or forced with override arg for chunk in resp.iter_content(chunk_size=2048):
""" tmp_file.write(chunk)
# if the album override option is enabled replace the album id3 tag with the podcast name even if the album tag contains data
if override is True: except RequestException as exception:
logger.debug( logger.exception(f"could not download podcast episode {episode_id}")
"overriding album name to {} in podcast".format( raise exception
podcast_name.encode("ascii", "ignore")
) # Save metadata to podcast episode file
)
m["album"] = podcast_name
m["title"] = track_title
m["artist"] = podcast_name
else:
# replace the album id3 tag with the podcast name if the album tag is empty
try: try:
m["album"] metadata = mutagen.File(tmp_file.name, easy=True)
except KeyError: if metadata is None:
logger.debug( raise MutagenError(
"setting new album name to {} in podcast".format( f"could not determine episode {episode_id} file type"
podcast_name.encode("ascii", "ignore")
) )
)
m["album"] = podcast_name if override_album:
return m logger.debug(f"overriding album name with podcast name {podcast_name}")
metadata["artist"] = podcast_name
metadata["album"] = podcast_name
metadata["title"] = episode_title
elif "album" not in metadata:
logger.debug(f"setting album name to podcast name {podcast_name}")
metadata["album"] = podcast_name
metadata.save()
logger.debug(f"saved metadata {metadata}")
except MutagenError as exception:
logger.exception(f"could not save episode {episode_id} metadata")
raise exception
# Upload podcast episode file
try:
with requests.post(
f"{config.general.public_url}/rest/media",
files={"file": (filename, open(tmp_file.name, "rb"))},
auth=(config.general.api_key, ""),
timeout=30,
) as upload_resp:
upload_resp.raise_for_status()
upload_payload = upload_resp.json()
result["fileid"] = upload_payload["id"]
result["status"] = 1
except RequestException as exception:
logger.exception(f"could not upload episode {episode_id}")
raise exception
except (RequestException, MutagenError) as exception:
result["status"] = 0
result["error"] = str(exception)
if tmp_file is not None:
Path(tmp_file.name).unlink()
return json.dumps(result)
def extract_filename(response: Response) -> str: def extract_filename(response: Response) -> str:

View File

@ -22,6 +22,7 @@ setup(
], ],
extras_require={ extras_require={
"dev": [ "dev": [
"requests-mock",
"types-requests", "types-requests",
], ],
}, },

0
worker/tests/__init__.py Normal file
View File

6
worker/tests/fixtures/__init__.py vendored Normal file
View File

@ -0,0 +1,6 @@
from pathlib import Path
here = Path(__file__).parent
fixtures_path = here
# malformed.mp3 is related to https://github.com/libretime/libretime/issues/668!!

BIN
worker/tests/fixtures/malformed.mp3 vendored Normal file

Binary file not shown.

Binary file not shown.

BIN
worker/tests/fixtures/s1-stereo.ogg vendored Normal file

Binary file not shown.

View File

@ -1,7 +1,60 @@
import json
import pytest import pytest
from requests import Response from requests import Response
from libretime_worker.tasks import extract_filename from libretime_worker.tasks import extract_filename, podcast_download
from .fixtures import fixtures_path
@pytest.mark.parametrize(
"file",
[
("s1-stereo.ogg"),
("s1-stereo-tagged.mp3"),
("malformed.mp3"),
],
)
@pytest.mark.parametrize("override_album", [(True), (False)])
def test_podcast_download(requests_mock, file, override_album):
episode_url = f"https://remote.example.org/{file}"
episode_filepath = fixtures_path / file
requests_mock.get(episode_url, content=episode_filepath.read_bytes())
requests_mock.post("http://localhost/rest/media", json={"id": 1})
result = podcast_download(
episode_id=1,
episode_url=episode_url,
episode_title="My episode",
podcast_name="My podcast!",
override_album=override_album,
)
assert json.loads(result) == {
"episodeid": 1,
"fileid": 1,
"status": 1,
}
def test_podcast_download_invalid_file(requests_mock):
episode_url = "https://remote.example.org/invalid"
requests_mock.get(episode_url, content=b"some invalid content")
requests_mock.post("http://localhost/rest/media", json={"id": 1})
result = podcast_download(
episode_id=1,
episode_url=episode_url,
episode_title="My episode",
podcast_name="My podcast!",
override_album=False,
)
assert json.loads(result) == {
"episodeid": 1,
"status": 0,
"error": "could not determine episode 1 file type",
}
@pytest.mark.parametrize( @pytest.mark.parametrize(