# sintonia/worker/libretime_worker/tasks.py

import json
import shutil
import tempfile
import traceback
from cgi import parse_header
from contextlib import closing
from pathlib import Path
from typing import Optional
from urllib.parse import urlsplit

import mutagen
import requests
from celery import Celery
from celery.utils.log import get_task_logger
from requests import Response

from .config import config

worker = Celery()
logger = get_task_logger(__name__)


@worker.task(name="podcast-download", acks_late=True)
def podcast_download(
    id: int,
    url: str,
    podcast_name: str,
    album_override: bool,
    track_title: Optional[str],
):
    """
    Download a podcast episode.

    The episode is streamed to a temporary file, its tags are adjusted with
    podcast_override_metadata, and the file is then posted to the media API
    configured in `config.general`. The temporary file is always removed
    before returning.

    Args:
        id: Episode ID.
        url: Episode download url.
        podcast_name: Podcast name to save to the metadata.
        album_override: Whether to override the album metadata.
        track_title: Episode title to override the title metadata.

    Returns:
        Status of the podcast download as JSON string. The object always
        contains "episodeid" and "status" (1 on success, 0 on failure), plus
        "fileid" on success or "error" on failure.
    """
    # Object to store file IDs, episode IDs, and download status
    # (important if there's an error before the file is posted)
    obj = {"episodeid": id}
    # Path of the temporary download, kept so it can be removed in `finally`
    tmp_path = None
    try:
        with closing(requests.get(url, stream=True)) as r:
            # Fail early instead of saving an HTTP error page as audio
            r.raise_for_status()
            filename = extract_filename(r)
            with tempfile.NamedTemporaryFile(mode="wb+", delete=False) as audiofile:
                tmp_path = Path(audiofile.name)
                r.raw.decode_content = True
                shutil.copyfileobj(r.raw, audiofile)
        # The temporary file is closed (and therefore fully flushed) here, so
        # mutagen and the upload below read the complete download from disk.

        # mutagen should be able to guess the right file type
        metadata_audiofile = mutagen.File(str(tmp_path), easy=True)
        # if mutagen couldn't guess what type of file it is, treat it like a
        # mp3 if it has a mp3 file extension (any case) and hope for the best
        if metadata_audiofile is None and filename.lower().endswith("mp3"):
            metadata_audiofile = mutagen.mp3.MP3(
                str(tmp_path), ID3=mutagen.easyid3.EasyID3
            )
        if metadata_audiofile is None:
            raise ValueError(
                f"could not detect the audio file type of '{filename}'"
            )

        # replace track metadata as indicated by album_override setting
        # replace album title as needed
        metadata_audiofile = podcast_override_metadata(
            metadata_audiofile, podcast_name, album_override, track_title
        )
        metadata_audiofile.save()
        filetypeinfo = metadata_audiofile.pprint()
        logger.info(
            "filetypeinfo is {}".format(filetypeinfo.encode("ascii", "ignore"))
        )
        callback_url = f"{config.general.public_url}/rest/media"
        callback_api_key = config.general.api_key

        # Open the file in a context manager so the handle is released once
        # the upload request has completed.
        with open(tmp_path, "rb") as upload_file:
            upload_response = requests.post(
                callback_url,
                files={"file": (filename, upload_file)},
                auth=requests.auth.HTTPBasicAuth(callback_api_key, ""),
            )
        upload_response.raise_for_status()
        try:
            response = upload_response.content.decode()
        except (UnicodeDecodeError, AttributeError):
            response = upload_response.content
        # Read the response from the media API to get the file id
        f = json.loads(response)
        obj["fileid"] = f["id"]
        obj["status"] = 1
    except Exception as e:
        # Task boundary: every failure is reported through the returned JSON
        # status rather than raised. Python 3 exceptions have no `.message`.
        obj["error"] = str(e)
        logger.info(f"Error during file download: {e}")
        # format_exc() formats the exception currently being handled; its
        # first parameter is a frame limit, not an exception.
        logger.debug("Original Traceback: %s" % (traceback.format_exc()))
        obj["status"] = 0
    finally:
        # The file was created with delete=False, so remove it explicitly.
        if tmp_path is not None:
            try:
                tmp_path.unlink()
            except OSError as cleanup_error:
                logger.debug(
                    "Could not remove temporary file %s: %s",
                    tmp_path,
                    cleanup_error,
                )
    return json.dumps(obj)


def podcast_override_metadata(m, podcast_name, override, track_title):
    """
    Override m['album'] if empty or forced with override arg.

    Args:
        m: Tag mapping to update in place; it is indexed with the "album",
            "title" and "artist" keys.
        podcast_name: Podcast name written to the album (and, when
            overriding, artist) tag.
        override: When exactly True, the album and artist tags are replaced
            with the podcast name and the title tag with track_title, even
            if those tags already contain data. Otherwise only a missing
            album tag is filled in.
        track_title: Episode title written to the title tag when overriding.
            When None, the existing title tag is left untouched.

    Returns:
        The same mapping m, after modification.
    """
    # if the album override option is enabled replace the album id3 tag with
    # the podcast name even if the album tag contains data
    if override is True:
        logger.debug(
            "overriding album name to {} in podcast".format(
                podcast_name.encode("ascii", "ignore")
            )
        )
        m["album"] = podcast_name
        # The title is optional: never write None into the tags, keep
        # whatever title the file already carries instead.
        if track_title is not None:
            m["title"] = track_title
        m["artist"] = podcast_name
    else:
        # replace the album id3 tag with the podcast name if the album tag is empty
        try:
            m["album"]
        except KeyError:
            logger.debug(
                "setting new album name to {} in podcast".format(
                    podcast_name.encode("ascii", "ignore")
                )
            )
            m["album"] = podcast_name
    return m


def extract_filename(response: Response) -> str:
    """
    Work out the filename of a downloaded file.

    The `filename` parameter of the `Content-Disposition` header is used
    when the header carries one; otherwise the last component of the
    response url path is returned.

    Args:
        response: Download request response.

    Returns:
        Extracted filename.
    """
    disposition = response.headers.get("Content-Disposition")
    if disposition is not None:
        # parse_header returns (main value, parameters); only the parameters
        # are of interest here.
        header_params = parse_header(disposition)[1]
        try:
            return header_params["filename"]
        except KeyError:
            pass

    # No usable header: fall back to the final segment of the url path.
    url_path = urlsplit(response.url).path
    return Path(url_path).name
Add isort pre-commit hook Sort import statement in python files See https://github.com/PyCQA/isort 2021-06-03 15:20:39 +02:00			`import json`
first part of the adding album tags to podcast downloads 2017-02-13 15:32:07 +01:00			`import shutil`
			`import tempfile`
Fix unicode issues in podcast downloader The podcast downloader fails pretty badly when the podcast name contains non ascii chars. The main fail happens during logging; I have learnt way to much about pythons stupid unicode implementation. This adds addtional debug logging and also outputs the real reason a download fails properly. The content of the tags should be written as UTF-8 or whater is input into it, this commit mainly touches (and fixes) logging. 2017-03-11 21:01:52 +01:00			`import traceback`
refactor(worker): rewrite extract_filename from download 2022-09-09 11:51:16 +02:00			`from cgi import parse_header`
SAAS-1071 - more work on backend podcast implementation 2015-09-24 18:58:02 +02:00			`from contextlib import closing`
refactor(worker): rewrite extract_filename from download 2022-09-09 11:51:16 +02:00			`from pathlib import Path`
refactor(worker): add types and rewrite docstring 2022-09-09 11:58:57 +02:00			`from typing import Optional`
:recycle: (celery) python3 compat fixes 2019-08-18 17:45:48 +02:00			`from urllib.parse import urlsplit`

Add isort pre-commit hook Sort import statement in python files See https://github.com/PyCQA/isort 2021-06-03 15:20:39 +02:00			`import mutagen`
			`import requests`
			`from celery import Celery`
			`from celery.utils.log import get_task_logger`
refactor(worker): rewrite extract_filename from download 2022-09-09 11:51:16 +02:00			`from requests import Response`
SAAS-853 - Celery backend for SoundCloud uploads 2015-06-10 21:04:49 +02:00
feat(worker): load callback details from config (#1994) 2022-07-26 14:18:41 +02:00			`from .config import config`

feat(worker): load config using shared helpers BREAKING CHANGE: The worker `RMQ_CONFIG_FILE` environement variable has been renamed to `LIBRETIME_CONFIG_FILEPATH`. In addition the systemd working directory for the worker has changed from `/srv/airtime` to `/var/lib/libretime/worker`. 2022-02-22 20:03:31 +01:00			`worker = Celery()`
SAAS-853 - Celery backend for SoundCloud uploads 2015-06-10 21:04:49 +02:00			`logger = get_task_logger(__name__)`


feat(worker): load config using shared helpers BREAKING CHANGE: The worker `RMQ_CONFIG_FILE` environement variable has been renamed to `LIBRETIME_CONFIG_FILEPATH`. In addition the systemd working directory for the worker has changed from `/srv/airtime` to `/var/lib/libretime/worker`. 2022-02-22 20:03:31 +01:00			`@worker.task(name="podcast-download", acks_late=True)`
:recycle: (celery) python3 compat fixes 2019-08-18 17:45:48 +02:00			`def podcast_download(`
refactor(worker): add types and rewrite docstring 2022-09-09 11:58:57 +02:00			`id: int,`
			`url: str,`
			`podcast_name: str,`
			`album_override: bool,`
			`track_title: Optional[str],`
:recycle: (celery) python3 compat fixes 2019-08-18 17:45:48 +02:00			`):`
SAAS-1071 - initial work on podcast celery backend; tweak SoundCloud service 2015-09-22 21:26:08 +02:00			`"""`
refactor(worker): add types and rewrite docstring 2022-09-09 11:58:57 +02:00			`Download a podcast episode.`
SAAS-1071 - initial work on podcast celery backend; tweak SoundCloud service 2015-09-22 21:26:08 +02:00
refactor(worker): add types and rewrite docstring 2022-09-09 11:58:57 +02:00			`Args:`
			`id: Episode ID.`
			`url: Episode download url.`
			`podcast_name: Podcast name to save to the metadata.`
			`album_override: Whether to override the album metadata.`
			`track_title: Episode title to override the title metadata.`
Add SoundCloud update and download tasks to Celery backend; requires airtime-celery reinstall 2015-10-30 21:10:16 +01:00
refactor(worker): add types and rewrite docstring 2022-09-09 11:58:57 +02:00			`Returns:`
			`Status of the podcast download as JSON string.`
SAAS-1071 - initial work on podcast celery backend; tweak SoundCloud service 2015-09-22 21:26:08 +02:00			`"""`
* SAAS-1084 - initial work on publishing API backend * More work on automatic ingest * Add automatic_ingest_timestamp column to ImportedPodcast 2015-10-21 01:03:34 +02:00			`# Object to store file IDs, episode IDs, and download status`
			`# (important if there's an error before the file is posted)`
:recycle: (celery) python3 compat fixes 2019-08-18 17:45:48 +02:00			`obj = {"episodeid": id}`
* SAAS-1084 - initial work on publishing API backend * More work on automatic ingest * Add automatic_ingest_timestamp column to ImportedPodcast 2015-10-21 01:03:34 +02:00			`try:`
			`re = None`
			`with closing(requests.get(url, stream=True)) as r:`
refactor(worker): rewrite extract_filename from download 2022-09-09 11:51:16 +02:00			`filename = extract_filename(r)`
:recycle: (celery) python3 compat fixes 2019-08-18 17:45:48 +02:00			`with tempfile.NamedTemporaryFile(mode="wb+", delete=False) as audiofile:`
Adding decode = true to the raw request object fixes issues with certain podcasts and corruption 2017-10-08 00:48:39 +02:00			`r.raw.decode_content = True`
first part of the adding album tags to podcast downloads 2017-02-13 15:32:07 +01:00			`shutil.copyfileobj(r.raw, audiofile)`
modified the code to treat a file mutagen fails to load as a mp3 2018-12-27 23:50:33 +01:00			`# mutagen should be able to guess the write file type`
changed celery podcast download to use mutagen for all supported files vs. mp3 only 2018-12-13 18:36:10 +01:00			`metadata_audiofile = mutagen.File(audiofile.name, easy=True)`
modified the code to treat a file mutagen fails to load as a mp3 2018-12-27 23:50:33 +01:00			`# if for some reason this should fail lets try it as a mp3 specific code`
			`if metadata_audiofile == None:`
made basic sanity checks to only do back up mp3 mutagen import on files with mp3 extension 2018-12-28 00:38:17 +01:00			`# if this happens then mutagen couldn't guess what type of file it is`
			`mp3suffix = ("mp3", "MP3", "Mp3", "mP3")`
			`# so we treat it like a mp3 if it has a mp3 file extension and hope for the best`
			`if filename.endswith(mp3suffix):`
:recycle: (celery) python3 compat fixes 2019-08-18 17:45:48 +02:00			`metadata_audiofile = mutagen.mp3.MP3(`
			`audiofile.name, ID3=mutagen.easyid3.EasyID3`
			`)`
			`# replace track metadata as indicated by album_override setting`
Refactor override preference Properly defaults the preference to be true and always return a boolean value since that is what celery will be expecting. 2017-03-17 02:10:04 +01:00			`# replace album title as needed`
:recycle: (celery) python3 compat fixes 2019-08-18 17:45:48 +02:00			`metadata_audiofile = podcast_override_metadata(`
			`metadata_audiofile, podcast_name, album_override, track_title`
			`)`
changed celery podcast download to use mutagen for all supported files vs. mp3 only 2018-12-13 18:36:10 +01:00			`metadata_audiofile.save()`
			`filetypeinfo = metadata_audiofile.pprint()`
:recycle: (celery) python3 compat fixes 2019-08-18 17:45:48 +02:00			`logger.info(`
chore: add pyupgrade pre-commit hook - add --py3-plus flag to pyupgrade hook - add --py36-plus flag to pyupgrade hook 2022-01-25 23:45:00 +01:00			`"filetypeinfo is {}".format(filetypeinfo.encode("ascii", "ignore"))`
:recycle: (celery) python3 compat fixes 2019-08-18 17:45:48 +02:00			`)`
feat(worker): load callback details from config (#1994) 2022-07-26 14:18:41 +02:00			`callback_url = f"{config.general.public_url}/rest/media"`
			`callback_api_key = config.general.api_key`

:recycle: (celery) python3 compat fixes 2019-08-18 17:45:48 +02:00			`re = requests.post(`
			`callback_url,`
			`files={"file": (filename, open(audiofile.name, "rb"))},`
feat(worker): load callback details from config (#1994) 2022-07-26 14:18:41 +02:00			`auth=requests.auth.HTTPBasicAuth(callback_api_key, ""),`
:recycle: (celery) python3 compat fixes 2019-08-18 17:45:48 +02:00			`)`
* SAAS-1084 - initial work on publishing API backend * More work on automatic ingest * Add automatic_ingest_timestamp column to ImportedPodcast 2015-10-21 01:03:34 +02:00			`re.raise_for_status()`
Ensure all json loads calls use strings 2020-05-04 13:24:57 +02:00			`try:`
			`response = re.content.decode()`
			`except (UnicodeDecodeError, AttributeError):`
			`response = re.content`
:recycle: (celery) python3 compat fixes 2019-08-18 17:45:48 +02:00			`f = json.loads(`
Ensure all json loads calls use strings 2020-05-04 13:24:57 +02:00			`response`
:recycle: (celery) python3 compat fixes 2019-08-18 17:45:48 +02:00			`) # Read the response from the media API to get the file id`
			`obj["fileid"] = f["id"]`
			`obj["status"] = 1`
* SAAS-1084 - initial work on publishing API backend * More work on automatic ingest * Add automatic_ingest_timestamp column to ImportedPodcast 2015-10-21 01:03:34 +02:00			`except Exception as e:`
:recycle: (celery) python3 compat fixes 2019-08-18 17:45:48 +02:00			`obj["error"] = e.message`
chore: add pyupgrade pre-commit hook - add --py3-plus flag to pyupgrade hook - add --py36-plus flag to pyupgrade hook 2022-01-25 23:45:00 +01:00			`logger.info(f"Error during file download: {e}")`
:recycle: (celery) python3 compat fixes 2019-08-18 17:45:48 +02:00			`logger.debug("Original Traceback: %s" % (traceback.format_exc(e)))`
			`obj["status"] = 0`
* SAAS-1084 - initial work on publishing API backend * More work on automatic ingest * Add automatic_ingest_timestamp column to ImportedPodcast 2015-10-21 01:03:34 +02:00			`return json.dumps(obj)`
SAAS-1071 - more work on celery backend for podcasts; add upgrade to make file_id field in third_party_track_references nullable 2015-09-24 21:57:38 +02:00
:recycle: (celery) python3 compat fixes 2019-08-18 17:45:48 +02:00
added track title override for podcasts 2018-12-23 20:54:47 +01:00			`def podcast_override_metadata(m, podcast_name, override, track_title):`
Refactor override preference Properly defaults the preference to be true and always return a boolean value since that is what celery will be expecting. 2017-03-17 02:10:04 +01:00			`"""`
			`Override m['album'] if empty or forced with override arg`
			`"""`
			`# if the album override option is enabled replace the album id3 tag with the podcast name even if the album tag contains data`
			`if override is True:`
:recycle: (celery) python3 compat fixes 2019-08-18 17:45:48 +02:00			`logger.debug(`
chore: add pyupgrade pre-commit hook - add --py3-plus flag to pyupgrade hook - add --py36-plus flag to pyupgrade hook 2022-01-25 23:45:00 +01:00			`"overriding album name to {} in podcast".format(`
:recycle: (celery) python3 compat fixes 2019-08-18 17:45:48 +02:00			`podcast_name.encode("ascii", "ignore")`
			`)`
			`)`
			`m["album"] = podcast_name`
			`m["title"] = track_title`
			`m["artist"] = podcast_name`
Refactor override preference Properly defaults the preference to be true and always return a boolean value since that is what celery will be expecting. 2017-03-17 02:10:04 +01:00			`else:`
			`# replace the album id3 tag with the podcast name if the album tag is empty`
			`try:`
:recycle: (celery) python3 compat fixes 2019-08-18 17:45:48 +02:00			`m["album"]`
Refactor override preference Properly defaults the preference to be true and always return a boolean value since that is what celery will be expecting. 2017-03-17 02:10:04 +01:00			`except KeyError:`
:recycle: (celery) python3 compat fixes 2019-08-18 17:45:48 +02:00			`logger.debug(`
chore: add pyupgrade pre-commit hook - add --py3-plus flag to pyupgrade hook - add --py36-plus flag to pyupgrade hook 2022-01-25 23:45:00 +01:00			`"setting new album name to {} in podcast".format(`
:recycle: (celery) python3 compat fixes 2019-08-18 17:45:48 +02:00			`podcast_name.encode("ascii", "ignore")`
			`)`
			`)`
			`m["album"] = podcast_name`
Refactor override preference Properly defaults the preference to be true and always return a boolean value since that is what celery will be expecting. 2017-03-17 02:10:04 +01:00			`return m`
SAAS-1071 - more work on celery backend for podcasts; add upgrade to make file_id field in third_party_track_references nullable 2015-09-24 21:57:38 +02:00
:recycle: (celery) python3 compat fixes 2019-08-18 17:45:48 +02:00
refactor(worker): rewrite extract_filename from download 2022-09-09 11:51:16 +02:00			`def extract_filename(response: Response) -> str:`
SAAS-1071 - more work on celery backend for podcasts; add upgrade to make file_id field in third_party_track_references nullable 2015-09-24 21:57:38 +02:00			`"""`
refactor(worker): rewrite extract_filename from download 2022-09-09 11:51:16 +02:00			`Extract the filename from a download request.`
SAAS-1071 - more work on celery backend for podcasts; add upgrade to make file_id field in third_party_track_references nullable 2015-09-24 21:57:38 +02:00
refactor(worker): rewrite extract_filename from download 2022-09-09 11:51:16 +02:00			`Args:`
			`response: Download request response.`
Add SoundCloud update and download tasks to Celery backend; requires airtime-celery reinstall 2015-10-30 21:10:16 +01:00
refactor(worker): rewrite extract_filename from download 2022-09-09 11:51:16 +02:00			`Returns:`
			`Extracted filename.`
SAAS-1071 - more work on celery backend for podcasts; add upgrade to make file_id field in third_party_track_references nullable 2015-09-24 21:57:38 +02:00			`"""`
refactor(worker): rewrite extract_filename from download 2022-09-09 11:51:16 +02:00			`if "Content-Disposition" in response.headers:`
			`_, params = parse_header(response.headers["Content-Disposition"])`
			`if "filename" in params:`
			`return params["filename"]`

			`return Path(urlsplit(response.url).path).name`