CC-5709: Airtime Analyzer
* Remove awful StoredFile::uploadFile() function
* Massive airtime_analyzer commenting and cleanup
* Cleaned up the upload code
* Temporarily disabled the liquidsoap playability test.
This commit is contained in:
parent cb62850558
commit 95b369c54d
13 changed files with 204 additions and 191 deletions
@@ -1,5 +1,14 @@
airtime_analyzer
==========

Ghetto temporary installation instructions

airtime_analyzer is a daemon that processes Airtime file uploads as background jobs.
It performs metadata extraction using Mutagen and moves uploads into Airtime's
music library directory (stor/imported).

airtime_analyzer uses process isolation to make it resilient to crashes, and runs in
a multi-tenant environment with no modifications.

Installation
==========

$ sudo python setup.py install

@@ -71,3 +80,9 @@ a test, run:
To run the unit tests and generate a code coverage report, run:

$ nosetests --with-coverage --cover-package=airtime_analyzer


History and Design Motivation
===========
@@ -1,3 +1,5 @@
"""Contains the main application class for airtime_analyzer.
"""
import ConfigParser
import logging
import logging.handlers

@@ -8,6 +10,8 @@ from message_listener import MessageListener


class AirtimeAnalyzerServer:
    """A server for importing uploads to Airtime as background jobs.
    """

    # Constants
    _LOG_PATH = "/var/log/airtime/airtime_analyzer.log"

@@ -29,7 +33,12 @@ class AirtimeAnalyzerServer:


    def setup_logging(self, debug):
        """Set up nicely formatted logging and log rotation.

        Keyword arguments:
            debug -- a boolean indicating whether to enable super verbose logging
                     to the screen and disk.
        """
        if debug:
            self._log_level = logging.DEBUG
        else:

@@ -37,8 +46,6 @@ class AirtimeAnalyzerServer:
        pika_logger = logging.getLogger('pika')
        pika_logger.setLevel(logging.CRITICAL)

        #self.log = logging.getLogger(__name__)

        # Set up logging
        logFormatter = logging.Formatter("%(asctime)s [%(module)s] [%(levelname)-5.5s] %(message)s")
        rootLogger = logging.getLogger()

@@ -55,6 +62,7 @@ class AirtimeAnalyzerServer:


    def read_config_file(self, config_path):
        """Parse the application's config file located at config_path."""
        config = ConfigParser.SafeConfigParser()
        try:
            config.readfp(open(config_path))

@@ -66,12 +74,4 @@ class AirtimeAnalyzerServer:
            exit(-1)

        return config


''' When being run from the command line, analyze a file passed
    as an argument. '''
if __name__ == "__main__":
    import sys
    analyzers = AnalyzerPipeline()
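setup_logging's docstring promises log rotation, but the hunk above cuts off before the handlers are attached. A minimal sketch of the usual RotatingFileHandler wiring that fits the formatter and _LOG_PATH shown above; the size and backup-count values are illustrative assumptions, not taken from this commit:

import logging
import logging.handlers

logFormatter = logging.Formatter("%(asctime)s [%(module)s] [%(levelname)-5.5s] %(message)s")
rootLogger = logging.getLogger()
rootLogger.setLevel(logging.INFO)

# Rotate the log once it exceeds maxBytes, keeping a fixed number of backups.
fileHandler = logging.handlers.RotatingFileHandler(
        filename="/var/log/airtime/airtime_analyzer.log",
        maxBytes=10 * 1024 * 1024,  # assumed size limit (10 MB)
        backupCount=5)              # assumed number of rotated backups
fileHandler.setFormatter(logFormatter)
rootLogger.addHandler(fileHandler)

# Also echo everything to the console.
consoleHandler = logging.StreamHandler()
consoleHandler.setFormatter(logFormatter)
rootLogger.addHandler(consoleHandler)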
@@ -1,12 +1,13 @@

class Analyzer:
    """ Abstract base class for all "analyzers".
    """
    @staticmethod
    def analyze(filename, metadata):
        raise NotImplementedError

'''
class AnalyzerError(Exception):
class AnalyzerError(Error):
    def __init__(self):
        super.__init__(self)
'''
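Analyzer only pins down the contract: a static analyze(filename, metadata) that mutates and returns the shared metadata dictionary. A minimal sketch of a hypothetical subclass (the class name and the field it sets are invented for illustration):

import os

from analyzer import Analyzer

class FileSizeAnalyzer(Analyzer):
    """Hypothetical analyzer that records the file's size on disk."""
    @staticmethod
    def analyze(filename, metadata):
        # Follow the pipeline convention: add our result to the shared
        # metadata dictionary and return it.
        metadata["filesize"] = os.path.getsize(filename)
        return metadata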
@@ -1,14 +1,35 @@
""" Analyzes and imports an audio file into the Airtime library.
"""
import logging
import multiprocessing
from metadata_analyzer import MetadataAnalyzer
from filemover_analyzer import FileMoverAnalyzer

class AnalyzerPipeline:

    # Take message dictionary and perform the necessary analysis.
    """ Analyzes and imports an audio file into the Airtime library.

    This currently performs metadata extraction (eg. gets the ID3 tags from an MP3),
    then moves the file to the Airtime music library (stor/imported), and returns
    the results back to the parent process. This class is used in an isolated process
    so that if it crashes, it does not kill the entire airtime_analyzer daemon and
    the failure to import can be reported back to the web application.
    """

    @staticmethod
    def run_analysis(queue, audio_file_path, import_directory, original_filename):
        """Analyze and import an audio file, and put all extracted metadata into queue.

        Keyword arguments:
            queue: A multiprocessing.queues.Queue which will be used to pass the
                   extracted metadata back to the parent process.
            audio_file_path: Path on disk to the audio file to analyze.
            import_directory: Path to the final Airtime "import" directory where
                              we will move the file.
            original_filename: The original filename of the file, which we'll try to
                               preserve. The file at audio_file_path typically has a
                               temporary, randomly generated name, which is why we want
                               to know what the original name was.
        """
        if not isinstance(queue, multiprocessing.queues.Queue):
            raise TypeError("queue must be a multiprocessing.Queue()")
        if not isinstance(audio_file_path, unicode):

@@ -18,8 +39,6 @@ class AnalyzerPipeline:
        if not isinstance(original_filename, unicode):
            raise TypeError("original_filename must be unicode. Was of type " + type(original_filename).__name__ + " instead.")

        #print ReplayGainAnalyzer.analyze("foo.mp3")

        # Analyze the audio file we were told to analyze:
        # First, we extract the ID3 tags and other metadata:
        metadata = dict()

@@ -30,8 +49,8 @@ class AnalyzerPipeline:
        # Note that the queue we're putting the results into is our interprocess communication
        # back to the main process.

        #Pass all the file metadata back to the main analyzer process, which then passes
        #it back to the Airtime web application.
        # Pass all the file metadata back to the main analyzer process, which then passes
        # it back to the Airtime web application.
        queue.put(metadata)
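Since run_analysis is meant to be the target of an isolated child process, the parent's side of the contract looks roughly like the sketch below; the file paths are hypothetical, and the unicode literals match the type checks above (this is Python 2 code):

import multiprocessing
from analyzer_pipeline import AnalyzerPipeline

q = multiprocessing.Queue()
p = multiprocessing.Process(target=AnalyzerPipeline.run_analysis,
        args=(q,
              u"/tmp/plupload/x1y2z3.mp3",    # hypothetical temporary upload path
              u"/srv/airtime/stor/imported",  # hypothetical import directory
              u"my_song.mp3"))                # original filename to preserve
p.start()
metadata = q.get()  # blocks until the child puts its extracted metadata on the queue
p.join()            # a nonzero p.exitcode here means the child crashed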
@@ -9,13 +9,26 @@ import uuid
from analyzer import Analyzer

class FileMoverAnalyzer(Analyzer):
    """This analyzer copies a file over from a temporary directory (stor/organize)
    into the Airtime library (stor/imported).
    """
    @staticmethod
    def analyze(audio_file_path, metadata):
        """Dummy method because we need more info than analyze() gets passed to it"""
        raise Exception("Use FileMoverAnalyzer.move() instead.")

    @staticmethod
    def move(audio_file_path, import_directory, original_filename, metadata):
        """Move the file at audio_file_path into import_directory/import,
        renaming it to original_filename.

        Keyword arguments:
            audio_file_path: Path to the file to be imported.
            import_directory: Path to the "import" directory inside the Airtime stor directory.
                              (eg. /srv/airtime/stor/import)
            original_filename: The filename of the file when it was uploaded to Airtime.
            metadata: A dictionary to which the "full_path" of the file's new location will be added.
        """
        if not isinstance(audio_file_path, unicode):
            raise TypeError("audio_file_path must be unicode. Was of type " + type(audio_file_path).__name__)
        if not isinstance(import_directory, unicode):

@@ -68,6 +81,7 @@ class FileMoverAnalyzer(Analyzer):
        return metadata

def mkdir_p(path):
    """ Make all directories in a tree (like mkdir -p)"""
    if path == "":
        return
    try:
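The hunk cuts off inside mkdir_p's try block. The docstring names the standard mkdir -p idiom, which in Python 2 is conventionally written as below; this is a sketch of that idiom, assuming the truncated body follows it:

import errno
import os

def mkdir_p(path):
    """ Make all directories in a tree (like mkdir -p)"""
    if path == "":
        return
    try:
        os.makedirs(path)
    except OSError as exc:
        # Treat "already exists" as success so the call is idempotent,
        # just like mkdir -p; re-raise anything else.
        if exc.errno == errno.EEXIST and os.path.isdir(path):
            pass
        else:
            raise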
@@ -9,19 +9,56 @@ from status_reporter import StatusReporter

EXCHANGE = "airtime-uploads"
EXCHANGE_TYPE = "topic"
ROUTING_KEY = "" #"airtime.analyzer.tasks"
ROUTING_KEY = ""
QUEUE = "airtime-uploads"


''' TODO: Document me
    - round robin messaging
    - acking
    - why we use the multiprocess architecture
    - in general, how it works and why it works this way
'''
""" A message listener class that waits for messages from Airtime through RabbitMQ
notifying us about new uploads.

This is probably the most important class in this application. It connects
to RabbitMQ (or an AMQP server) and listens for messages that notify us
when a user uploads a new file to Airtime, either through the web interface
or via FTP (on Airtime Pro). When we get a notification, we spawn a child
process that extracts the uploaded audio file's metadata and moves it into
Airtime's music library directory. Lastly, the extracted metadata is
reported back to the Airtime web application.

There are a couple of Very Important technical details and constraints that you
need to know if you're going to work on this code:

1) airtime_analyzer is designed so it doesn't have to run on the same
   computer as the web server. It just needs access to your Airtime library
   folder (stor).
2) airtime_analyzer is multi-tenant - one process can be used for many
   Airtime instances. It's designed NOT to know whether it's running
   in a single-tenant or multi-tenant environment. All the information it
   needs to import a file into an Airtime instance is passed in via those
   RabbitMQ messages.
3) We're using a "topic exchange" for the new upload notification RabbitMQ
   messages. This means if we run several airtime_analyzer processes on
   different computers, RabbitMQ will do round-robin dispatching of the
   file notifications. This is cheap, easy load balancing and
   redundancy for us. You can even run multiple airtime_analyzer processes
   on one machine if you want.
4) We run the actual work (metadata analysis and file moving) in a separate
   child process so that if it crashes, we can stop RabbitMQ from resending
   the file notification message to another airtime_analyzer process (NACK),
   which would otherwise cause cascading failure. We also do this so that we
   can report the problem file to the Airtime web interface ("import failed").

So that is a quick overview of the design constraints for this application, and
why airtime_analyzer is written this way.
"""
class MessageListener:

    def __init__(self, config):
        ''' Start listening for file upload notification messages
        from RabbitMQ.

        Keyword arguments:
            config: A ConfigParser object containing the [rabbitmq] configuration.
        '''

        # Read the RabbitMQ connection settings from the config file.
        # The exceptions thrown here by default give good error messages.
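The round-robin dispatch described in point 3 of the docstring falls out of every analyzer consuming from one shared, named queue bound to the topic exchange. A sketch of that wiring, using the pre-1.0 pika API that this code appears to target (keyword names such as type= changed in later pika releases):

import pika

connection = pika.BlockingConnection(pika.ConnectionParameters(host="localhost"))
channel = connection.channel()

# One exchange and one shared, named queue: RabbitMQ hands each message to
# exactly one of the consumers bound to the queue, round-robin.
channel.exchange_declare(exchange=EXCHANGE, type=EXCHANGE_TYPE)
channel.queue_declare(queue=QUEUE)
channel.queue_bind(exchange=EXCHANGE, queue=QUEUE, routing_key=ROUTING_KEY)

# no_ack=False forces an explicit basic_ack/basic_nack per message, which is
# what lets a crashed import be NACKed without requeueing (point 4).
channel.basic_consume(MessageListener.msg_received_callback,
                      queue=QUEUE, no_ack=False)
channel.start_consuming()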
@@ -49,7 +86,7 @@ class MessageListener:


    def connect_to_messaging_server(self):
        '''Connect to the RabbitMQ server and start listening for messages.'''
        self._connection = pika.BlockingConnection(pika.ConnectionParameters(host=self._host,
            port=self._port, virtual_host=self._vhost,
            credentials=pika.credentials.PlainCredentials(self._username, self._password)))

@@ -64,15 +101,21 @@ class MessageListener:
            queue=QUEUE, no_ack=False)

    def wait_for_messages(self):
        '''Wait until we've received a RabbitMQ message.'''
        self._channel.start_consuming()

    def disconnect_from_messaging_server(self):
        '''Stop consuming RabbitMQ messages and disconnect'''
        self._channel.stop_consuming()


    # consume callback function
    @staticmethod
    def msg_received_callback(channel, method_frame, header_frame, body):
        ''' A callback method that runs when a RabbitMQ message is received.

        Here we parse the message, spin up an analyzer process, and report the
        metadata back to the Airtime web application (or report an error).
        '''
        logging.info(" - Received '%s' on routing_key '%s'" % (body, method_frame.routing_key))

        #Declare all variables here so they exist in the exception handlers below, no matter what.
@@ -83,11 +126,12 @@ class MessageListener:
        callback_url = ""
        api_key = ""

        # Spin up a worker process. We use the multiprocessing module and multiprocessing.Queue
        # to pass objects between the processes so that if the analyzer process crashes, it does not
        # take down the rest of the daemon and we NACK that message so that it doesn't get
        # propagated to other airtime_analyzer daemons (eg. running on other servers).
        # We avoid cascading failure this way.
        ''' Spin up a worker process. We use the multiprocessing module and multiprocessing.Queue
            to pass objects between the processes so that if the analyzer process crashes, it does not
            take down the rest of the daemon and we NACK that message so that it doesn't get
            propagated to other airtime_analyzer daemons (eg. running on other servers).
            We avoid cascading failure this way.
        '''
        try:
            msg_dict = json.loads(body)
            audio_file_path = msg_dict["tmp_file_path"]

@@ -109,13 +153,14 @@ class MessageListener:

        except Exception as e:
            logging.exception(e)
            #If ANY exception happens while processing a file, we're going to NACK to the
            #messaging server and tell it to remove the message from the queue.
            #(NACK is a negative acknowledgement. We could use ACK instead, but this might come
            # in handy in the future.)
            #Exceptions in this context are unexpected, unhandled errors. We try to recover
            #from as many errors as possible in AnalyzerPipeline, but we're safeguarding ourselves
            #here from any catastrophic or genuinely unexpected errors:
            ''' If ANY exception happens while processing a file, we're going to NACK to the
                messaging server and tell it to remove the message from the queue.
                (NACK is a negative acknowledgement. We could use ACK instead, but this might come
                in handy in the future.)
                Exceptions in this context are unexpected, unhandled errors. We try to recover
                from as many errors as possible in AnalyzerPipeline, but we're safeguarding ourselves
                here from any catastrophic or genuinely unexpected errors:
            '''
            channel.basic_nack(delivery_tag=method_frame.delivery_tag, multiple=False,
                               requeue=False) # Important that it doesn't requeue the message
@@ -136,7 +181,7 @@ class MessageListener:

    @staticmethod
    def spawn_analyzer_process(audio_file_path, import_directory, original_filename):
        ''' Spawn a child process to analyze and import a new audio file. '''
        q = multiprocessing.Queue()
        p = multiprocessing.Process(target=AnalyzerPipeline.run_analysis,
                args=(q, audio_file_path, import_directory, original_filename))
@@ -8,6 +8,12 @@ class MetadataAnalyzer(Analyzer):

    @staticmethod
    def analyze(filename, metadata):
        ''' Extract audio metadata from tags embedded in the file (eg. ID3 tags)

        Keyword arguments:
            filename: The path to the audio file to extract metadata from.
            metadata: A dictionary that the extracted metadata will be added to.
        '''
        if not isinstance(filename, unicode):
            raise TypeError("filename must be unicode. Was of type " + type(filename).__name__)
        if not isinstance(metadata, dict):

@@ -25,7 +31,6 @@ class MetadataAnalyzer(Analyzer):
        track_length = datetime.timedelta(seconds=info.length)
        metadata["length"] = str(track_length) #time.strftime("%H:%M:%S.%f", track_length)
        metadata["bit_rate"] = info.bitrate
        #metadata["channels"] = info.channels

        #Use the python-magic module to get the MIME type.
        mime_magic = magic.Magic(mime=True)
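For context on the two libraries this analyzer leans on: Mutagen parses the tags and stream info, and python-magic sniffs the MIME type from the file's leading bytes. A self-contained sketch of both (the path is hypothetical; attributes like info.bitrate exist for formats such as MP3):

import datetime
import magic
import mutagen

filename = u"/tmp/example.mp3"  # hypothetical path

# Mutagen guesses the container/codec and exposes stream info plus tags.
# Note: mutagen.File() returns None for unrecognized files.
audio_file = mutagen.File(filename, easy=True)
info = audio_file.info
track_length = datetime.timedelta(seconds=info.length)
print str(track_length)   # eg. 0:03:41.123000
print info.bitrate        # in bits per second, for formats like MP3

# python-magic wraps libmagic; mime=True asks for a MIME type string.
mime_magic = magic.Magic(mime=True)
print mime_magic.from_file(filename)  # eg. audio/mpeg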
@@ -1,6 +1,8 @@
from analyzer import Analyzer

''' TODO: everything '''
''' TODO: ReplayGain is currently calculated by pypo but it should
    be done here in the analyzer.
'''
class ReplayGainAnalyzer(Analyzer):

    def __init__(self):
@@ -3,15 +3,18 @@ import json
import logging

class StatusReporter():

    ''' Reports the extracted audio file metadata and job status back to the
        Airtime web application.
    '''
    _HTTP_REQUEST_TIMEOUT = 30

    # Report the extracted metadata and status of the successfully imported file
    # to the callback URL (which should be the Airtime File Upload API)
    @classmethod
    def report_success_to_callback_url(self, callback_url, api_key, audio_metadata):

        # encode the audio metadata as json and post it back to the callback_url
        ''' Report the extracted metadata and status of the successfully imported file
            to the callback URL (which should be the Airtime File Upload API)
        '''

        # Encode the audio metadata as JSON and post it back to the callback_url
        put_payload = json.dumps(audio_metadata)
        logging.debug("sending http put with payload: " + put_payload)
        r = requests.put(callback_url, data=put_payload,

@@ -20,13 +23,14 @@ class StatusReporter():
        logging.debug("HTTP request returned status: " + str(r.status_code))
        logging.debug(r.text) # log the response body

        #todo: queue up failed requests and try them again later.
        r.raise_for_status() # raise an exception if there was an http error code returned
        #TODO: queue up failed requests and try them again later.
        r.raise_for_status() # Raise an exception if there was an HTTP error code returned

    @classmethod
    def report_failure_to_callback_url(self, callback_url, api_key, import_status, reason):
        # TODO: Make sure import_status is an int?

        if not isinstance(import_status, (int, long)):
            raise TypeError("import_status must be an integer. Was of type " + type(import_status).__name__)

        logging.debug("Reporting import failure to Airtime REST API...")
        audio_metadata = dict()
        audio_metadata["import_status"] = import_status
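The requests.put() call above is cut off by the hunk boundary. Given the api_key argument and the _HTTP_REQUEST_TIMEOUT constant, the remaining keyword arguments plausibly look like the sketch below; the basic-auth scheme is an assumption, not something this diff confirms:

import json
import requests

put_payload = json.dumps(audio_metadata)
r = requests.put(callback_url, data=put_payload,
                 # Assumption: the API key is passed as the basic-auth
                 # username, a common pattern for key-authenticated APIs.
                 auth=requests.auth.HTTPBasicAuth(api_key, ''),
                 timeout=StatusReporter._HTTP_REQUEST_TIMEOUT)
r.raise_for_status()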
@@ -1,3 +1,5 @@
"""Runs the airtime_analyzer application.
"""
#!/usr/bin/env python

import daemon

@@ -8,19 +10,37 @@ import airtime_analyzer.airtime_analyzer as aa
VERSION = "1.0"
DEFAULT_CONFIG_PATH = '/etc/airtime/airtime.conf'

print "Airtime Analyzer " + VERSION
def run():
    '''Entry-point for this application'''
    print "Airtime Analyzer " + VERSION
    parser = argparse.ArgumentParser()
    parser.add_argument("-d", "--daemon", help="run as a daemon", action="store_true")
    parser.add_argument("--debug", help="log full debugging output", action="store_true")
    parser.add_argument("--rmq-config-file", help="specify a configuration file with RabbitMQ settings (default is /etc/airtime/airtime.conf)")
    args = parser.parse_args()

    check_if_media_monitor_is_running()

    #Default config file path
    config_path = DEFAULT_CONFIG_PATH
    if args.rmq_config_file:
        config_path = args.rmq_config_file

    if args.daemon:
        with daemon.DaemonContext():
            aa.AirtimeAnalyzerServer(config_path=config_path, debug=args.debug)
    else:
        # Run without daemonizing
        aa.AirtimeAnalyzerServer(config_path=config_path, debug=args.debug)

parser = argparse.ArgumentParser()
parser.add_argument("-d", "--daemon", help="run as a daemon", action="store_true")
parser.add_argument("--debug", help="log full debugging output", action="store_true")
parser.add_argument("--rmq-config-file", help="specify a configuration file with RabbitMQ settings (default is /etc/airtime/airtime.conf)")
args = parser.parse_args()

'''Ensure media_monitor isn't running before we start, because it'll move newly uploaded
   files into the library on us and screw up the operation of airtime_analyzer.
   media_monitor is deprecated.
'''
def check_if_media_monitor_is_running():
    """Ensure media_monitor isn't running before we start.

    We do this because media_monitor will move newly uploaded
    files into the library on us and screw up the operation of airtime_analyzer.
    media_monitor is deprecated.
    """
    pids = [pid for pid in os.listdir('/proc') if pid.isdigit()]

    for pid in pids:

@@ -33,17 +53,6 @@ def check_if_media_monitor_is_running():
        except IOError: # proc has already terminated
            continue

check_if_media_monitor_is_running()
run()

#Default config file path
config_path = DEFAULT_CONFIG_PATH
if args.rmq_config_file:
    config_path = args.rmq_config_file

if args.daemon:
    with daemon.DaemonContext():
        analyzer = aa.AirtimeAnalyzerServer(config_path=config_path, debug=args.debug)
else:
    # Run without daemonizing
    analyzer = aa.AirtimeAnalyzerServer(config_path=config_path, debug=args.debug)
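For reference, the flags defined by the argparse block above translate into invocations like these (the airtime_analyzer executable name is assumed from the package's setup.py entry point):

$ airtime_analyzer                 # run in the foreground
$ airtime_analyzer --debug         # foreground with full debug logging
$ airtime_analyzer -d              # daemonize via python-daemon
$ airtime_analyzer --rmq-config-file /etc/airtime/airtime.conf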