From 1373d4984f5de0fe16cd9babc9b28a382477e8fc Mon Sep 17 00:00:00 2001
From: Albert Santoni
Date: Fri, 30 May 2014 13:02:19 -0400
Subject: [PATCH] CC-5862: Invalid UTF-8 chars cause DB error

* Strip and validate UTF-8 strings in the Media API
* Also properly parse track numbers containing "-"
---
 .../rest/controllers/MediaController.php  | 39 +++++++++++++++++++
 .../airtime_analyzer/metadata_analyzer.py |  8 +++-
 2 files changed, 46 insertions(+), 1 deletion(-)

diff --git a/airtime_mvc/application/modules/rest/controllers/MediaController.php b/airtime_mvc/application/modules/rest/controllers/MediaController.php
index 704ee2052..e10fda28c 100644
--- a/airtime_mvc/application/modules/rest/controllers/MediaController.php
+++ b/airtime_mvc/application/modules/rest/controllers/MediaController.php
@@ -402,10 +402,15 @@ class Rest_MediaController extends Zend_Rest_Controller
                 if ($stringLengthValidator) {
                     $value = substr($value, 0, $stringLengthValidator->getMax());
                 }
+
+                $value = $this->stripInvalidUtf8Characters($value);
             }
         }
 
         if (!$fileForm->isValidPartial($whiteList)) {
+            $errors = $fileForm->getErrors();
+            $messages = $fileForm->getMessages();
+            Logging::error($messages);
             $file->setDbImportStatus(2);
             $file->setDbHidden(true);
             $this->invalidDataResponse();
@@ -526,5 +531,39 @@ class Rest_MediaController extends Zend_Rest_Controller
         }
         return $metadata;
     }
+
+    /**
+     * Replace bytes that do not form acceptable UTF-8 text with "?".
+     *
+     * Two regex passes replace: the control bytes listed in the first
+     * character class; continuation bytes that follow an ASCII byte (the
+     * ASCII byte is replaced along with them); lead bytes 0xC0, 0xC1 and
+     * 0xF0-0xFF; 2- and 3-byte lead bytes followed by the wrong number of
+     * continuation bytes; overlong 3-byte forms and UTF-16 surrogates.
+     * The result is finally re-encoded with mb_convert_encoding().
+     *
+     * @param string $string Value to clean (presumably a metadata string; confirm callers)
+     * @return string The cleaned value
+     */
+    private function stripInvalidUtf8Characters($string)
+    {
+        //Remove invalid UTF-8 characters
+        //reject overly long 2 byte sequences, as well as characters at or above U+10000 and replace with ?
+        $string = preg_replace('/[\x00-\x08\x10\x0B\x0C\x0E-\x19\x7F]'.
+            '|[\x00-\x7F][\x80-\xBF]+'.
+            '|([\xC0\xC1]|[\xF0-\xFF])[\x80-\xBF]*'.
+            '|[\xC2-\xDF]((?![\x80-\xBF])|[\x80-\xBF]{2,})'.
+            '|[\xE0-\xEF](([\x80-\xBF](?![\x80-\xBF]))|(?![\x80-\xBF]{2})|[\x80-\xBF]{3,})/S',
+            '?', $string );
+
+        //reject overly long 3 byte sequences and UTF-16 surrogates and replace with ?
+        $string = preg_replace('/\xE0[\x80-\x9F][\x80-\xBF]'.
+            '|\xED[\xA0-\xBF][\x80-\xBF]/S','?', $string );
+
+        //Do a final UTF-8 to UTF-8 conversion, presumably so that mbstring replaces
+        //any invalid sequences the patterns above did not catch
+        $string = mb_convert_encoding($string, 'UTF-8', 'UTF-8');
+        return $string;
+    }
 }
 
diff --git a/python_apps/airtime_analyzer/airtime_analyzer/metadata_analyzer.py b/python_apps/airtime_analyzer/airtime_analyzer/metadata_analyzer.py
index 59ef4ba7c..ec5889596 100644
--- a/python_apps/airtime_analyzer/airtime_analyzer/metadata_analyzer.py
+++ b/python_apps/airtime_analyzer/airtime_analyzer/metadata_analyzer.py
@@ -83,7 +83,13 @@ class MetadataAnalyzer(Analyzer):
             track_number = audio_file["tracknumber"]
             if isinstance(track_number, list): # Sometimes tracknumber is a list, ugh
                 track_number = track_number[0]
-            track_number_tokens = track_number.split(u'/')
+            # Keep the tokens in a list even without a separator so that [0]
+            # below yields the whole track number, not just its first character.
+            track_number_tokens = [track_number]
+            if u'/' in track_number:
+                track_number_tokens = track_number.split(u'/')
+            elif u'-' in track_number:
+                track_number_tokens = track_number.split(u'-')
             track_number = track_number_tokens[0]
             metadata["track_number"] = track_number
             track_total = track_number_tokens[1]