Bug 1300812 - Make Mozharness downloads handle better intermittent S3/EC2 issues draft
authorArmen Zambrano Gasparnian <armenzg@mozilla.com>
Mon, 12 Sep 2016 11:41:04 -0400
changeset 413074 a6d2ff10d19641a83a07776e5cec34f4a07fdc9d
parent 412625 1851b78b5a9673ee422f189b92e5f1e86b82a01c
child 531117 6d46e5ec7d7548df6cc54b7e29f0a834f041f227
push id29315
push userarmenzg@mozilla.com
push dateTue, 13 Sep 2016 13:13:24 +0000
bugs1300812
milestone51.0a1
Bug 1300812 - Make Mozharness downloads handle better intermittent S3/EC2 issues We believe we're getting incomplete bytes when fetching files from S3 to our EC2 instances. This patch should help give us more information and retry few times before failing. MozReview-Commit-ID: 7tUzZmS8Zph
testing/mozharness/mozharness/base/script.py
--- a/testing/mozharness/mozharness/base/script.py
+++ b/testing/mozharness/mozharness/base/script.py
@@ -45,24 +45,28 @@ if os.name == 'nt':
         PYWIN32 = False
 
 try:
     import simplejson as json
     assert json
 except ImportError:
     import json
 
-from cStringIO import StringIO
+from io import BytesIO
 
 from mozprocess import ProcessHandler
 from mozharness.base.config import BaseConfig
 from mozharness.base.log import SimpleFileLogger, MultiFileLogger, \
     LogMixin, OutputParser, DEBUG, INFO, ERROR, FATAL
 
 
+class FetchedIncorrectFilesize(Exception):
+    pass
+
+
 def platform_name():
     pm = PlatformMixin()
 
     if pm._is_linux() and pm._is_64_bit():
         return 'linux64'
     elif pm._is_linux() and not pm._is_64_bit():
         return 'linux'
     elif pm._is_darwin():
@@ -336,16 +340,81 @@ class ScriptMixin(PlatformMixin):
 
         .. _urllib2.urlopen:
         https://docs.python.org/2/library/urllib2.html#urllib2.urlopen
         """
         # http://bugs.python.org/issue13359 - urllib2 does not automatically quote the URL
         url_quoted = urllib2.quote(url, safe='%/:=&?~#+!$,;\'@()*[]|')
         return urllib2.urlopen(url_quoted, **kwargs)
 
+
+
+    def fetch_url_into_memory(self, url):
+        ''' Downloads a file from a url into memory instead of disk.
+
+        Args:
+            url (str): URL path where the file to be downloaded is located.
+
+        Raises:
+            IOError: When the url points to a file on disk and cannot be found
+            FetchedIncorrectFilesize: When the size of the fetched file does not match the
+                                      expected file size.
+            ValueError: When the scheme of a url is not what is expected.
+
+        Returns:
+            BytesIO: contents of url
+        '''
+        self.info('Fetch {} into memory'.format(url))
+        parsed_url = urlparse.urlparse(url)
+
+        if parsed_url.scheme in ('', 'file'):
+            if not os.path.isfile(url):
+                raise IOError('Could not find file to extract: {}'.format(url))
+
+            expected_file_size = os.stat(url.replace('file://', '')).st_size
+
+            # In case we're referrencing a file without file://
+            if parsed_url.scheme == '':
+                url = 'file://%s' % os.path.abspath(url)
+                parsed_url = urlparse.urlparse(url)
+
+        request = urllib2.Request(url)
+        # Exceptions to be retried:
+        # Bug 1300663 - HTTPError: HTTP Error 404: Not Found
+        # Bug 1300413 - HTTPError: HTTP Error 500: Internal Server Error
+        # Bug 1300943 - HTTPError: HTTP Error 503: Service Unavailable
+        # Bug 1300953 - URLError: <urlopen error [Errno -2] Name or service not known>
+        # Bug 1301594 - URLError: <urlopen error [Errno 10054] An existing connection was ...
+        # Bug 1301597 - URLError: <urlopen error [Errno 8] _ssl.c:504: EOF occurred in ...
+        # Bug 1301855 - URLError: <urlopen error [Errno 60] Operation timed out>
+        # Bug 1302237 - URLError: <urlopen error [Errno 104] Connection reset by peer>
+        # Bug 1301807 - BadStatusLine: ''
+        response = urllib2.urlopen(request)
+
+        if parsed_url.scheme in ('http', 'https'):
+            expected_file_size = int(response.headers.get('Content-Length'))
+
+        self.info('Expected file size: {}'.format(expected_file_size))
+        self.debug('Url: {}'.format(url))
+        self.debug('Content-Encoding {}'.format(response.headers.get('Content-Encoding')))
+
+        file_contents = response.read()
+        obtained_file_size = len(file_contents)
+
+        if obtained_file_size != expected_file_size:
+            raise FetchedIncorrectFilesize(
+                'The expected file size is {} while we got instead {}'.format(
+                    expected_file_size, obtained_file_size)
+            )
+
+        # Use BytesIO instead of StringIO
+        # http://stackoverflow.com/questions/34162017/unzip-buffer-with-python/34162395#34162395
+        return BytesIO(file_contents)
+
+
     def _download_file(self, url, file_name):
         """ Helper script for download_file()
         Additionaly this function logs all exceptions as warnings before
         re-raising them
 
         Args:
             url (str): string containing the URL with the file location
             file_name (str): name of the file where the downloaded file
@@ -465,157 +534,157 @@ class ScriptMixin(PlatformMixin):
         """Filter entries of the archive based on the specified list of to extract dirs."""
         filter_partial = functools.partial(fnmatch.filter, namelist)
         entries = itertools.chain(*map(filter_partial, extract_dirs or ['*']))
 
         for entry in entries:
             yield entry
 
 
-    def unzip(self, file_object, extract_to='.', extract_dirs='*', verbose=False):
+    def unzip(self, compressed_file, extract_to, extract_dirs='*', verbose=False):
         """This method allows to extract a zip file without writing to disk first.
 
         Args:
-            file_object (object): Any file like object that is seekable.
-            extract_to (str, optional): where to extract the compressed file.
+            compressed_file (object): File-like object with the contents of a compressed zip file.
+            extract_to (str): where to extract the compressed file.
             extract_dirs (list, optional): directories inside the archive file to extract.
                                            Defaults to '*'.
+            verbose (bool, optional): whether or not extracted content should be displayed.
+                                      Defaults to False.
+
+        Raises:
+            zipfile.BadZipFile: on contents of zipfile being invalid
         """
-        compressed_file = StringIO(file_object.read())
-        try:
-            with zipfile.ZipFile(compressed_file) as bundle:
-                entries = self._filter_entries(bundle.namelist(), extract_dirs)
+        with zipfile.ZipFile(compressed_file) as bundle:
+            entries = self._filter_entries(bundle.namelist(), extract_dirs)
 
-                for entry in entries:
-                    if verbose:
-                        self.info(' {}'.format(entry))
-                    bundle.extract(entry, path=extract_to)
+            for entry in entries:
+                if verbose:
+                    self.info(' {}'.format(entry))
 
-                    # ZipFile doesn't preserve permissions during extraction:
-                    # http://bugs.python.org/issue15795
-                    fname = os.path.realpath(os.path.join(extract_to, entry))
+                # Exception to be retried:
+                # Bug 1301645 - BadZipfile: Bad CRC-32 for file ...
+                #    http://stackoverflow.com/questions/5624669/strange-badzipfile-bad-crc-32-problem/5626098#5626098
+                # Bug 1301802 - error: Error -3 while decompressing: invalid stored block lengths
+                bundle.extract(entry, path=extract_to)
+
+                # ZipFile doesn't preserve permissions during extraction:
+                # http://bugs.python.org/issue15795
+                fname = os.path.realpath(os.path.join(extract_to, entry))
+                try:
+                    # getinfo() can raise KeyError
                     mode = bundle.getinfo(entry).external_attr >> 16 & 0x1FF
                     # Only set permissions if attributes are available. Otherwise all
                     # permissions will be removed eg. on Windows.
                     if mode:
                         os.chmod(fname, mode)
 
-        except zipfile.BadZipfile as e:
-            self.exception('{}'.format(e.message))
+                except KeyError:
+                    self.warning('{} was not found in the zip file'.format(entry))
 
 
-    def deflate(self, file_object, mode, extract_to='.', extract_dirs='*', verbose=False):
-        """This method allows to extract a tar, tar.bz2 and tar.gz file without writing to disk first.
+    def deflate(self, compressed_file, mode, extract_to='.', *args, **kwargs):
+        """This method allows to extract a compressed file from a tar, tar.bz2 and tar.gz files.
 
         Args:
-            file_object (object): Any file like object that is seekable.
+            compressed_file (object): File-like object with the contents of a compressed file.
+            mode (str): string of the form 'filemode[:compression]' (e.g. 'r:gz' or 'r:bz2')
             extract_to (str, optional): where to extract the compressed file.
-            extract_dirs (list, optional): directories inside the archive file to extract.
-                                           Defaults to `*`.
-            verbose (bool, optional): whether or not extracted content should be displayed.
-                                      Defaults to False.
         """
-        compressed_file = StringIO(file_object.read())
         t = tarfile.open(fileobj=compressed_file, mode=mode)
         t.extractall(path=extract_to)
 
 
     def download_unpack(self, url, extract_to='.', extract_dirs='*', verbose=False):
         """Generic method to download and extract a compressed file without writing it to disk first.
 
         Args:
             url (str): URL where the file to be downloaded is located.
             extract_to (str, optional): directory where the downloaded file will
                                         be extracted to.
             extract_dirs (list, optional): directories inside the archive to extract.
                                            Defaults to `*`. It currently only applies to zip files.
-
-        Raises:
-            IOError: on `filename` file not found.
+            verbose (bool, optional): whether or not extracted content should be displayed.
+                                      Defaults to False.
 
         """
-        # Many scripts overwrite this method and set extract_dirs to None
-        extract_dirs = '*' if extract_dirs is None else extract_dirs
-        EXTENSION_TO_MIMETYPE = {
-            'bz2': 'application/x-bzip2',
-            'gz':  'application/x-gzip',
-            'tar': 'application/x-tar',
-            'zip': 'application/zip',
-        }
-        MIMETYPES = {
-            'application/x-bzip2': {
-                'function': self.deflate,
-                'kwargs': {'mode': 'r:bz2'},
-            },
-            'application/x-gzip': {
-                'function': self.deflate,
-                'kwargs': {'mode': 'r:gz'},
-            },
-            'application/x-tar': {
-                'function': self.deflate,
-                'kwargs': {'mode': 'r'},
-            },
-            'application/zip': {
-                'function': self.unzip,
-            },
-        }
+        def _determine_extraction_method_and_kwargs(url):
+            EXTENSION_TO_MIMETYPE = {
+                'bz2': 'application/x-bzip2',
+                'gz':  'application/x-gzip',
+                'tar': 'application/x-tar',
+                'zip': 'application/zip',
+            }
+            MIMETYPES = {
+                'application/x-bzip2': {
+                    'function': self.deflate,
+                    'kwargs': {'mode': 'r:bz2'},
+                },
+                'application/x-gzip': {
+                    'function': self.deflate,
+                    'kwargs': {'mode': 'r:gz'},
+                },
+                'application/x-tar': {
+                    'function': self.deflate,
+                    'kwargs': {'mode': 'r'},
+                },
+                'application/zip': {
+                    'function': self.unzip,
+                },
+            }
 
-        parsed_url = urlparse.urlparse(url)
-
-        # In case we're referrencing a file without file://
-        if parsed_url.scheme == '':
-            if not os.path.isfile(url):
-                raise IOError('Could not find file to extract: {}'.format(url))
-
-            url = 'file://%s' % os.path.abspath(url)
-            parsed_fd = urlparse.urlparse(url)
-
-        request = urllib2.Request(url)
-        response = urllib2.urlopen(request)
-
-        if parsed_url.scheme == 'file':
             filename = url.split('/')[-1]
             # XXX: bz2/gz instead of tar.{bz2/gz}
             extension = filename[filename.rfind('.')+1:]
             mimetype = EXTENSION_TO_MIMETYPE[extension]
-        else:
-            mimetype = response.headers.type
-
-        self.debug('Url: {}'.format(url))
-        self.debug('Mimetype: {}'.format(mimetype))
-        self.debug('Content-Encoding {}'.format(response.headers.get('Content-Encoding')))
+            self.debug('Mimetype: {}'.format(mimetype))
 
-        function = MIMETYPES[mimetype]['function']
-        kwargs = {
-            'file_object': response,
-            'extract_to': extract_to,
-            'extract_dirs': extract_dirs,
-            'verbose': verbose,
-        }
-        kwargs.update(MIMETYPES[mimetype].get('kwargs', {}))
+            function = MIMETYPES[mimetype]['function']
+            kwargs = {
+                'compressed_file': compressed_file,
+                'extract_to': extract_to,
+                'extract_dirs': extract_dirs,
+                'verbose': verbose,
+            }
+            kwargs.update(MIMETYPES[mimetype].get('kwargs', {}))
 
+            return function, kwargs
+
+        # Many scripts overwrite this method and set extract_dirs to None
+        extract_dirs = '*' if extract_dirs is None else extract_dirs
         self.info('Downloading and extracting to {} these dirs {} from {}'.format(
             extract_to,
             ', '.join(extract_dirs),
             url,
         ))
+
+        # 1) Let's fetch the file
         retry_args = dict(
-            failure_status=None,
-            retry_exceptions=(urllib2.HTTPError, urllib2.URLError,
-                              httplib.BadStatusLine,
-                              socket.timeout, socket.error),
+            retry_exceptions=(
+                urllib2.HTTPError,
+                urllib2.URLError,
+                httplib.BadStatusLine,
+                socket.timeout,
+                socket.error,
+                FetchedIncorrectFilesize,
+            ),
             error_message="Can't download from {}".format(url),
             error_level=FATAL,
         )
-        self.retry(
-            function,
-            kwargs=kwargs,
+        compressed_file = self.retry(
+            self.fetch_url_into_memory,
+            kwargs={'url': url},
             **retry_args
         )
 
+        # 2) We're guaranteed to have download the file with error_level=FATAL
+        #    Let's unpack the file
+        function, kwargs = _determine_extraction_method_and_kwargs(url)
+        function(**kwargs)
+
 
     def load_json_url(self, url, error_level=None, *args, **kwargs):
         """ Returns a json object from a url (it retries). """
         contents = self._retry_download(
             url=url, error_level=error_level, *args, **kwargs
         )
         return json.loads(contents.read())