author Armen Zambrano Gasparnian <armenzg@mozilla.com>

Mon, 12 Sep 2016 11:41:04 -0400

changeset 413074 a6d2ff10d19641a83a07776e5cec34f4a07fdc9d

parent 412625 1851b78b5a9673ee422f189b92e5f1e86b82a01c

child 531117 6d46e5ec7d7548df6cc54b7e29f0a834f041f227

push id 29315

push user armenzg@mozilla.com

push date Tue, 13 Sep 2016 13:13:24 +0000

bugs 1300812

milestone 51.0a1

testing/mozharness/mozharness/base/script.py file | annotate | diff | comparison | revisions
--- a/testing/mozharness/mozharness/base/script.py
+++ b/testing/mozharness/mozharness/base/script.py
@@ -45,24 +45,28 @@ if os.name == 'nt':
         PYWIN32 = False
 
 try:
     import simplejson as json
     assert json
 except ImportError:
     import json
 
-from cStringIO import StringIO
+from io import BytesIO
 
 from mozprocess import ProcessHandler
 from mozharness.base.config import BaseConfig
 from mozharness.base.log import SimpleFileLogger, MultiFileLogger, \
     LogMixin, OutputParser, DEBUG, INFO, ERROR, FATAL
 
 
+class FetchedIncorrectFilesize(Exception):
+    pass
+
+
 def platform_name():
     pm = PlatformMixin()
 
     if pm._is_linux() and pm._is_64_bit():
         return 'linux64'
     elif pm._is_linux() and not pm._is_64_bit():
         return 'linux'
     elif pm._is_darwin():
@@ -336,16 +340,81 @@ class ScriptMixin(PlatformMixin):
 
         .. _urllib2.urlopen:
         https://docs.python.org/2/library/urllib2.html#urllib2.urlopen
         """
         # http://bugs.python.org/issue13359 - urllib2 does not automatically quote the URL
         url_quoted = urllib2.quote(url, safe='%/:=&?~#+!$,;\'@()*[]|')
         return urllib2.urlopen(url_quoted, **kwargs)
 
+
+
+    def fetch_url_into_memory(self, url):
+        ''' Downloads a file from a url into memory instead of disk.
+
+        Args:
+            url (str): URL path where the file to be downloaded is located.
+
+        Raises:
+            IOError: When the url points to a file on disk and cannot be found
+            FetchedIncorrectFilesize: When the size of the fetched file does not match the
+                                      expected file size.
+            ValueError: When the scheme of a url is not what is expected.
+
+        Returns:
+            BytesIO: contents of url
+        '''
+        self.info('Fetch {} into memory'.format(url))
+        parsed_url = urlparse.urlparse(url)
+
+        if parsed_url.scheme in ('', 'file'):
+            if not os.path.isfile(url):
+                raise IOError('Could not find file to extract: {}'.format(url))
+
+            expected_file_size = os.stat(url.replace('file://', '')).st_size
+
+            # In case we're referrencing a file without file://
+            if parsed_url.scheme == '':
+                url = 'file://%s' % os.path.abspath(url)
+                parsed_url = urlparse.urlparse(url)
+
+        request = urllib2.Request(url)
+        # Exceptions to be retried:
+        # Bug 1300663 - HTTPError: HTTP Error 404: Not Found
+        # Bug 1300413 - HTTPError: HTTP Error 500: Internal Server Error
+        # Bug 1300943 - HTTPError: HTTP Error 503: Service Unavailable
+        # Bug 1300953 - URLError: <urlopen error [Errno -2] Name or service not known>
+        # Bug 1301594 - URLError: <urlopen error [Errno 10054] An existing connection was ...
+        # Bug 1301597 - URLError: <urlopen error [Errno 8] _ssl.c:504: EOF occurred in ...
+        # Bug 1301855 - URLError: <urlopen error [Errno 60] Operation timed out>
+        # Bug 1302237 - URLError: <urlopen error [Errno 104] Connection reset by peer>
+        # Bug 1301807 - BadStatusLine: ''
+        response = urllib2.urlopen(request)
+
+        if parsed_url.scheme in ('http', 'https'):
+            expected_file_size = int(response.headers.get('Content-Length'))
+
+        self.info('Expected file size: {}'.format(expected_file_size))
+        self.debug('Url: {}'.format(url))
+        self.debug('Content-Encoding {}'.format(response.headers.get('Content-Encoding')))
+
+        file_contents = response.read()
+        obtained_file_size = len(file_contents)
+
+        if obtained_file_size != expected_file_size:
+            raise FetchedIncorrectFilesize(
+                'The expected file size is {} while we got instead {}'.format(
+                    expected_file_size, obtained_file_size)
+            )
+
+        # Use BytesIO instead of StringIO
+        # http://stackoverflow.com/questions/34162017/unzip-buffer-with-python/34162395#34162395
+        return BytesIO(file_contents)
+
+
     def _download_file(self, url, file_name):
         """ Helper script for download_file()
         Additionaly this function logs all exceptions as warnings before
         re-raising them
 
         Args:
             url (str): string containing the URL with the file location
             file_name (str): name of the file where the downloaded file
@@ -465,157 +534,157 @@ class ScriptMixin(PlatformMixin):
         """Filter entries of the archive based on the specified list of to extract dirs."""
         filter_partial = functools.partial(fnmatch.filter, namelist)
         entries = itertools.chain(*map(filter_partial, extract_dirs or ['*']))
 
         for entry in entries:
             yield entry
 
 
-    def unzip(self, file_object, extract_to='.', extract_dirs='*', verbose=False):
+    def unzip(self, compressed_file, extract_to, extract_dirs='*', verbose=False):
         """This method allows to extract a zip file without writing to disk first.
 
         Args:
-            file_object (object): Any file like object that is seekable.
-            extract_to (str, optional): where to extract the compressed file.
+            compressed_file (object): File-like object with the contents of a compressed zip file.
+            extract_to (str): where to extract the compressed file.
             extract_dirs (list, optional): directories inside the archive file to extract.
                                            Defaults to '*'.
+            verbose (bool, optional): whether or not extracted content should be displayed.
+                                      Defaults to False.
+
+        Raises:
+            zipfile.BadZipFile: on contents of zipfile being invalid
         """
-        compressed_file = StringIO(file_object.read())
-        try:
-            with zipfile.ZipFile(compressed_file) as bundle:
-                entries = self._filter_entries(bundle.namelist(), extract_dirs)
+        with zipfile.ZipFile(compressed_file) as bundle:
+            entries = self._filter_entries(bundle.namelist(), extract_dirs)
 
-                for entry in entries:
-                    if verbose:
-                        self.info(' {}'.format(entry))
-                    bundle.extract(entry, path=extract_to)
+            for entry in entries:
+                if verbose:
+                    self.info(' {}'.format(entry))
 
-                    # ZipFile doesn't preserve permissions during extraction:
-                    # http://bugs.python.org/issue15795
-                    fname = os.path.realpath(os.path.join(extract_to, entry))
+                # Exception to be retried:
+                # Bug 1301645 - BadZipfile: Bad CRC-32 for file ...
+                #    http://stackoverflow.com/questions/5624669/strange-badzipfile-bad-crc-32-problem/5626098#5626098
+                # Bug 1301802 - error: Error -3 while decompressing: invalid stored block lengths
+                bundle.extract(entry, path=extract_to)
+
+                # ZipFile doesn't preserve permissions during extraction:
+                # http://bugs.python.org/issue15795
+                fname = os.path.realpath(os.path.join(extract_to, entry))
+                try:
+                    # getinfo() can raise KeyError
                     mode = bundle.getinfo(entry).external_attr >> 16 & 0x1FF
                     # Only set permissions if attributes are available. Otherwise all
                     # permissions will be removed eg. on Windows.
                     if mode:
                         os.chmod(fname, mode)
 
-        except zipfile.BadZipfile as e:
-            self.exception('{}'.format(e.message))
+                except KeyError:
+                    self.warning('{} was not found in the zip file'.format(entry))
 
 
-    def deflate(self, file_object, mode, extract_to='.', extract_dirs='*', verbose=False):
-        """This method allows to extract a tar, tar.bz2 and tar.gz file without writing to disk first.
+    def deflate(self, compressed_file, mode, extract_to='.', *args, **kwargs):
+        """This method allows to extract a compressed file from a tar, tar.bz2 and tar.gz files.
 
         Args:
-            file_object (object): Any file like object that is seekable.
+            compressed_file (object): File-like object with the contents of a compressed file.
+            mode (str): string of the form 'filemode[:compression]' (e.g. 'r:gz' or 'r:bz2')
             extract_to (str, optional): where to extract the compressed file.
-            extract_dirs (list, optional): directories inside the archive file to extract.
-                                           Defaults to `*`.
-            verbose (bool, optional): whether or not extracted content should be displayed.
-                                      Defaults to False.
         """
-        compressed_file = StringIO(file_object.read())
         t = tarfile.open(fileobj=compressed_file, mode=mode)
         t.extractall(path=extract_to)
 
 
     def download_unpack(self, url, extract_to='.', extract_dirs='*', verbose=False):
         """Generic method to download and extract a compressed file without writing it to disk first.
 
         Args:
             url (str): URL where the file to be downloaded is located.
             extract_to (str, optional): directory where the downloaded file will
                                         be extracted to.
             extract_dirs (list, optional): directories inside the archive to extract.
                                            Defaults to `*`. It currently only applies to zip files.
-
-        Raises:
-            IOError: on `filename` file not found.
+            verbose (bool, optional): whether or not extracted content should be displayed.
+                                      Defaults to False.
 
         """
-        # Many scripts overwrite this method and set extract_dirs to None
-        extract_dirs = '*' if extract_dirs is None else extract_dirs
-        EXTENSION_TO_MIMETYPE = {
-            'bz2': 'application/x-bzip2',
-            'gz':  'application/x-gzip',
-            'tar': 'application/x-tar',
-            'zip': 'application/zip',
-        }
-        MIMETYPES = {
-            'application/x-bzip2': {
-                'function': self.deflate,
-                'kwargs': {'mode': 'r:bz2'},
-            },
-            'application/x-gzip': {
-                'function': self.deflate,
-                'kwargs': {'mode': 'r:gz'},
-            },
-            'application/x-tar': {
-                'function': self.deflate,
-                'kwargs': {'mode': 'r'},
-            },
-            'application/zip': {
-                'function': self.unzip,
-            },
-        }
+        def _determine_extraction_method_and_kwargs(url):
+            EXTENSION_TO_MIMETYPE = {
+                'bz2': 'application/x-bzip2',
+                'gz':  'application/x-gzip',
+                'tar': 'application/x-tar',
+                'zip': 'application/zip',
+            }
+            MIMETYPES = {
+                'application/x-bzip2': {
+                    'function': self.deflate,
+                    'kwargs': {'mode': 'r:bz2'},
+                },
+                'application/x-gzip': {
+                    'function': self.deflate,
+                    'kwargs': {'mode': 'r:gz'},
+                },
+                'application/x-tar': {
+                    'function': self.deflate,
+                    'kwargs': {'mode': 'r'},
+                },
+                'application/zip': {
+                    'function': self.unzip,
+                },
+            }
 
-        parsed_url = urlparse.urlparse(url)
-
-        # In case we're referrencing a file without file://
-        if parsed_url.scheme == '':
-            if not os.path.isfile(url):
-                raise IOError('Could not find file to extract: {}'.format(url))
-
-            url = 'file://%s' % os.path.abspath(url)
-            parsed_fd = urlparse.urlparse(url)
-
-        request = urllib2.Request(url)
-        response = urllib2.urlopen(request)
-
-        if parsed_url.scheme == 'file':
             filename = url.split('/')[-1]
             # XXX: bz2/gz instead of tar.{bz2/gz}
             extension = filename[filename.rfind('.')+1:]
             mimetype = EXTENSION_TO_MIMETYPE[extension]
-        else:
-            mimetype = response.headers.type
-
-        self.debug('Url: {}'.format(url))
-        self.debug('Mimetype: {}'.format(mimetype))
-        self.debug('Content-Encoding {}'.format(response.headers.get('Content-Encoding')))
+            self.debug('Mimetype: {}'.format(mimetype))
 
-        function = MIMETYPES[mimetype]['function']
-        kwargs = {
-            'file_object': response,
-            'extract_to': extract_to,
-            'extract_dirs': extract_dirs,
-            'verbose': verbose,
-        }
-        kwargs.update(MIMETYPES[mimetype].get('kwargs', {}))
+            function = MIMETYPES[mimetype]['function']
+            kwargs = {
+                'compressed_file': compressed_file,
+                'extract_to': extract_to,
+                'extract_dirs': extract_dirs,
+                'verbose': verbose,
+            }
+            kwargs.update(MIMETYPES[mimetype].get('kwargs', {}))
 
+            return function, kwargs
+
+        # Many scripts overwrite this method and set extract_dirs to None
+        extract_dirs = '*' if extract_dirs is None else extract_dirs
         self.info('Downloading and extracting to {} these dirs {} from {}'.format(
             extract_to,
             ', '.join(extract_dirs),
             url,
         ))
+
+        # 1) Let's fetch the file
         retry_args = dict(
-            failure_status=None,
-            retry_exceptions=(urllib2.HTTPError, urllib2.URLError,
-                              httplib.BadStatusLine,
-                              socket.timeout, socket.error),
+            retry_exceptions=(
+                urllib2.HTTPError,
+                urllib2.URLError,
+                httplib.BadStatusLine,
+                socket.timeout,
+                socket.error,
+                FetchedIncorrectFilesize,
+            ),
             error_message="Can't download from {}".format(url),
             error_level=FATAL,
         )
-        self.retry(
-            function,
-            kwargs=kwargs,
+        compressed_file = self.retry(
+            self.fetch_url_into_memory,
+            kwargs={'url': url},
             **retry_args
         )
 
+        # 2) We're guaranteed to have download the file with error_level=FATAL
+        #    Let's unpack the file
+        function, kwargs = _determine_extraction_method_and_kwargs(url)
+        function(**kwargs)
+
 
     def load_json_url(self, url, error_level=None, *args, **kwargs):
         """ Returns a json object from a url (it retries). """
         contents = self._retry_download(
             url=url, error_level=error_level, *args, **kwargs
         )
         return json.loads(contents.read())
author	Armen Zambrano Gasparnian <armenzg@mozilla.com>
	Mon, 12 Sep 2016 11:41:04 -0400
changeset 413074	a6d2ff10d19641a83a07776e5cec34f4a07fdc9d
parent 412625	1851b78b5a9673ee422f189b92e5f1e86b82a01c
child 531117	6d46e5ec7d7548df6cc54b7e29f0a834f041f227
push id	29315
push user	armenzg@mozilla.com
push date	Tue, 13 Sep 2016 13:13:24 +0000
bugs	1300812
milestone	51.0a1