Bug 1272083 - Download and unpacking should be performed in process. draft
author: Armen Zambrano Gasparnian <armenzg@mozilla.com>
Thu, 25 Aug 2016 11:04:16 -0400
changeset 408706 f54fc25438431bce343254709fe8cb313231f353
parent 408083 b3ec8a3373e8faca26c39e9ff577a2d4f8b6927a
child 530167 55bcf824be266553721f846117b8fb91442f245e
push id: 28275
push user: bmo:armenzg@mozilla.com
push date: Thu, 01 Sep 2016 15:16:29 +0000
bugs: 1272083
milestone: 51.0a1
Bug 1272083 - Download and unpacking should be performed in process. Instead of downloading a file first and then unpacking it, we would like to fetch it into memory and then unpack it directly from there. This avoids writing the file to disk first, thus saving on IO. MozReview-Commit-ID: JdNGnxIYEvy
testing/mozharness/mozharness/base/script.py
testing/mozharness/test/test_base_script.py
--- a/testing/mozharness/mozharness/base/script.py
+++ b/testing/mozharness/mozharness/base/script.py
@@ -45,16 +45,18 @@ if os.name == 'nt':
         PYWIN32 = False
 
 try:
     import simplejson as json
     assert json
 except ImportError:
     import json
 
+from cStringIO import StringIO
+
 from mozprocess import ProcessHandler
 from mozharness.base.config import BaseConfig
 from mozharness.base.log import SimpleFileLogger, MultiFileLogger, \
     LogMixin, OutputParser, DEBUG, INFO, ERROR, FATAL
 
 
 def platform_name():
     pm = PlatformMixin()
@@ -453,38 +455,167 @@ class ScriptMixin(PlatformMixin):
             kwargs = {"url": url, "file_name": file_name}
 
         return self.retry(
             download_func,
             kwargs=kwargs,
             **retry_args
         )
 
-    def download_unpack(self, url, extract_to, extract_dirs=None,
-                        error_level=FATAL):
-        """Generic method to download and extract a compressed file.
+
+    def _filter_entries(self, namelist, extract_dirs):
+        """Filter entries of the archive based on the specified list of directories to extract."""
+        filter_partial = functools.partial(fnmatch.filter, namelist)
+        entries = itertools.chain(*map(filter_partial, extract_dirs or ['*']))
+
+        for entry in entries:
+            yield entry
+
+
+    def unzip(self, file_object, extract_to='.', extract_dirs='*', verbose=False):
+        """This method extracts a zip file without writing it to disk first.
+
+        Args:
+            file_object (object): Any file like object that is seekable.
+            extract_to (str, optional): where to extract the compressed file.
+            extract_dirs (list, optional): directories inside the archive file to extract.
+                                           Defaults to '*'.
+        """
+        compressed_file = StringIO(file_object.read())
+        try:
+            with zipfile.ZipFile(compressed_file) as bundle:
+                entries = self._filter_entries(bundle.namelist(), extract_dirs)
+
+                for entry in entries:
+                    if verbose:
+                        self.info(' {}'.format(entry))
+                    bundle.extract(entry, path=extract_to)
 
-        The downloaded file will always be saved to the working directory and is not getting
-        deleted after extracting.
+                    # ZipFile doesn't preserve permissions during extraction:
+                    # http://bugs.python.org/issue15795
+                    fname = os.path.realpath(os.path.join(extract_to, entry))
+                    mode = bundle.getinfo(entry).external_attr >> 16 & 0x1FF
+                    # Only set permissions if attributes are available. Otherwise all
+                    # permissions will be removed eg. on Windows.
+                    if mode:
+                        os.chmod(fname, mode)
+
+        except zipfile.BadZipfile as e:
+            self.exception('{}'.format(e.message))
+
+
+    def deflate(self, file_object, mode, extract_to='.', extract_dirs='*', verbose=False):
+        """This method extracts a tar, tar.bz2 or tar.gz file without writing it to disk first.
+
+        Args:
+            file_object (object): Any file like object that is seekable.
+            extract_to (str, optional): where to extract the compressed file.
+            extract_dirs (list, optional): directories inside the archive file to extract.
+                                           Defaults to `*`.
+            verbose (bool, optional): whether or not extracted content should be displayed.
+                                      Defaults to False.
+        """
+        compressed_file = StringIO(file_object.read())
+        t = tarfile.open(fileobj=compressed_file, mode=mode)
+        t.extractall(path=extract_to)
+
+
+    def download_unpack(self, url, extract_to='.', extract_dirs='*', verbose=False):
+        """Generic method to download and extract a compressed file without writing it to disk first.
 
         Args:
             url (str): URL where the file to be downloaded is located.
-            extract_to (str): directory where the downloaded file will
-                              be extracted to.
+            extract_to (str, optional): directory where the downloaded file will
+                                        be extracted to.
             extract_dirs (list, optional): directories inside the archive to extract.
-                                           Defaults to `None`.
-            error_level (str, optional): log level to use in case an error occurs.
-                                         Defaults to `FATAL`.
+                                           Defaults to `*`. It currently only applies to zip files.
+
+        Raises:
+            IOError: if `url` has no scheme and is not an existing local file.
 
         """
-        dirs = self.query_abs_dirs()
-        archive = self.download_file(url, parent_dir=dirs['abs_work_dir'],
-                                     error_level=error_level)
-        self.unpack(archive, extract_to, extract_dirs=extract_dirs,
-                    error_level=error_level)
+        # Many scripts override this method and set extract_dirs to None
+        extract_dirs = '*' if extract_dirs is None else extract_dirs
+        EXTENSION_TO_MIMETYPE = {
+            'bz2': 'application/x-bzip2',
+            'gz':  'application/x-gzip',
+            'tar': 'application/x-tar',
+            'zip': 'application/zip',
+        }
+        MIMETYPES = {
+            'application/x-bzip2': {
+                'function': self.deflate,
+                'kwargs': {'mode': 'r:bz2'},
+            },
+            'application/x-gzip': {
+                'function': self.deflate,
+                'kwargs': {'mode': 'r:gz'},
+            },
+            'application/x-tar': {
+                'function': self.deflate,
+                'kwargs': {'mode': 'r'},
+            },
+            'application/zip': {
+                'function': self.unzip,
+            },
+        }
+
+        parsed_url = urlparse.urlparse(url)
+
+        # In case we're referencing a file without file://
+        if parsed_url.scheme == '':
+            if not os.path.isfile(url):
+                raise IOError('Could not find file to extract: {}'.format(url))
+
+            url = 'file://%s' % os.path.abspath(url)
+            parsed_fd = urlparse.urlparse(url)
+
+        request = urllib2.Request(url)
+        response = urllib2.urlopen(request)
+
+        if parsed_url.scheme == 'file':
+            filename = url.split('/')[-1]
+            # XXX: bz2/gz instead of tar.{bz2/gz}
+            extension = filename[filename.rfind('.')+1:]
+            mimetype = EXTENSION_TO_MIMETYPE[extension]
+        else:
+            mimetype = response.headers.type
+
+        self.debug('Url: {}'.format(url))
+        self.debug('Mimetype: {}'.format(mimetype))
+        self.debug('Content-Encoding {}'.format(response.headers.get('Content-Encoding')))
+
+        function = MIMETYPES[mimetype]['function']
+        kwargs = {
+            'file_object': response,
+            'extract_to': extract_to,
+            'extract_dirs': extract_dirs,
+            'verbose': verbose,
+        }
+        kwargs.update(MIMETYPES[mimetype].get('kwargs', {}))
+
+        self.info('Downloading and extracting to {} these dirs {} from {}'.format(
+            extract_to,
+            ', '.join(extract_dirs),
+            url,
+        ))
+        retry_args = dict(
+            failure_status=None,
+            retry_exceptions=(urllib2.HTTPError, urllib2.URLError,
+                              httplib.BadStatusLine,
+                              socket.timeout, socket.error),
+            error_message="Can't download from {}".format(url),
+            error_level=FATAL,
+        )
+        self.retry(
+            function,
+            kwargs=kwargs,
+            **retry_args
+        )
+
 
     def load_json_url(self, url, error_level=None, *args, **kwargs):
         """ Returns a json object from a url (it retries). """
         contents = self._retry_download(
             url=url, error_level=error_level, *args, **kwargs
         )
         return json.loads(contents.read())
 
@@ -1404,30 +1535,24 @@ class ScriptMixin(PlatformMixin):
               of the command is not in `success_codes`. Defaults to 2.
             verbose (bool, optional): whether or not extracted content should be displayed.
                                       Defaults to False.
 
         Raises:
             IOError: on `filename` file not found.
 
         """
-        def _filter_entries(namelist):
-            """Filter entries of the archive based on the specified list of to extract dirs."""
-            filter_partial = functools.partial(fnmatch.filter, namelist)
-            for entry in itertools.chain(*map(filter_partial, extract_dirs or ['*'])):
-                yield entry
-
         if not os.path.isfile(filename):
             raise IOError('Could not find file to extract: %s' % filename)
 
         if zipfile.is_zipfile(filename):
             try:
                 self.info('Using ZipFile to extract {} to {}'.format(filename, extract_to))
                 with zipfile.ZipFile(filename) as bundle:
-                    for entry in _filter_entries(bundle.namelist()):
+                    for entry in self._filter_entries(bundle.namelist(), extract_dirs):
                         if verbose:
                             self.info(' %s' % entry)
                         bundle.extract(entry, path=extract_to)
 
                         # ZipFile doesn't preserve permissions during extraction:
                         # http://bugs.python.org/issue15795
                         fname = os.path.realpath(os.path.join(extract_to, entry))
                         mode = bundle.getinfo(entry).external_attr >> 16 & 0x1FF
@@ -1439,17 +1564,17 @@ class ScriptMixin(PlatformMixin):
                 self.log('%s (%s)' % (e.message, filename),
                          level=error_level, exit_code=fatal_exit_code)
 
         # Bug 1211882 - is_tarfile cannot be trusted for dmg files
         elif tarfile.is_tarfile(filename) and not filename.lower().endswith('.dmg'):
             try:
                 self.info('Using TarFile to extract {} to {}'.format(filename, extract_to))
                 with tarfile.open(filename) as bundle:
-                    for entry in _filter_entries(bundle.getnames()):
+                    for entry in self._filter_entries(bundle.getnames(), extract_dirs):
                         if verbose:
                             self.info(' %s' % entry)
                         bundle.extract(entry, path=extract_to)
             except tarfile.TarError as e:
                 self.log('%s (%s)' % (e.message, filename),
                          level=error_level, exit_code=fatal_exit_code)
         else:
             self.log('No extraction method found for: %s' % filename,
--- a/testing/mozharness/test/test_base_script.py
+++ b/testing/mozharness/test/test_base_script.py
@@ -254,16 +254,60 @@ class TestScript(unittest.TestCase):
                 'regex': re.compile(',$'), 'level': IGNORE,
             }, {
                 'substr': ']$', 'level': WARNING,
             }])
         error_logsize = os.path.getsize("test_logs/test_error.log")
         self.assertTrue(error_logsize > 0,
                         msg="error list not working properly")
 
+    def test_download_unpack(self):
+        # NOTE: The action is called *download*; however, it also works for files on disk
+        self.s = get_debug_script_obj()
+
+        archives_path = os.path.join(here, 'helper_files', 'archives')
+
+        # Test basic decompression
+        for archive in ('archive.tar', 'archive.tar.bz2', 'archive.tar.gz', 'archive.zip'):
+            self.s.download_unpack(
+                url=os.path.join(archives_path, archive),
+                extract_to=self.tmpdir
+            )
+            self.assertIn('script.sh', os.listdir(os.path.join(self.tmpdir, 'bin')))
+            self.assertIn('lorem.txt', os.listdir(self.tmpdir))
+            shutil.rmtree(self.tmpdir)
+
+        # Test permissions for extracted entries from zip archive
+        self.s.download_unpack(
+            url=os.path.join(archives_path, 'archive.zip'),
+            extract_to=self.tmpdir,
+        )
+        file_stats = os.stat(os.path.join(self.tmpdir, 'bin', 'script.sh'))
+        orig_fstats = os.stat(os.path.join(archives_path, 'reference', 'bin', 'script.sh'))
+        self.assertEqual(file_stats.st_mode, orig_fstats.st_mode)
+        shutil.rmtree(self.tmpdir)
+
+        # Test unzip specific dirs only
+        self.s.download_unpack(
+            url=os.path.join(archives_path, 'archive.zip'),
+            extract_to=self.tmpdir,
+            extract_dirs=['bin/*']
+        )
+        self.assertIn('bin', os.listdir(self.tmpdir))
+        self.assertNotIn('lorem.txt', os.listdir(self.tmpdir))
+        shutil.rmtree(self.tmpdir)
+
+        # Test for invalid filenames (Windows only)
+        if PYWIN32:
+            with self.assertRaises(IOError):
+                self.s.download_unpack(
+                    url=os.path.join(archives_path, 'archive_invalid_filename.zip'),
+                    extract_to=self.tmpdir
+                )
+
     def test_unpack(self):
         self.s = get_debug_script_obj()
 
         archives_path = os.path.join(here, 'helper_files', 'archives')
 
         # Test basic decompression
         for archive in ('archive.tar', 'archive.tar.bz2', 'archive.tar.gz', 'archive.zip'):
             self.s.unpack(os.path.join(archives_path, archive), self.tmpdir)