Bug 1348229 - Add support to unzip directly from a remote URL. r?mshal draft
author Mike Hommey <mh+mozilla@glandium.org>
Fri, 17 Mar 2017 17:19:00 +0900
changeset 500495 6a1e8dccc27df226f725bec35eedf2112c245a4d
parent 500494 90860909bd1c84618b6c2650bbe2f82605f11084
child 549649 789050ef5cc094c71639246f8b4c6a21177409df
push id 49734
push user bmo:mh+mozilla@glandium.org
push date Fri, 17 Mar 2017 08:40:19 +0000
reviewers mshal
bugs 1348229
milestone 55.0a1
Bug 1348229 - Add support to unzip directly from a remote URL. r?mshal
python/mozbuild/mozbuild/action/unzip.py
python/mozbuild/mozpack/mozjar.py
--- a/python/mozbuild/mozbuild/action/unzip.py
+++ b/python/mozbuild/mozbuild/action/unzip.py
@@ -4,34 +4,110 @@
 
 # This script extracts files from a zip archive, and can also simply
 # list its contents.
 
 from __future__ import absolute_import
 
 import argparse
 import mozpack.path as mozpath
+import requests
 import sys
 from mozbuild.util import ensureParentDir
 from mozpack.mozjar import JarReader
 from mozpack.files import DeflatedFile
 
 
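+# Mixin shared by the range objects below: _normalize_slice turns a slice
+# with possibly negative bounds into absolute (start, end) offsets, where
+# end is None for open-ended slices.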
+class RangeHelperTrait(object):
+    def _normalize_slice(self, key):
+        assert key.step is None
+        start = key.start or 0
+        if start < 0:
+            start += len(self)
+        end = key.stop
+        if end is None:
+            return start, None
+        if end < 0:
+            end += len(self) + 1
+        return start, end
+
+
+# The JarReader class can take any kind of sliceable object as raw data.
+# The HTTPRangeData class is one such object; it performs HTTP Range
+# requests when slices are requested.
+# Because the JarReader relies on memoryview-like behavior and uses
+# open-ended slices, we don't emit an HTTP request for an open-ended
+# slice; a request only happens once a bounded slice is taken. And
+# because the JarReader does a lot of small reads (2 or 4 bytes), we
+# always fetch at least 1024 bytes at once.
+# We don't cache every range of data we've requested so far, but we do
+# cache the last one. In practice, this is enough to avoid requesting
+# the same data multiple times.
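+#
+# A minimal usage sketch (the URL here is hypothetical):
+#
+#     data = HTTPRangeData('https://example.com/archive.zip')
+#     jar = JarReader(data=data)
+#     for entry in jar.entries:
+#         print entry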
+class HTTPRangeData(RangeHelperTrait):
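+    # Returned for open-ended slices (data[n:]): it records the absolute
+    # start offset and defers any HTTP request until a bounded slice is
+    # taken from it.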
+    class HTTPOpenEndedRange(RangeHelperTrait):
+        def __init__(self, range, start):
+            self._range = range
+            self._start = start
+
+        def __getitem__(self, key):
+            assert isinstance(key, slice)
+            start, end = self._normalize_slice(key)
+            if end is None:
+                return HTTPRangeData.HTTPOpenEndedRange(
+                    self._range, self._start + start)
+
+            return self._range[self._start + start: self._start + end]
+
+        def __len__(self):
+            return len(self._range) - self._start
+
+    def __init__(self, url):
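+        # A HEAD request (following redirects) gives us the total size,
+        # needed for __len__, and the final URL, so later range requests
+        # don't have to walk the redirect chain again.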
+        self._session = requests.Session()
+        r = self._session.head(url, allow_redirects=True)
+        self._len = long(r.headers['Content-Length'])
+        self._url = r.url
+        self._last = None
+
+    def __getitem__(self, key):
+        assert isinstance(key, slice)
+        start, end = self._normalize_slice(key)
+        if end is None:
+            return HTTPRangeData.HTTPOpenEndedRange(self, start)
+
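+        # Serve the request from the most recently fetched range when it
+        # fully covers the requested bytes.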
+        if self._last:
+            last_start, last_content = self._last
+            last_end = last_start + len(last_content)
+            if start >= last_start and end <= last_end:
+                return last_content[start - last_start:end - last_start]
+
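+        # HTTP Range end offsets are inclusive; always ask for at least
+        # 1024 bytes to amortize the JarReader's many tiny reads.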
+        r = self._session.get(self._url, headers={
+            'Range': 'bytes={}-{}'.format(start, max(start + 1024, end)),
+        })
+        self._last = (start, r.content)
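+        # Return exactly the requested bytes; slicing the memoryview
+        # avoids copying the over-fetched tail.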
+        return memoryview(r.content)[:end - start]
+
+    def __len__(self):
+        return self._len
+
+
 def main(args):
     parser = argparse.ArgumentParser()
     parser.add_argument("-C", metavar='DIR', default=".",
                         help="Change to given directory before extracting")
     parser.add_argument("-l", action='store_true',
                         help="List files")
     parser.add_argument("zip", help="Path to zip file")
     parser.add_argument("files", nargs="*",
                         help="Path to files to extract from zip")
     args = parser.parse_args(args)
 
-    jar = JarReader(file=args.zip)
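+    # Treat anything that looks like a URL as remote data accessed through
+    # HTTP range requests; everything else is a local zip file.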
+    if '://' in args.zip:
+        jar = JarReader(data=HTTPRangeData(args.zip))
+    else:
+        jar = JarReader(file=args.zip)
 
     for entry in jar.entries:
         if not args.files or any(mozpath.match(entry, f)
                                  for f in args.files):
             if args.l:
                 print entry
             else:
                 print 'Extracting {}'.format(entry)
--- a/python/mozbuild/mozpack/mozjar.py
+++ b/python/mozbuild/mozpack/mozjar.py
@@ -336,17 +336,19 @@ class JarReader(object):
         '''
         Opens the given file as a Jar archive. Use the given file-like object
         if one is given instead of opening the given file name.
         '''
         if fileobj:
             data = fileobj.read()
         elif file:
             data = open(file, 'rb').read()
-        self._data = memoryview(data)
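+        # Only wrap plain strings in a memoryview; any other data object
+        # (e.g. HTTPRangeData) is expected to support slicing already.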
+        self._data = data
+        if isinstance(self._data, str):
+            self._data = memoryview(self._data)
         # The End of Central Directory Record has a variable size because of
         # comments it may contain, so scan for it from the end of the file.
         offset = -CDIR_END_SIZE
         while True:
             signature = JarStruct.get_data('uint32', self._data[offset:])[0]
             if signature == JarCdirEnd.MAGIC:
                 break
             if offset == -len(self._data):
@@ -368,19 +370,22 @@ class JarReader(object):
         directory. Directory entries are skipped.
         '''
         if hasattr(self, '_entries'):
             return self._entries
         preload = 0
         if self.is_optimized:
             preload = JarStruct.get_data('uint32', self._data)[0]
         entries = OrderedDict()
-        offset = self._cdir_end['cdir_offset']
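+        # Slice out the whole central directory once, so each entry is
+        # parsed from data we already hold; with HTTPRangeData as backing
+        # store, this avoids potentially issuing many small range requests
+        # while parsing entries.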
+        cdir_offset = self._cdir_end['cdir_offset']
+        cdir_size = self._cdir_end['cdir_size']
+        cdir_data = self._data[cdir_offset:cdir_offset + cdir_size]
+        offset = 0
         for e in xrange(self._cdir_end['cdir_entries']):
-            entry = JarCdirEntry(self._data[offset:])
+            entry = JarCdirEntry(cdir_data[offset:])
             offset += entry.size
             # Creator host system. 0 is MSDOS, 3 is Unix
             host = entry['creator_version'] >> 8
             # External attributes values depend on host above. On Unix the
             # higher bits are the stat.st_mode value. On MSDOS, the lower bits
             # are the FAT attributes.
             xattr = entry['external_attr']
             # Skip directories