--- a/testing/mozharness/mozharness/base/script.py
+++ b/testing/mozharness/mozharness/base/script.py
@@ -45,24 +45,28 @@ if os.name == 'nt':
PYWIN32 = False
try:
import simplejson as json
assert json
except ImportError:
import json
-from cStringIO import StringIO
+from io import BytesIO
from mozprocess import ProcessHandler
from mozharness.base.config import BaseConfig
from mozharness.base.log import SimpleFileLogger, MultiFileLogger, \
LogMixin, OutputParser, DEBUG, INFO, ERROR, FATAL
+class FetchedIncorrectFilesize(Exception):
+ pass
+
+
def platform_name():
pm = PlatformMixin()
if pm._is_linux() and pm._is_64_bit():
return 'linux64'
elif pm._is_linux() and not pm._is_64_bit():
return 'linux'
elif pm._is_darwin():
@@ -336,16 +340,81 @@ class ScriptMixin(PlatformMixin):
.. _urllib2.urlopen:
https://docs.python.org/2/library/urllib2.html#urllib2.urlopen
"""
# http://bugs.python.org/issue13359 - urllib2 does not automatically quote the URL
url_quoted = urllib2.quote(url, safe='%/:=&?~#+!$,;\'@()*[]|')
return urllib2.urlopen(url_quoted, **kwargs)
+
+
+ def fetch_url_into_memory(self, url):
+ ''' Downloads a file from a url into memory instead of disk.
+
+ Args:
+ url (str): URL path where the file to be downloaded is located.
+
+ Raises:
+ IOError: When the url points to a file on disk and cannot be found
+ FetchedIncorrectFilesize: When the size of the fetched file does not match the
+ expected file size.
+ ValueError: When the scheme of a url is not what is expected.
+
+ Returns:
+ BytesIO: contents of url
+ '''
+ self.info('Fetch {} into memory'.format(url))
+ parsed_url = urlparse.urlparse(url)
+
+ if parsed_url.scheme in ('', 'file'):
+ if not os.path.isfile(url):
+ raise IOError('Could not find file to extract: {}'.format(url))
+
+ expected_file_size = os.stat(url.replace('file://', '')).st_size
+
+            # In case we're referencing a file without file://
+ if parsed_url.scheme == '':
+ url = 'file://%s' % os.path.abspath(url)
+ parsed_url = urlparse.urlparse(url)
+
+ request = urllib2.Request(url)
+ # Exceptions to be retried:
+ # Bug 1300663 - HTTPError: HTTP Error 404: Not Found
+ # Bug 1300413 - HTTPError: HTTP Error 500: Internal Server Error
+ # Bug 1300943 - HTTPError: HTTP Error 503: Service Unavailable
+ # Bug 1300953 - URLError: <urlopen error [Errno -2] Name or service not known>
+ # Bug 1301594 - URLError: <urlopen error [Errno 10054] An existing connection was ...
+ # Bug 1301597 - URLError: <urlopen error [Errno 8] _ssl.c:504: EOF occurred in ...
+ # Bug 1301855 - URLError: <urlopen error [Errno 60] Operation timed out>
+ # Bug 1302237 - URLError: <urlopen error [Errno 104] Connection reset by peer>
+ # Bug 1301807 - BadStatusLine: ''
+ response = urllib2.urlopen(request)
+
+ if parsed_url.scheme in ('http', 'https'):
+ expected_file_size = int(response.headers.get('Content-Length'))
+
+ self.info('Expected file size: {}'.format(expected_file_size))
+ self.debug('Url: {}'.format(url))
+ self.debug('Content-Encoding {}'.format(response.headers.get('Content-Encoding')))
+
+ file_contents = response.read()
+ obtained_file_size = len(file_contents)
+
+ if obtained_file_size != expected_file_size:
+ raise FetchedIncorrectFilesize(
+ 'The expected file size is {} while we got instead {}'.format(
+ expected_file_size, obtained_file_size)
+ )
+
+ # Use BytesIO instead of StringIO
+ # http://stackoverflow.com/questions/34162017/unzip-buffer-with-python/34162395#34162395
+ return BytesIO(file_contents)
+
+
def _download_file(self, url, file_name):
""" Helper script for download_file()
Additionaly this function logs all exceptions as warnings before
re-raising them
Args:
url (str): string containing the URL with the file location
file_name (str): name of the file where the downloaded file
@@ -465,157 +534,157 @@ class ScriptMixin(PlatformMixin):
"""Filter entries of the archive based on the specified list of to extract dirs."""
filter_partial = functools.partial(fnmatch.filter, namelist)
entries = itertools.chain(*map(filter_partial, extract_dirs or ['*']))
for entry in entries:
yield entry
- def unzip(self, file_object, extract_to='.', extract_dirs='*', verbose=False):
+ def unzip(self, compressed_file, extract_to, extract_dirs='*', verbose=False):
"""This method allows to extract a zip file without writing to disk first.
Args:
- file_object (object): Any file like object that is seekable.
- extract_to (str, optional): where to extract the compressed file.
+ compressed_file (object): File-like object with the contents of a compressed zip file.
+ extract_to (str): where to extract the compressed file.
extract_dirs (list, optional): directories inside the archive file to extract.
Defaults to '*'.
+ verbose (bool, optional): whether or not extracted content should be displayed.
+ Defaults to False.
+
+ Raises:
+            zipfile.BadZipfile: on contents of zipfile being invalid
"""
- compressed_file = StringIO(file_object.read())
- try:
- with zipfile.ZipFile(compressed_file) as bundle:
- entries = self._filter_entries(bundle.namelist(), extract_dirs)
+ with zipfile.ZipFile(compressed_file) as bundle:
+ entries = self._filter_entries(bundle.namelist(), extract_dirs)
- for entry in entries:
- if verbose:
- self.info(' {}'.format(entry))
- bundle.extract(entry, path=extract_to)
+ for entry in entries:
+ if verbose:
+ self.info(' {}'.format(entry))
- # ZipFile doesn't preserve permissions during extraction:
- # http://bugs.python.org/issue15795
- fname = os.path.realpath(os.path.join(extract_to, entry))
+ # Exception to be retried:
+ # Bug 1301645 - BadZipfile: Bad CRC-32 for file ...
+ # http://stackoverflow.com/questions/5624669/strange-badzipfile-bad-crc-32-problem/5626098#5626098
+ # Bug 1301802 - error: Error -3 while decompressing: invalid stored block lengths
+ bundle.extract(entry, path=extract_to)
+
+ # ZipFile doesn't preserve permissions during extraction:
+ # http://bugs.python.org/issue15795
+ fname = os.path.realpath(os.path.join(extract_to, entry))
+ try:
+ # getinfo() can raise KeyError
mode = bundle.getinfo(entry).external_attr >> 16 & 0x1FF
# Only set permissions if attributes are available. Otherwise all
# permissions will be removed eg. on Windows.
if mode:
os.chmod(fname, mode)
- except zipfile.BadZipfile as e:
- self.exception('{}'.format(e.message))
+ except KeyError:
+ self.warning('{} was not found in the zip file'.format(entry))
- def deflate(self, file_object, mode, extract_to='.', extract_dirs='*', verbose=False):
- """This method allows to extract a tar, tar.bz2 and tar.gz file without writing to disk first.
+ def deflate(self, compressed_file, mode, extract_to='.', *args, **kwargs):
+        """This method allows to extract a compressed file from tar, tar.bz2 and tar.gz files.
Args:
- file_object (object): Any file like object that is seekable.
+ compressed_file (object): File-like object with the contents of a compressed file.
+ mode (str): string of the form 'filemode[:compression]' (e.g. 'r:gz' or 'r:bz2')
extract_to (str, optional): where to extract the compressed file.
- extract_dirs (list, optional): directories inside the archive file to extract.
- Defaults to `*`.
- verbose (bool, optional): whether or not extracted content should be displayed.
- Defaults to False.
"""
- compressed_file = StringIO(file_object.read())
t = tarfile.open(fileobj=compressed_file, mode=mode)
t.extractall(path=extract_to)
def download_unpack(self, url, extract_to='.', extract_dirs='*', verbose=False):
"""Generic method to download and extract a compressed file without writing it to disk first.
Args:
url (str): URL where the file to be downloaded is located.
extract_to (str, optional): directory where the downloaded file will
be extracted to.
extract_dirs (list, optional): directories inside the archive to extract.
Defaults to `*`. It currently only applies to zip files.
-
- Raises:
- IOError: on `filename` file not found.
+ verbose (bool, optional): whether or not extracted content should be displayed.
+ Defaults to False.
"""
- # Many scripts overwrite this method and set extract_dirs to None
- extract_dirs = '*' if extract_dirs is None else extract_dirs
- EXTENSION_TO_MIMETYPE = {
- 'bz2': 'application/x-bzip2',
- 'gz': 'application/x-gzip',
- 'tar': 'application/x-tar',
- 'zip': 'application/zip',
- }
- MIMETYPES = {
- 'application/x-bzip2': {
- 'function': self.deflate,
- 'kwargs': {'mode': 'r:bz2'},
- },
- 'application/x-gzip': {
- 'function': self.deflate,
- 'kwargs': {'mode': 'r:gz'},
- },
- 'application/x-tar': {
- 'function': self.deflate,
- 'kwargs': {'mode': 'r'},
- },
- 'application/zip': {
- 'function': self.unzip,
- },
- }
+ def _determine_extraction_method_and_kwargs(url):
+ EXTENSION_TO_MIMETYPE = {
+ 'bz2': 'application/x-bzip2',
+ 'gz': 'application/x-gzip',
+ 'tar': 'application/x-tar',
+ 'zip': 'application/zip',
+ }
+ MIMETYPES = {
+ 'application/x-bzip2': {
+ 'function': self.deflate,
+ 'kwargs': {'mode': 'r:bz2'},
+ },
+ 'application/x-gzip': {
+ 'function': self.deflate,
+ 'kwargs': {'mode': 'r:gz'},
+ },
+ 'application/x-tar': {
+ 'function': self.deflate,
+ 'kwargs': {'mode': 'r'},
+ },
+ 'application/zip': {
+ 'function': self.unzip,
+ },
+ }
- parsed_url = urlparse.urlparse(url)
-
- # In case we're referrencing a file without file://
- if parsed_url.scheme == '':
- if not os.path.isfile(url):
- raise IOError('Could not find file to extract: {}'.format(url))
-
- url = 'file://%s' % os.path.abspath(url)
- parsed_fd = urlparse.urlparse(url)
-
- request = urllib2.Request(url)
- response = urllib2.urlopen(request)
-
- if parsed_url.scheme == 'file':
filename = url.split('/')[-1]
# XXX: bz2/gz instead of tar.{bz2/gz}
extension = filename[filename.rfind('.')+1:]
mimetype = EXTENSION_TO_MIMETYPE[extension]
- else:
- mimetype = response.headers.type
-
- self.debug('Url: {}'.format(url))
- self.debug('Mimetype: {}'.format(mimetype))
- self.debug('Content-Encoding {}'.format(response.headers.get('Content-Encoding')))
+ self.debug('Mimetype: {}'.format(mimetype))
- function = MIMETYPES[mimetype]['function']
- kwargs = {
- 'file_object': response,
- 'extract_to': extract_to,
- 'extract_dirs': extract_dirs,
- 'verbose': verbose,
- }
- kwargs.update(MIMETYPES[mimetype].get('kwargs', {}))
+ function = MIMETYPES[mimetype]['function']
+ kwargs = {
+ 'compressed_file': compressed_file,
+ 'extract_to': extract_to,
+ 'extract_dirs': extract_dirs,
+ 'verbose': verbose,
+ }
+ kwargs.update(MIMETYPES[mimetype].get('kwargs', {}))
+ return function, kwargs
+
+ # Many scripts overwrite this method and set extract_dirs to None
+ extract_dirs = '*' if extract_dirs is None else extract_dirs
self.info('Downloading and extracting to {} these dirs {} from {}'.format(
extract_to,
', '.join(extract_dirs),
url,
))
+
+ # 1) Let's fetch the file
retry_args = dict(
- failure_status=None,
- retry_exceptions=(urllib2.HTTPError, urllib2.URLError,
- httplib.BadStatusLine,
- socket.timeout, socket.error),
+ retry_exceptions=(
+ urllib2.HTTPError,
+ urllib2.URLError,
+ httplib.BadStatusLine,
+ socket.timeout,
+ socket.error,
+ FetchedIncorrectFilesize,
+ ),
error_message="Can't download from {}".format(url),
error_level=FATAL,
)
- self.retry(
- function,
- kwargs=kwargs,
+ compressed_file = self.retry(
+ self.fetch_url_into_memory,
+ kwargs={'url': url},
**retry_args
)
+        # 2) We're guaranteed to have downloaded the file with error_level=FATAL
+        # Let's unpack the file
+
+ function, kwargs = _determine_extraction_method_and_kwargs(url)
+ function(**kwargs)
+
def load_json_url(self, url, error_level=None, *args, **kwargs):
""" Returns a json object from a url (it retries). """
contents = self._retry_download(
url=url, error_level=error_level, *args, **kwargs
)
return json.loads(contents.read())