Source code for chainer.dataset.download

from __future__ import print_function
import hashlib
import os
import shutil
import tempfile

import filelock
from six.moves.urllib import request


_dataset_root = os.environ.get('CHAINER_DATASET_ROOT',
                               os.path.expanduser('~/.chainer/dataset'))


[docs]def get_dataset_root():
    """Gets the path to the root directory to download and cache datasets.

    Returns:
        str: The path to the dataset root directory.

    """
    return _dataset_root


[docs]def set_dataset_root(path):
    """Sets the root directory to download and cache datasets.

    There are two ways to set the dataset root directory. One is by setting the
    environment variable ``CHAINER_DATASET_ROOT``. The other is by using this
    function. If both are specified, one specified via this function is used.
    The default dataset root is ``$HOME/.chainer/dataset``.

    Args:
        path (str): Path to the new dataset root directory.

    """
    global _dataset_root
    _dataset_root = path


def get_dataset_directory(dataset_name, create_directory=True):
    """Gets the path to the directory of given dataset.

    The generated path is just a concatenation of the global root directory
    (see :func:`set_dataset_root` for how to change it) and the dataset name.
    The dataset name can contain slashes, which are treated as path separators.

    Args:
        dataset_name (str): Name of the dataset.
        create_directory (bool): If True (default), this function also creates
            the directory at the first time. If the directory already exists,
            then this option is ignored.

    Returns:
        str: Path to the dataset directory.

    """
    path = os.path.join(_dataset_root, dataset_name)
    if create_directory:
        try:
            os.makedirs(path)
        except OSError:
            pass
    return path


[docs]def cached_download(url):
    """Downloads a file and caches it.

    It downloads a file from the URL if there is no corresponding cache. After
    the download, this function stores a cache to the directory under the
    dataset root (see :func:`set_dataset_root`). If there is already a cache
    for the given URL, it just returns the path to the cache without
    downloading the same file.

    Args:
        url (str): URL to download from.

    Returns:
        str: Path to the downloaded file.

    """
    cache_root = os.path.join(_dataset_root, '_dl_cache')
    try:
        os.makedirs(cache_root)
    except OSError:
        if not os.path.isdir(cache_root):
            raise RuntimeError('cannot create download cache directory')

    lock_path = os.path.join(cache_root, '_dl_lock')
    urlhash = hashlib.md5(url.encode('utf-8')).hexdigest()
    cache_path = os.path.join(cache_root, urlhash)

    with filelock.FileLock(lock_path):
        if os.path.exists(cache_path):
            return cache_path

    temp_root = tempfile.mkdtemp(dir=cache_root)
    try:
        temp_path = os.path.join(temp_root, 'dl')
        print('Downloading from {}...'.format(url))
        request.urlretrieve(url, temp_path)
        with filelock.FileLock(lock_path):
            shutil.move(temp_path, cache_path)
    finally:
        shutil.rmtree(temp_root)

    return cache_path


[docs]def cache_or_load_file(path, creator, loader):
    """Caches a file if it does not exist, or loads it otherwise.

    This is a utility function used in dataset loading routines. The
    ``creator`` creates the file to given path, and returns the content. If the
    file already exists, the ``loader`` is called instead, and it loads the
    file and returns the content.

    Note that the path passed to the creator is temporary one, and not same as
    the path given to this function. This function safely renames the file
    created by the creator to a given path, even if this function is called
    simultaneously by multiple threads or processes.

    Args:
        path (str): Path to save the cached file.
        creator: Function to create the file and returns the content. It takes
            a path to temporary place as the argument. Before calling the
            creator, there is no file at the temporary path.
        loader: Function to load the cached file and returns the content.

    Returns:
        It returns the returned values by the creator or the loader.

    """
    if os.path.exists(path):
        return loader(path)

    file_name = os.path.basename(path)
    temp_dir = tempfile.mkdtemp()
    temp_path = os.path.join(temp_dir, file_name)

    try:
        os.makedirs(_dataset_root)
    except OSError:
        if not os.path.isdir(_dataset_root):
            raise RuntimeError('cannot create dataset directory')

    lock_path = os.path.join(_dataset_root, '_create_lock')

    try:
        content = creator(temp_path)
        with filelock.FileLock(lock_path):
            if not os.path.exists(path):
                shutil.move(temp_path, path)
    finally:
        shutil.rmtree(temp_dir)

    return content