Source code for pysap.data

# -*- coding: utf-8 -*-
##########################################################################
# pySAP - Copyright (C) CEA, 2017 - 2018
# Distributed under the terms of the CeCILL-B license, as published by
# the CEA-CNRS-INRIA. Refer to the LICENSE file or to
# http://www.cecill.info/licences/Licence_CeCILL-B_V1-en.html
# for details.
##########################################################################

"""
A module that provides the utility functions to download toy datasets.
"""

# System import
import os
import sys
import copy
from urllib.request import FancyURLopener
from urllib.request import urlopen
from urllib.request import urlparse
from urllib.request import HTTPError
import urllib
import time
import shutil
import numpy

import hashlib

# Package import
import pysap
from pysap.base.exceptions import Exception


# Global parameters
# Registry of the available sample datasets, indexed by dataset name.
# Each entry holds:
#   - "url": where the file is fetched from;
#   - "md5sum": the expected MD5 hex digest of the file, or None to skip the
#     checksum verification;
#   - any other key (e.g. "dtype", "image_field"): forwarded as a keyword
#     argument to 'pysap.io.load' by 'get_sample_data'.
SAMPLE_DATA_FILES = {
    "dict-learn-dataset": {
        "url": ("ftp://ftp.cea.fr/pub/unati/nsap/pysap/datasets/"
                "training_database.npy"),
        "md5sum": "4fa7669901cfeef410429be8640b594a"
    },
    "3d-pmri": {
        "url": ("ftp://ftp.cea.fr/pub/unati/nsap/pysap/datasets/"
                "orange_phantom_3d_pmri_images.npy"),
        "md5sum": "e4ac268fde0226c6fdcf2e9b62b240f0",
        # 'numpy.complex_' was an alias of 'numpy.complex128' and has been
        # removed in NumPy 2.0: use the canonical name.
        "dtype": numpy.complex128
    },
    "2d-pmri": {
        "url": ("https://github.com/CEA-COSMIC/pysap-data/raw/"
                "master/pysap-data/CartesianRefrence2DpMRI.npy"),
        "md5sum": None,
    },
    "mri-radial-3d-samples": {
        "url": ("ftp://ftp.cea.fr/pub/unati/nsap/pysap/datasets/"
                "samples_3D_radial_spi_N256_nc1997x3073.mat"),
        "md5sum": "0324b15ed8368e20fe7315281f31b6e6",
        "image_field": "samples"
    },
    "mri-radial-samples": {
        "url": ("https://github.com/CEA-COSMIC/pysap-data/raw/"
                "master/pysap-data/samples_radial_GA_nc64_ns512.npy"),
        "md5sum": None,
    },
    "mri-nifti": {
        "url": ("ftp://ftp.cea.fr/pub/unati/nsap/pysap/datasets/"
                "t1_localizer.nii.gz"),
        "md5sum": "9617b36e5510a4783038c63241da21d4"
    },
    "mri-slice-nifti": {
        "url": ("ftp://ftp.cea.fr/pub/unati/nsap/pysap/datasets/"
                "BrainPhantom512.nii.gz"),
        "md5sum": "19983e6003ae94487d03131f4bacae2e"
    },
    "2d-mri": {
        "url": ("https://github.com/CEA-COSMIC/pysap-data/raw/"
                "master/pysap-data/example_mri_ref_image_2d.npy"),
        "md5sum": None
    },
    "cartesian-mri-mask": {
        # Served over https like the other GitHub-hosted datasets: there is
        # no checksum for this file, so avoid a clear-text transfer.
        "url": ("https://github.com/CEA-COSMIC/pysap-data/raw/"
                "master/pysap-data/example_mri_cartesian_mask_2d.npy"),
        "md5sum": None
    },
    "mri-mask": {
        "url": ("ftp://ftp.cea.fr/pub/unati/nsap/pysap/datasets/"
                "mask_BrainPhantom512.nii.gz"),
        "md5sum": "078760d89e737e69b5578d47e368c42f"
    },
    "2d-poisson-disk-mask": {
        "url": ("https://github.com/CEA-COSMIC/pysap-data/raw/"
                "master/pysap-data/2d_cartesian_poisson_disk.npy"),
        "md5sum": None
    },
    "astro-fits": {
        "url": "ftp://ftp.cea.fr/pub/unati/nsap/pysap/datasets/M31_128.fits",
        "md5sum": None
    },
    "astro-mask": {
        "url": ("ftp://ftp.cea.fr/pub/unati/nsap/pysap/datasets/"
                "mask25_sig40.fits"),
        "md5sum": "8d7fd9b4d7c2aaf407fa1331860a130f"
    },
    "astro-galaxy": {
        "url": ("ftp://ftp.cea.fr/pub/unati/nsap/pysap/datasets/"
                "example_galaxy_image.npy"),
        "md5sum": None
    },
    "astro-psf": {
        "url": ("ftp://ftp.cea.fr/pub/unati/nsap/pysap/datasets/"
                "example_psf_image.npy"),
        "md5sum": None
    },
    "astro-ngc2997": {
        "url": ("https://github.com/CEA-COSMIC/pysap-data/raw/"
                "master/pysap-data/ngc2997.fits"),
        "md5sum": None
    },
    "multiresolution": {
        "url": ("https://github.com/CEA-COSMIC/pysap-data/raw/"
                "master/pysap-data/gen.mr"),
        "md5sum": None
    },
    "eels-gst-2d-etomo": {
        "url": ("https://github.com/CEA-COSMIC/pysap-data/raw/"
                "master/pysap-data/EELS_GST_4_5_ali.tif"),
        "md5sum": None
    }
}
# Per-user directory where the sample datasets are stored:
# '~/.local/share/pysap' with '~' expanded to the user home directory.
DATADIR = os.path.join(os.path.expanduser("~"), ".local", "share", "pysap")
# Directory containing the installed 'pysap' package; substituted for the
# '{PYSAP}' placeholder of a dataset url in 'get_sample_data'.
PACKAGEDIR = os.path.dirname(pysap.__file__)


def get_sample_data(dataset_name, datadir=DATADIR, verbose=1):
    """ Get a sample dataset.

    The requested dataset is downloaded (or copied, when its url is the path
    of an existing local file) in the 'datadir' directory, optionally
    verified against its MD5 checksum, and loaded with 'pysap.io.load'.

    Parameters
    ----------
    dataset_name: str
        which sample data you want, must be defined in the 'SAMPLE_DATA_FILES'
        dictionary.
    datadir: str (optional, default DATADIR)
        the directory where the dataset file is stored; by default
        '$HOME/.local/share/pysap'.
    verbose: int (optional, default 1)
        control the verbosity level.

    Returns
    -------
    image: Image
        the loaded dataset.

    Raises
    ------
    Exception
        if the dataset name is unknown or if the checksum verification of
        the fetched file fails.
    """
    # First get the dataset description
    if dataset_name not in SAMPLE_DATA_FILES:
        raise Exception("No '{0}' sample data available - allowed sample data "
                        "are {1}.".format(dataset_name,
                                          sorted(SAMPLE_DATA_FILES)))
    # Work on a copy: the entry is consumed below and the registry must stay
    # untouched
    dataset = copy.deepcopy(SAMPLE_DATA_FILES[dataset_name])

    # Split the fetch information from the remaining keys, which are the
    # keyword arguments of the loader
    url = dataset.pop("url").format(**{"PYSAP": PACKAGEDIR})
    expected_md5 = dataset.pop("md5sum")

    # Get the resource on the web or on the local machine, honoring the
    # requested destination directory
    if os.path.isfile(url):
        path = copy_file(url, data_dir=datadir, overwrite=False,
                         verbose=verbose)
    else:
        path = download_file(url, data_dir=datadir, resume=True,
                             overwrite=False, verbose=verbose)

    # md5 check sum (skipped when no reference checksum is registered)
    if expected_md5 is not None and md5_sum_file(path) != expected_md5:
        raise Exception("File '{0}' checksum verification has "
                        "failed.".format(path))

    # Load the dataset
    return pysap.io.load(path, **dataset)


def md5_sum_file(fname):
    """ Calculates the MD5 sum of a file.

    The file is read by chunks of 8192 bytes so that large files do not need
    to fit in memory.

    Parameters
    ----------
    fname: str (mandatory)
        the path to a file

    Returns
    -------
    md5: str
        the md5 sum of the input file, as a hexadecimal string.
    """
    digest = hashlib.md5()
    # The context manager guarantees the file handle is released, even if a
    # read fails
    with open(fname, "rb") as stream:
        # 'read' returns an empty bytes object at end of file
        for chunk in iter(lambda: stream.read(8192), b""):
            digest.update(chunk)
    return digest.hexdigest()


def progress_bar(ratio, title, bar_length=20, maxsize=40):
    """ Draw a one-line text progress bar on the standard output.

    The line starts with a carriage return so that successive calls
    overwrite each other.

    Parameters
    ----------
    ratio: float (mandatory)
        the progress status (0<=ratio<1)
    title: str (mandatory)
        the text displayed after the bar, e.g. the name of the file being
        downloaded
    bar_length: int (optional)
        the size of the progress bar
    maxsize: int (optional)
        the title is left-justified and padded with spaces to this width.
    """
    percent = int(ratio * 100.)
    n_filled = int(round(bar_length * ratio))
    bar = "=" * n_filled + " " * (bar_length - n_filled)
    padded_title = title.ljust(maxsize, " ")
    sys.stdout.write("\r[{0}] {1}% {2}".format(bar, percent, padded_title))
    sys.stdout.flush()


class ResumeURLOpener(FancyURLopener):
    """Create sub-class in order to override error 206. This error means a
    partial file is being sent, which is fine in this case.
    Do nothing with this error.

    Note: This was adapted from:
    http://code.activestate.com/recipes/83208-resuming-download-of-a-file/
    """
    def __init__(self, *args, **kwargs):
        # Run the complete initialization chain: starting the lookup after
        # 'FancyURLopener' would skip its constructor and leave the 'tries',
        # 'maxtries' and 'auth_cache' attributes, used by its redirection and
        # authentication handlers, undefined.
        super(ResumeURLOpener, self).__init__(*args, **kwargs)

    def http_error_206(self, url, fp, errcode, errmsg, headers, data=None):
        """ Ignore the HTTP 206 (Partial Content) status.

        All the parameters are the ones received by the 'http_error_<code>'
        handlers of the opener; none of them is used.
        """
        pass


def download_file(url, data_dir, resume=True, overwrite=False, verbose=0):
    """ Download the requested file if needed or requested.

    The data are first written in a temporary '<name>.part' file of the
    destination directory, which is renamed to its final name once the
    transfer is complete.

    Parameters
    ----------
    url: str
        the url of the file to be downloaded.
    data_dir: str
        path of the data directory (created if necessary).
    resume: bool (optional, default True)
        if True, try to resume partially downloaded files
    overwrite: bool (optional, default False)
        if True and file already exists, delete it.
    verbose: int (optional, default 0)
        control the verbosity level.

    Returns
    -------
    str
        path to the downloaded file, located in 'data_dir'.

    Raises
    ------
    ValueError
        if the url cannot be opened.
    Exception
        if an HTTP error occurs during the download.

    Notes
    -----
    If the download is interrupted, the temporary '.part' file is left in
    place so that a later call with 'resume=True' can continue from it.

    """
    # Function-scope import: 'Request' is not imported at the module level
    from urllib.request import Request

    # Create the download directory if necessary
    os.makedirs(data_dir, exist_ok=True)

    # Determine filename using URL
    fname = os.path.basename(urlparse(url).path)

    # Generate the download file name
    download_fname = os.path.join(data_dir, fname)

    # Generate a temporary file for the download
    temp_fname = os.path.join(data_dir, fname + ".part")

    # If the file is already created remove it if the overwrite option is set
    # or return the file
    if os.path.exists(download_fname):
        if not overwrite:
            return download_fname
        os.remove(download_fname)

    # If the temporary file is already created remove it if the overwrite
    # option is set
    if overwrite and os.path.exists(temp_fname):
        os.remove(temp_fname)

    # Start a timer to evaluate the download time
    t0 = time.time()

    # Test if the dataset has been released. The probe response is closed
    # right away. 'OSError' is the base class of the urllib errors and is
    # named explicitly since 'Exception' is rebound by a package import in
    # this module.
    try:
        urlopen(url).close()
    except OSError as exc:
        raise ValueError(
            "The '{0}' dataset has not been released yet.".format(url)
        ) from exc

    # Start downloading dataset
    local_file = None
    data = None
    bytes_so_far = 0
    try:
        # Prepare the download
        if verbose > 0:
            print("Downloading data from {0}...".format(url))
        # Case 1: continue the downloading from an existing temporary file
        if resume and os.path.exists(temp_fname):
            # Download has been interrupted, we try to resume it: ask only
            # for the remainder of the file
            local_file_size = os.path.getsize(temp_fname)
            request = Request(
                url, headers={"Range": "bytes={0}-".format(local_file_size)})
            try:
                data = urlopen(request)
            except HTTPError:
                # There is a problem that may be due to resuming
                # Restart the downloading from scratch
                return download_file(url, data_dir, resume=False,
                                     overwrite=False, verbose=verbose)
            if data.getcode() == 206:
                # Partial content: the server honored the range request
                local_file = open(temp_fname, "ab")
                bytes_so_far = local_file_size
            else:
                # The range request was ignored and the whole file is being
                # sent: appending would corrupt it, so start over
                local_file = open(temp_fname, "wb")
        # Case 2: just download the file
        else:
            data = urlopen(url)
            local_file = open(temp_fname, "wb")

        # Get the total file size (None when it cannot be determined)
        try:
            content_length = data.info().get_all("Content-Length")[0]
            total_size = int(content_length.strip()) + bytes_so_far
        except (AttributeError, IndexError, TypeError, ValueError):
            if verbose > 0:
                print("Total size could not be determined.")
            total_size = None

        # Download data
        chunk_size = 8192
        title = os.path.basename(url)
        while True:
            # Read chunk
            chunk = data.read(chunk_size)
            # Stopping criterion
            if not chunk:
                break
            # Write to local file
            bytes_so_far += len(chunk)
            local_file.write(chunk)
            # Write report status and print a progress bar (stuck at 0 when
            # the total size is unknown)
            if total_size:
                ratio = float(bytes_so_far) / float(total_size)
            else:
                ratio = 0
            progress_bar(ratio, title=title)
        print()

        # Temporary file must be closed prior to the move
        local_file.close()
        shutil.move(temp_fname, download_fname)

        # Get process duration and print it
        minutes, seconds = divmod(time.time() - t0, 60)
        exit_message = ("Download was done in {0} minutes, {1: .2f} "
                        "seconds").format(int(minutes), seconds)
        if verbose > 0:
            print(exit_message)
    except HTTPError as e:
        raise Exception("{0}\nError while downloading file '{1}'. "
                        "Dataset download aborted.".format(e, fname)) from e
    finally:
        # Remote resource and temporary file must be closed
        if data is not None:
            data.close()
        if local_file is not None and not local_file.closed:
            local_file.close()

    return download_fname


def copy_file(path, data_dir, overwrite=False, verbose=0):
    """ Copy the requested file if needed or requested.

    Parameters
    ----------
    path: str
        the path to the file to be copied.
    data_dir: str
        path of the data directory (created if necessary).
    overwrite: bool (optional, default False)
        if True and file already exists, delete it.
    verbose: int (optional, default 0)
        control the verbosity level.

    Returns
    -------
    copy_file: str
        path to the copied file, located in 'data_dir'.
    """
    # Create the destination directory if necessary
    os.makedirs(data_dir, exist_ok=True)

    # Generate the copied file name
    copy_fname = os.path.join(data_dir, os.path.basename(path))

    # If the file is already created remove it if the overwrite option is set
    # or return the file
    if os.path.exists(copy_fname):
        # When the source already is the destination there is nothing to
        # copy, and removing it would destroy the data
        if not overwrite or os.path.samefile(path, copy_fname):
            return copy_fname
        os.remove(copy_fname)

    # Start a timer to evaluate the copy time
    t0 = time.time()

    # Start copying dataset
    if verbose > 0:
        print("Copying data from {0}...".format(path))
    shutil.copy2(path, copy_fname)

    # Get process duration and print it
    minutes, seconds = divmod(time.time() - t0, 60)
    exit_message = ("Copy was done in {0} minutes, {1: .2f} "
                    "seconds").format(int(minutes), seconds)
    if verbose > 0:
        print(exit_message)

    return copy_fname