import os import shutil import tempfile import warnings import numpy from pickle import loads from pickle import dumps from sklearn.datasets import get_data_home from sklearn.datasets import clear_data_home from sklearn.datasets import load_files from sklearn.datasets import load_sample_images from sklearn.datasets import load_sample_image from sklearn.datasets import load_digits from sklearn.datasets import load_diabetes from sklearn.datasets import load_linnerud from sklearn.datasets import load_iris from sklearn.datasets import load_breast_cancer from sklearn.datasets import load_boston from sklearn.datasets import load_wine from sklearn.datasets.base import Bunch from sklearn.externals.six import b, u from sklearn.utils.testing import assert_false from sklearn.utils.testing import assert_true from sklearn.utils.testing import assert_equal from sklearn.utils.testing import assert_raises from sklearn.utils.testing import assert_array_equal from sklearn.utils.testing import with_setup DATA_HOME = tempfile.mkdtemp(prefix="scikit_learn_data_home_test_") LOAD_FILES_ROOT = tempfile.mkdtemp(prefix="scikit_learn_load_files_test_") TEST_CATEGORY_DIR1 = "" TEST_CATEGORY_DIR2 = "" def _remove_dir(path): if os.path.isdir(path): shutil.rmtree(path) def teardown_module(): """Test fixture (clean up) run once after all tests of this module""" for path in [DATA_HOME, LOAD_FILES_ROOT]: _remove_dir(path) def setup_load_files(): global TEST_CATEGORY_DIR1 global TEST_CATEGORY_DIR2 TEST_CATEGORY_DIR1 = tempfile.mkdtemp(dir=LOAD_FILES_ROOT) TEST_CATEGORY_DIR2 = tempfile.mkdtemp(dir=LOAD_FILES_ROOT) sample_file = tempfile.NamedTemporaryFile(dir=TEST_CATEGORY_DIR1, delete=False) sample_file.write(b("Hello World!\n")) sample_file.close() def teardown_load_files(): _remove_dir(TEST_CATEGORY_DIR1) _remove_dir(TEST_CATEGORY_DIR2) def test_data_home(): # get_data_home will point to a pre-existing folder data_home = get_data_home(data_home=DATA_HOME) assert_equal(data_home, DATA_HOME) assert_true(os.path.exists(data_home)) # clear_data_home will delete both the content and the folder it-self clear_data_home(data_home=data_home) assert_false(os.path.exists(data_home)) # if the folder is missing it will be created again data_home = get_data_home(data_home=DATA_HOME) assert_true(os.path.exists(data_home)) def test_default_empty_load_files(): res = load_files(LOAD_FILES_ROOT) assert_equal(len(res.filenames), 0) assert_equal(len(res.target_names), 0) assert_equal(res.DESCR, None) @with_setup(setup_load_files, teardown_load_files) def test_default_load_files(): res = load_files(LOAD_FILES_ROOT) assert_equal(len(res.filenames), 1) assert_equal(len(res.target_names), 2) assert_equal(res.DESCR, None) assert_equal(res.data, [b("Hello World!\n")]) @with_setup(setup_load_files, teardown_load_files) def test_load_files_w_categories_desc_and_encoding(): category = os.path.abspath(TEST_CATEGORY_DIR1).split('/').pop() res = load_files(LOAD_FILES_ROOT, description="test", categories=category, encoding="utf-8") assert_equal(len(res.filenames), 1) assert_equal(len(res.target_names), 1) assert_equal(res.DESCR, "test") assert_equal(res.data, [u("Hello World!\n")]) @with_setup(setup_load_files, teardown_load_files) def test_load_files_wo_load_content(): res = load_files(LOAD_FILES_ROOT, load_content=False) assert_equal(len(res.filenames), 1) assert_equal(len(res.target_names), 2) assert_equal(res.DESCR, None) assert_equal(res.get('data'), None) def test_load_sample_images(): try: res = load_sample_images() assert_equal(len(res.images), 2) assert_equal(len(res.filenames), 2) assert_true(res.DESCR) except ImportError: warnings.warn("Could not load sample images, PIL is not available.") def test_load_digits(): digits = load_digits() assert_equal(digits.data.shape, (1797, 64)) assert_equal(numpy.unique(digits.target).size, 10) # test return_X_y option X_y_tuple = load_digits(return_X_y=True) bunch = load_digits() assert_true(isinstance(X_y_tuple, tuple)) assert_array_equal(X_y_tuple[0], bunch.data) assert_array_equal(X_y_tuple[1], bunch.target) def test_load_digits_n_class_lt_10(): digits = load_digits(9) assert_equal(digits.data.shape, (1617, 64)) assert_equal(numpy.unique(digits.target).size, 9) def test_load_sample_image(): try: china = load_sample_image('china.jpg') assert_equal(china.dtype, 'uint8') assert_equal(china.shape, (427, 640, 3)) except ImportError: warnings.warn("Could not load sample images, PIL is not available.") def test_load_missing_sample_image_error(): have_PIL = True try: try: from scipy.misc import imread except ImportError: from scipy.misc.pilutil import imread # noqa except ImportError: have_PIL = False if have_PIL: assert_raises(AttributeError, load_sample_image, 'blop.jpg') else: warnings.warn("Could not load sample images, PIL is not available.") def test_load_diabetes(): res = load_diabetes() assert_equal(res.data.shape, (442, 10)) assert_true(res.target.size, 442) assert_equal(len(res.feature_names), 10) assert_true(res.DESCR) # test return_X_y option X_y_tuple = load_diabetes(return_X_y=True) bunch = load_diabetes() assert_true(isinstance(X_y_tuple, tuple)) assert_array_equal(X_y_tuple[0], bunch.data) assert_array_equal(X_y_tuple[1], bunch.target) def test_load_linnerud(): res = load_linnerud() assert_equal(res.data.shape, (20, 3)) assert_equal(res.target.shape, (20, 3)) assert_equal(len(res.target_names), 3) assert_true(res.DESCR) # test return_X_y option X_y_tuple = load_linnerud(return_X_y=True) bunch = load_linnerud() assert_true(isinstance(X_y_tuple, tuple)) assert_array_equal(X_y_tuple[0], bunch.data) assert_array_equal(X_y_tuple[1], bunch.target) def test_load_iris(): res = load_iris() assert_equal(res.data.shape, (150, 4)) assert_equal(res.target.size, 150) assert_equal(res.target_names.size, 3) assert_true(res.DESCR) # test return_X_y option X_y_tuple = load_iris(return_X_y=True) bunch = load_iris() assert_true(isinstance(X_y_tuple, tuple)) assert_array_equal(X_y_tuple[0], bunch.data) assert_array_equal(X_y_tuple[1], bunch.target) def test_load_wine(): res = load_wine() assert_equal(res.data.shape, (178, 13)) assert_equal(res.target.size, 178) assert_equal(res.target_names.size, 3) assert_true(res.DESCR) # test return_X_y option X_y_tuple = load_wine(return_X_y=True) bunch = load_wine() assert_true(isinstance(X_y_tuple, tuple)) assert_array_equal(X_y_tuple[0], bunch.data) assert_array_equal(X_y_tuple[1], bunch.target) def test_load_breast_cancer(): res = load_breast_cancer() assert_equal(res.data.shape, (569, 30)) assert_equal(res.target.size, 569) assert_equal(res.target_names.size, 2) assert_true(res.DESCR) # test return_X_y option X_y_tuple = load_breast_cancer(return_X_y=True) bunch = load_breast_cancer() assert_true(isinstance(X_y_tuple, tuple)) assert_array_equal(X_y_tuple[0], bunch.data) assert_array_equal(X_y_tuple[1], bunch.target) def test_load_boston(): res = load_boston() assert_equal(res.data.shape, (506, 13)) assert_equal(res.target.size, 506) assert_equal(res.feature_names.size, 13) assert_true(res.DESCR) # test return_X_y option X_y_tuple = load_boston(return_X_y=True) bunch = load_boston() assert_true(isinstance(X_y_tuple, tuple)) assert_array_equal(X_y_tuple[0], bunch.data) assert_array_equal(X_y_tuple[1], bunch.target) def test_loads_dumps_bunch(): bunch = Bunch(x="x") bunch_from_pkl = loads(dumps(bunch)) bunch_from_pkl.x = "y" assert_equal(bunch_from_pkl['x'], bunch_from_pkl.x) def test_bunch_pickle_generated_with_0_16_and_read_with_0_17(): bunch = Bunch(key='original') # This reproduces a problem when Bunch pickles have been created # with scikit-learn 0.16 and are read with 0.17. Basically there # is a suprising behaviour because reading bunch.key uses # bunch.__dict__ (which is non empty for 0.16 Bunch objects) # whereas assigning into bunch.key uses bunch.__setattr__. See # https://github.com/scikit-learn/scikit-learn/issues/6196 for # more details bunch.__dict__['key'] = 'set from __dict__' bunch_from_pkl = loads(dumps(bunch)) # After loading from pickle the __dict__ should have been ignored assert_equal(bunch_from_pkl.key, 'original') assert_equal(bunch_from_pkl['key'], 'original') # Making sure that changing the attr does change the value # associated with __getitem__ as well bunch_from_pkl.key = 'changed' assert_equal(bunch_from_pkl.key, 'changed') assert_equal(bunch_from_pkl['key'], 'changed') def test_bunch_dir(): # check that dir (important for autocomplete) shows attributes data = load_iris() assert_true("data" in dir(data))