"""California housing dataset. The original database is available from StatLib http://lib.stat.cmu.edu/datasets/ The data contains 20,640 observations on 9 variables. This dataset contains the average house value as target variable and the following input variables (features): average income, housing average age, average rooms, average bedrooms, population, average occupation, latitude, and longitude in that order. References ---------- Pace, R. Kelley and Ronald Barry, Sparse Spatial Autoregressions, Statistics and Probability Letters, 33 (1997) 291-297. """ # Authors: Peter Prettenhofer # License: BSD 3 clause from os.path import exists from os import makedirs, remove import tarfile import numpy as np import logging from .base import get_data_home from .base import _fetch_remote from .base import _pkl_filepath from .base import RemoteFileMetadata from ..utils import Bunch from ..externals import joblib # The original data can be found at: # http://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.tgz ARCHIVE = RemoteFileMetadata( filename='cal_housing.tgz', url='https://ndownloader.figshare.com/files/5976036', checksum=('aaa5c9a6afe2225cc2aed2723682ae40' '3280c4a3695a2ddda4ffb5d8215ea681')) # Grab the module-level docstring to use as a description of the # dataset MODULE_DOCS = __doc__ logger = logging.getLogger(__name__) def fetch_california_housing(data_home=None, download_if_missing=True): """Loader for the California housing dataset from StatLib. Read more in the :ref:`User Guide `. Parameters ---------- data_home : optional, default: None Specify another download and cache folder for the datasets. By default all scikit-learn data is stored in '~/scikit_learn_data' subfolders. download_if_missing : optional, True by default If False, raise a IOError if the data is not locally available instead of trying to download the data from the source site. Returns ------- dataset : dict-like object with the following attributes: dataset.data : ndarray, shape [20640, 8] Each row corresponding to the 8 feature values in order. dataset.target : numpy array of shape (20640,) Each value corresponds to the average house value in units of 100,000. dataset.feature_names : array of length 8 Array of ordered feature names used in the dataset. dataset.DESCR : string Description of the California housing dataset. Notes ------ This dataset consists of 20,640 samples and 9 features. """ data_home = get_data_home(data_home=data_home) if not exists(data_home): makedirs(data_home) filepath = _pkl_filepath(data_home, 'cal_housing.pkz') if not exists(filepath): if not download_if_missing: raise IOError("Data not found and `download_if_missing` is False") logger.info('Downloading Cal. housing from {} to {}'.format( ARCHIVE.url, data_home)) archive_path = _fetch_remote(ARCHIVE, dirname=data_home) with tarfile.open(mode="r:gz", name=archive_path) as f: cal_housing = np.loadtxt( f.extractfile('CaliforniaHousing/cal_housing.data'), delimiter=',') # Columns are not in the same order compared to the previous # URL resource on lib.stat.cmu.edu columns_index = [8, 7, 2, 3, 4, 5, 6, 1, 0] cal_housing = cal_housing[:, columns_index] joblib.dump(cal_housing, filepath, compress=6) remove(archive_path) else: cal_housing = joblib.load(filepath) feature_names = ["MedInc", "HouseAge", "AveRooms", "AveBedrms", "Population", "AveOccup", "Latitude", "Longitude"] target, data = cal_housing[:, 0], cal_housing[:, 1:] # avg rooms = total rooms / households data[:, 2] /= data[:, 5] # avg bed rooms = total bed rooms / households data[:, 3] /= data[:, 5] # avg occupancy = population / households data[:, 5] = data[:, 4] / data[:, 5] # target in units of 100,000 target = target / 100000.0 return Bunch(data=data, target=target, feature_names=feature_names, DESCR=MODULE_DOCS)