112 lines
4 KiB
Python
112 lines
4 KiB
Python
|
# Copyright (c) 2010 Olivier Grisel <olivier.grisel@ensta.org>
|
||
|
# License: BSD 3 clause
|
||
|
"""Glue code to load http://mlcomp.org data as a scikit.learn dataset"""
|
||
|
|
||
|
import os
|
||
|
import numbers
|
||
|
from sklearn.datasets.base import load_files
|
||
|
from sklearn.utils import deprecated
|
||
|
|
||
|
|
||
|
def _load_document_classification(dataset_path, metadata, set_=None, **kwargs):
    """Load a 'DocumentClassification' MLComp dataset with ``load_files``.

    Parameters
    ----------
    dataset_path : str
        Folder of the dataset.

    metadata : dict
        Parsed dataset metadata; its 'description' entry (``None`` when
        absent) is forwarded to ``load_files`` as the description.

    set_ : str or None, optional (default=None)
        Name of a sub-folder of ``dataset_path`` to load from. When None,
        ``dataset_path`` itself is loaded.

    **kwargs :
        Extra keyword arguments passed through to ``load_files``.

    Returns
    -------
    The value returned by ``load_files``.
    """
    if set_ is None:
        container_path = dataset_path
    else:
        container_path = os.path.join(dataset_path, set_)
    description = metadata.get('description')
    return load_files(container_path, description, **kwargs)
|
||
|
|
||
|
|
||
|
# Registry of dataset loaders, keyed by the 'format' value declared in a
# dataset's metadata file. Each loader is called as
# ``loader(dataset_path, metadata, set_=..., **kwargs)``.
LOADERS = dict(
    DocumentClassification=_load_document_classification,
    # TODO: implement the remaining domain formats
)
|
||
|
|
||
|
|
||
|
def _find_mlcomp_dataset_by_name(mlcomp_root, name):
    """Return the folder under ``mlcomp_root`` whose metadata declares ``name``.

    An entry of ``mlcomp_root`` matches when it contains a ``metadata`` file
    with a line that, once stripped, equals ``"name: " + name``.

    Raises
    ------
    ValueError
        If no entry of ``mlcomp_root`` matches.
    """
    expected_name_line = "name: " + name
    # Sorted so that, should several folders declare the same name, the one
    # picked does not depend on the arbitrary order of os.listdir.
    for dataset in sorted(os.listdir(mlcomp_root)):
        metadata_file = os.path.join(mlcomp_root, dataset, 'metadata')
        if not os.path.exists(metadata_file):
            continue
        with open(metadata_file) as f:
            # Stop at the first match instead of scanning the remaining
            # lines and folders.
            if any(line.strip() == expected_name_line for line in f):
                return os.path.join(mlcomp_root, dataset)
    raise ValueError("Could not find dataset with metadata line: " +
                     expected_name_line)


def _read_mlcomp_metadata(metadata_file):
    """Parse a ``key: value`` metadata file into a dict of stripped strings.

    Lines without a colon are ignored; only the first colon of a line
    separates the key from the value.
    """
    metadata = dict()
    with open(metadata_file) as f:
        for line in f:
            key, separator, value = line.partition(":")
            if separator:
                metadata[key.strip()] = value.strip()
    return metadata


@deprecated("since the http://mlcomp.org/ website will shut down "
            "in March 2017, the load_mlcomp function was deprecated "
            "in version 0.19 and will be removed in 0.21.")
def load_mlcomp(name_or_id, set_="raw", mlcomp_root=None, **kwargs):
    """Load a dataset as downloaded from http://mlcomp.org

    Read more in the :ref:`User Guide <datasets>`.

    Parameters
    ----------
    name_or_id : int or str
        The integer id or the string name metadata of the MLComp
        dataset to load.

    set_ : str, optional (default="raw")
        Select the portion to load: 'train', 'test' or 'raw'.

    mlcomp_root : str or None, optional (default=None)
        The filesystem path to the root folder where MLComp datasets
        are stored. If mlcomp_root is None, the MLCOMP_DATASETS_HOME
        environment variable is looked up instead.

    **kwargs :
        Domain specific kwargs to be passed to the dataset loader.

    Returns
    -------
    data : Bunch
        Dictionary-like object, the interesting attributes are:
        'filenames', the files holding the raw data to learn, 'target', the
        classification labels (integer index), 'target_names',
        the meaning of the labels, and 'DESCR', the full description of the
        dataset.

    Raises
    ------
    ValueError
        If mlcomp_root is None and MLCOMP_DATASETS_HOME is undefined, if the
        root folder does not exist, if no dataset matches ``name_or_id``, if
        the dataset folder has no metadata file, or if no loader is
        registered for the dataset format.

    Notes
    -----
    Lookup process: an integral ``name_or_id`` is used directly as the name
    of the dataset folder under the root; any other value is treated as a
    dataset name and searched for in the ``metadata`` file of each unzipped
    archive found under the root.

    TODO: implement zip dataset loading too
    """
    if mlcomp_root is None:
        try:
            mlcomp_root = os.environ['MLCOMP_DATASETS_HOME']
        except KeyError:
            raise ValueError("MLCOMP_DATASETS_HOME env variable is undefined")

    # os.path.abspath also normalizes the path, so no separate normpath call
    # is needed.
    mlcomp_root = os.path.abspath(os.path.expanduser(mlcomp_root))

    if not os.path.exists(mlcomp_root):
        raise ValueError("Could not find folder: " + mlcomp_root)

    # dataset lookup
    if isinstance(name_or_id, numbers.Integral):
        # id lookup: the folder is named after the dataset id
        dataset_path = os.path.join(mlcomp_root, str(name_or_id))
    else:
        # assume name based lookup
        dataset_path = _find_mlcomp_dataset_by_name(mlcomp_root, name_or_id)

    # loading the dataset metadata
    metadata_file = os.path.join(dataset_path, 'metadata')
    if not os.path.exists(metadata_file):
        raise ValueError(dataset_path + ' is not a valid MLComp dataset')
    metadata = _read_mlcomp_metadata(metadata_file)

    # Dispatch on the declared format; 'unknown' has no registered loader, so
    # a metadata file without a 'format' entry ends in the error below.
    dataset_format = metadata.get('format', 'unknown')
    loader = LOADERS.get(dataset_format)
    if loader is None:
        raise ValueError("No loader implemented for format: " +
                         dataset_format)
    return loader(dataset_path, metadata, set_=set_, **kwargs)
|