You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 
 

457 lines
14 KiB

"""
This module is an API for downloading, getting information and loading datasets/models.
Give information about available models/datasets:
>>> import gensim.downloader as api
>>>
>>> api.info() # return dict with info about available models/datasets
>>> api.info("text8") # return dict with info about "text8" dataset
Model example:
>>> import gensim.downloader as api
>>>
>>> model = api.load("glove-twitter-25") # load glove vectors
>>> model.most_similar("cat") # show words that similar to word 'cat'
Dataset example:
>>> import gensim.downloader as api
>>> from gensim.models import Word2Vec
>>>
>>> dataset = api.load("text8") # load dataset as iterable
>>> model = Word2Vec(dataset) # train w2v model
Also, this API available via CLI::
python -m gensim.downloader --info <dataname> # same as api.info(dataname)
python -m gensim.downloader --info name # same as api.info(name_only=True)
python -m gensim.downloader --download <dataname> # same as api.load(dataname, return_path=True)
"""
from __future__ import absolute_import
import argparse
import os
import json
import logging
import sys
import errno
import hashlib
import math
import shutil
import tempfile
from functools import partial
if sys.version_info[0] == 2:
import urllib
from urllib2 import urlopen
else:
import urllib.request as urllib
from urllib.request import urlopen
user_dir = os.path.expanduser('~')
base_dir = os.path.join(user_dir, 'gensim-data')
logger = logging.getLogger('gensim.api')
DATA_LIST_URL = "https://raw.githubusercontent.com/RaRe-Technologies/gensim-data/master/list.json"
DOWNLOAD_BASE_URL = "https://github.com/RaRe-Technologies/gensim-data/releases/download"
def _progress(chunks_downloaded, chunk_size, total_size, part=1, total_parts=1):
"""Reporthook for :func:`urllib.urlretrieve`, code from [1]_.
Parameters
----------
chunks_downloaded : int
Number of chunks of data that have been downloaded.
chunk_size : int
Size of each chunk of data.
total_size : int
Total size of the dataset/model.
part : int, optional
Number of current part, used only if `no_parts` > 1.
total_parts : int, optional
Total number of parts.
References
----------
[1] https://gist.github.com/vladignatyev/06860ec2040cb497f0f3
"""
bar_len = 50
size_downloaded = float(chunks_downloaded * chunk_size)
filled_len = int(math.floor((bar_len * size_downloaded) / total_size))
percent_downloaded = round(((size_downloaded * 100) / total_size), 1)
bar = '=' * filled_len + '-' * (bar_len - filled_len)
if total_parts == 1:
sys.stdout.write(
'\r[%s] %s%s %s/%sMB downloaded' % (
bar, percent_downloaded, "%",
round(size_downloaded / (1024 * 1024), 1),
round(float(total_size) / (1024 * 1024), 1))
)
sys.stdout.flush()
else:
sys.stdout.write(
'\r Part %s/%s [%s] %s%s %s/%sMB downloaded' % (
part + 1, total_parts, bar, percent_downloaded, "%",
round(size_downloaded / (1024 * 1024), 1),
round(float(total_size) / (1024 * 1024), 1))
)
sys.stdout.flush()
def _create_base_dir():
"""Create the gensim-data directory in home directory, if it has not been already created.
Raises
------
Exception
An exception is raised when read/write permissions are not available or a file named gensim-data
already exists in the home directory.
"""
if not os.path.isdir(base_dir):
try:
logger.info("Creating %s", base_dir)
os.makedirs(base_dir)
except OSError as e:
if e.errno == errno.EEXIST:
raise Exception(
"Not able to create folder gensim-data in {}. File gensim-data "
"exists in the direcory already.".format(user_dir)
)
else:
raise Exception(
"Can't create {}. Make sure you have the read/write permissions "
"to the directory or you can try creating the folder manually"
.format(base_dir)
)
def _calculate_md5_checksum(fname):
"""Calculate the checksum of the file, exactly same as md5-sum linux util.
Parameters
----------
fname : str
Path to the file.
Returns
-------
str
MD5-hash of file names as `fname`.
"""
hash_md5 = hashlib.md5()
with open(fname, "rb") as f:
for chunk in iter(lambda: f.read(4096), b""):
hash_md5.update(chunk)
return hash_md5.hexdigest()
def info(name=None, show_only_latest=True, name_only=False):
"""Provide the information related to model/dataset.
Parameters
----------
name : str, optional
Name of model/dataset. If not set - shows all available data.
show_only_latest : bool, optional
If storage contains different versions for one data/model, this flag allow to hide outdated versions.
Affects only if `name` is None.
name_only : bool, optional
If True, will return only the names of available models and corpora.
Returns
-------
dict
Detailed information about one or all models/datasets.
If name is specified, return full information about concrete dataset/model,
otherwise, return information about all available datasets/models.
Raises
------
Exception
If name that has been passed is incorrect.
Examples
--------
>>> import gensim.downloader as api
>>> api.info("text8") # retrieve information about text8 dataset
{u'checksum': u'68799af40b6bda07dfa47a32612e5364',
u'description': u'Cleaned small sample from wikipedia',
u'file_name': u'text8.gz',
u'parts': 1,
u'source': u'http://mattmahoney.net/dc/text8.zip'}
>>>
>>> api.info() # retrieve information about all available datasets and models
"""
information = json.loads(urlopen(DATA_LIST_URL).read().decode("utf-8"))
if name is not None:
corpora = information['corpora']
models = information['models']
if name in corpora:
return information['corpora'][name]
elif name in models:
return information['models'][name]
else:
raise ValueError("Incorrect model/corpus name")
if not show_only_latest:
return information
if name_only:
return {"corpora": list(information['corpora'].keys()), "models": list(information['models'])}
return {
"corpora": {name: data for (name, data) in information['corpora'].items() if data.get("latest", True)},
"models": {name: data for (name, data) in information['models'].items() if data.get("latest", True)}
}
def _get_checksum(name, part=None):
"""Retrieve the checksum of the model/dataset from gensim-data repository.
Parameters
----------
name : str
Dataset/model name.
part : int, optional
Number of part (for multipart data only).
Returns
-------
str
Retrieved checksum of dataset/model.
"""
information = info()
corpora = information['corpora']
models = information['models']
if part is None:
if name in corpora:
return information['corpora'][name]["checksum"]
elif name in models:
return information['models'][name]["checksum"]
else:
if name in corpora:
return information['corpora'][name]["checksum-{}".format(part)]
elif name in models:
return information['models'][name]["checksum-{}".format(part)]
def _get_parts(name):
"""Retrieve the number of parts in which dataset/model has been split.
Parameters
----------
name: str
Dataset/model name.
Returns
-------
int
Number of parts in which dataset/model has been split.
"""
information = info()
corpora = information['corpora']
models = information['models']
if name in corpora:
return information['corpora'][name]["parts"]
elif name in models:
return information['models'][name]["parts"]
def _download(name):
"""Download and extract the dataset/model.
Parameters
----------
name: str
Dataset/model name which has to be downloaded.
Raises
------
Exception
If md5sum on client and in repo are different.
"""
url_load_file = "{base}/{fname}/__init__.py".format(base=DOWNLOAD_BASE_URL, fname=name)
data_folder_dir = os.path.join(base_dir, name)
data_folder_dir_tmp = data_folder_dir + '_tmp'
tmp_dir = tempfile.mkdtemp()
init_path = os.path.join(tmp_dir, "__init__.py")
urllib.urlretrieve(url_load_file, init_path)
total_parts = _get_parts(name)
if total_parts > 1:
concatenated_folder_name = "{fname}.gz".format(fname=name)
concatenated_folder_dir = os.path.join(tmp_dir, concatenated_folder_name)
for part in range(0, total_parts):
url_data = "{base}/{fname}/{fname}.gz_0{part}".format(base=DOWNLOAD_BASE_URL, fname=name, part=part)
fname = "{f}.gz_0{p}".format(f=name, p=part)
dst_path = os.path.join(tmp_dir, fname)
urllib.urlretrieve(
url_data, dst_path,
reporthook=partial(_progress, part=part, total_parts=total_parts)
)
if _calculate_md5_checksum(dst_path) == _get_checksum(name, part):
sys.stdout.write("\n")
sys.stdout.flush()
logger.info("Part %s/%s downloaded", part + 1, total_parts)
else:
shutil.rmtree(tmp_dir)
raise Exception("Checksum comparison failed, try again")
with open(concatenated_folder_dir, 'wb') as wfp:
for part in range(0, total_parts):
part_path = os.path.join(tmp_dir, "{fname}.gz_0{part}".format(fname=name, part=part))
with open(part_path, "rb") as rfp:
shutil.copyfileobj(rfp, wfp)
os.remove(part_path)
else:
url_data = "{base}/{fname}/{fname}.gz".format(base=DOWNLOAD_BASE_URL, fname=name)
fname = "{fname}.gz".format(fname=name)
dst_path = os.path.join(tmp_dir, fname)
urllib.urlretrieve(url_data, dst_path, reporthook=_progress)
if _calculate_md5_checksum(dst_path) == _get_checksum(name):
sys.stdout.write("\n")
sys.stdout.flush()
logger.info("%s downloaded", name)
else:
shutil.rmtree(tmp_dir)
raise Exception("Checksum comparison failed, try again")
if os.path.exists(data_folder_dir_tmp):
os.remove(data_folder_dir_tmp)
shutil.move(tmp_dir, data_folder_dir_tmp)
os.rename(data_folder_dir_tmp, data_folder_dir)
def _get_filename(name):
"""Retrieve the filename of the dataset/model.
Parameters
----------
name: str
Name of dataset/model.
Returns
-------
str:
Filename of the dataset/model.
"""
information = info()
corpora = information['corpora']
models = information['models']
if name in corpora:
return information['corpora'][name]["file_name"]
elif name in models:
return information['models'][name]["file_name"]
def load(name, return_path=False):
"""Download (if needed) dataset/model and load it to memory (unless `return_path` is set).
Parameters
----------
name: str
Name of the model/dataset.
return_path: bool, optional
If True, return full path to file, otherwise, return loaded model / iterable dataset.
Returns
-------
Model
Requested model, if `name` is model and `return_path` == False.
Dataset (iterable)
Requested dataset, if `name` is dataset and `return_path` == False.
str
Path to file with dataset / model, only when `return_path` == True.
Raises
------
Exception
Raised if `name` is incorrect.
Examples
--------
Model example:
>>> import gensim.downloader as api
>>>
>>> model = api.load("glove-twitter-25") # load glove vectors
>>> model.most_similar("cat") # show words that similar to word 'cat'
Dataset example:
>>> import gensim.downloader as api
>>>
>>> wiki = api.load("wiki-en") # load extracted Wikipedia dump, around 6 Gb
>>> for article in wiki: # iterate over all wiki script
>>> ...
Download only example
>>> import gensim.downloader as api
>>>
>>> print(api.load("wiki-en", return_path=True)) # output: /home/user/gensim-data/wiki-en/wiki-en.gz
"""
_create_base_dir()
file_name = _get_filename(name)
if file_name is None:
raise ValueError("Incorrect model/corpus name")
folder_dir = os.path.join(base_dir, name)
path = os.path.join(folder_dir, file_name)
if not os.path.exists(folder_dir):
_download(name)
if return_path:
return path
else:
sys.path.insert(0, base_dir)
module = __import__(name)
return module.load_data()
if __name__ == '__main__':
logging.basicConfig(
format='%(asctime)s : %(name)s : %(levelname)s : %(message)s', stream=sys.stdout, level=logging.INFO
)
parser = argparse.ArgumentParser(
description="Gensim console API",
usage="python -m gensim.api.downloader [-h] [-d data_name | -i data_name | -c]"
)
group = parser.add_mutually_exclusive_group()
group.add_argument(
"-d", "--download", metavar="data_name", nargs=1,
help="To download a corpus/model : python -m gensim.downloader -d <dataname>"
)
full_information = 1
group.add_argument(
"-i", "--info", metavar="data_name", nargs='?', const=full_information,
help="To get information about a corpus/model : python -m gensim.downloader -i <dataname>"
)
args = parser.parse_args()
if args.download is not None:
data_path = load(args.download[0], return_path=True)
logger.info("Data has been installed and data path is %s", data_path)
elif args.info is not None:
if args.info == 'name':
print(json.dumps(info(name_only=True), indent=4))
else:
output = info() if (args.info == full_information) else info(name=args.info)
print(json.dumps(output, indent=4))