#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2016 Loreto Parisi
# Copyright (C) 2016 Silvio Olivastri
# Copyright (C) 2016 Radim Rehurek

"""This script allows converting word-vectors from word2vec format into Tensorflow 2D tensor and metadata format.
This script is used for word-vector visualization on `Embedding Visualization <http://projector.tensorflow.org/>`_.

How to use
----------
#. Convert your word-vector with this script (for example, we'll use model from ``gensim-data``) ::

    python -m gensim.downloader -d glove-wiki-gigaword-50  # download model in word2vec format
    python -m gensim.scripts.word2vec2tensor -i ~/gensim-data/glove-wiki-gigaword-50/glove-wiki-gigaword-50.gz \
                                             -o /tmp/my_model_prefix

#. Open http://projector.tensorflow.org/
#. Click "Load Data" button from the left menu.
#. Select "Choose file" in "Load a TSV file of vectors." and choose "/tmp/my_model_prefix_tensor.tsv" file.
#. Select "Choose file" in "Load a TSV file of metadata." and choose "/tmp/my_model_prefix_metadata.tsv" file.
#. ???
#. PROFIT!

For more information about TensorBoard TSV format please visit:
https://www.tensorflow.org/versions/master/how_tos/embedding_viz/

Command line arguments
----------------------

.. program-output:: python -m gensim.scripts.word2vec2tensor --help
   :ellipsis: 0, -7

"""

import os
import sys
import logging
import argparse

import gensim

logger = logging.getLogger(__name__)


def word2vec2tensor(word2vec_model_path, tensor_filename, binary=False):
    """Convert a file in Word2Vec format into two TSV files for the embedding projector.

    File "tensor_filename"_tensor.tsv contains the word-vectors (one tab-separated row per word),
    "tensor_filename"_metadata.tsv contains the words (one per line, in the same order).
    Both files are written as UTF-8 encoded bytes.

    Parameters
    ----------
    word2vec_model_path : str
        Path to file in Word2Vec format.
    tensor_filename : str
        Prefix for output files.
    binary : bool, optional
        True if input file in binary format.

    """
    model = gensim.models.KeyedVectors.load_word2vec_format(word2vec_model_path, binary=binary)
    outfiletsv = tensor_filename + '_tensor.tsv'
    outfiletsvmeta = tensor_filename + '_metadata.tsv'

    # `to_utf8` yields bytes, so both files must be opened in binary mode: writing bytes
    # to a text-mode handle raises TypeError on Python 3.
    with open(outfiletsv, 'wb') as file_vector, open(outfiletsvmeta, 'wb') as file_metadata:
        for word in model.index2word:
            file_metadata.write(gensim.utils.to_utf8(word) + gensim.utils.to_utf8('\n'))
            vector_row = '\t'.join(str(x) for x in model[word])
            file_vector.write(gensim.utils.to_utf8(vector_row) + gensim.utils.to_utf8('\n'))

    logger.info("2D tensor file saved to %s", outfiletsv)
    logger.info("Tensor metadata file saved to %s", outfiletsvmeta)


if __name__ == "__main__":
    logging.basicConfig(format='%(asctime)s - %(module)s - %(levelname)s - %(message)s', level=logging.INFO)

    # The trailing "Command line arguments" section of the module docstring is a Sphinx
    # directive, not help text. Cut at the heading instead of using a hard-coded character
    # count, so that editing the docstring cannot silently corrupt the --help output.
    description = (__doc__ or '').partition('Command line arguments')[0].rstrip()

    parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, description=description)
    parser.add_argument("-i", "--input", required=True, help="Path to input file in word2vec format")
    parser.add_argument("-o", "--output", required=True, help="Prefix path for output files")
    parser.add_argument(
        "-b", "--binary", action='store_const', const=True, default=False,
        help="Set this flag if word2vec model in binary format (default: %(default)s)"
    )
    args = parser.parse_args()

    logger.info("running %s", ' '.join(sys.argv))
    word2vec2tensor(args.input, args.output, args.binary)
    logger.info("finished running %s", os.path.basename(sys.argv[0]))