laywerrobot/lib/python3.6/site-packages/gensim/scripts/word2vec2tensor.py
2020-08-27 21:55:39 +02:00

94 lines
3.6 KiB
Python

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2016 Loreto Parisi <loretoparisi@gmail.com>
# Copyright (C) 2016 Silvio Olivastri <silvio.olivastri@gmail.com>
# Copyright (C) 2016 Radim Rehurek <radim@rare-technologies.com>
"""This script allows converting word-vectors from word2vec format into Tensorflow 2D tensor and metadata format.
This script used for for word-vector visualization on `Embedding Visualization <http://projector.tensorflow.org/>`_.
How to use
----------
#. Convert your word-vector with this script (for example, we'll use model from
`gensim-data <https://rare-technologies.com/new-download-api-for-pretrained-nlp-models-and-datasets-in-gensim/>`_) ::
python -m gensim.downloader -d glove-wiki-gigaword-50 # download model in word2vec format
python -m gensim.scripts.word2vec2tensor -i ~/gensim-data/glove-wiki-gigaword-50/glove-wiki-gigaword-50.gz \
-o /tmp/my_model_prefix
#. Open http://projector.tensorflow.org/
#. Click "Load Data" button from the left menu.
#. Select "Choose file" in "Load a TSV file of vectors." and choose "/tmp/my_model_prefix_tensor.tsv" file.
#. Select "Choose file" in "Load a TSV file of metadata." and choose "/tmp/my_model_prefix_metadata.tsv" file.
#. ???
#. PROFIT!
For more information about TensorBoard TSV format please visit:
https://www.tensorflow.org/versions/master/how_tos/embedding_viz/
Command line arguments
----------------------
.. program-output:: python -m gensim.scripts.word2vec2tensor --help
:ellipsis: 0, -7
"""
import os
import sys
import logging
import argparse
import gensim
logger = logging.getLogger(__name__)
def word2vec2tensor(word2vec_model_path, tensor_filename, binary=False):
"""Convert file in Word2Vec format and writes two files 2D tensor TSV file.
File "tensor_filename"_tensor.tsv contains word-vectors, "tensor_filename"_metadata.tsv contains words.
Parameters
----------
word2vec_model_path : str
Path to file in Word2Vec format.
tensor_filename : str
Prefix for output files.
binary : bool, optional
True if input file in binary format.
"""
model = gensim.models.KeyedVectors.load_word2vec_format(word2vec_model_path, binary=binary)
outfiletsv = tensor_filename + '_tensor.tsv'
outfiletsvmeta = tensor_filename + '_metadata.tsv'
with open(outfiletsv, 'w+') as file_vector:
with open(outfiletsvmeta, 'w+') as file_metadata:
for word in model.index2word:
file_metadata.write(gensim.utils.to_utf8(word) + gensim.utils.to_utf8('\n'))
vector_row = '\t'.join(str(x) for x in model[word])
file_vector.write(vector_row + '\n')
logger.info("2D tensor file saved to %s", outfiletsv)
logger.info("Tensor metadata file saved to %s", outfiletsvmeta)
if __name__ == "__main__":
logging.basicConfig(format='%(asctime)s - %(module)s - %(levelname)s - %(message)s', level=logging.INFO)
parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, description=__doc__[:-138])
parser.add_argument("-i", "--input", required=True, help="Path to input file in word2vec format")
parser.add_argument("-o", "--output", required=True, help="Prefix path for output files")
parser.add_argument(
"-b", "--binary", action='store_const', const=True, default=False,
help="Set this flag if word2vec model in binary format (default: %(default)s)"
)
args = parser.parse_args()
logger.info("running %s", ' '.join(sys.argv))
word2vec2tensor(args.input, args.output, args.binary)
logger.info("finished running %s", os.path.basename(sys.argv[0]))