laywerrobot/lib/python3.6/site-packages/gensim/scripts/glove2word2vec.py

124 lines
3.8 KiB
Python
Raw Normal View History

2020-08-27 21:55:39 +02:00
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2016 Radim Rehurek <radimrehurek@seznam.cz>
# Copyright (C) 2016 Manas Ranjan Kar <manasrkar91@gmail.com>
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
"""This script allows to convert GloVe vectors into the word2vec. Both files are
presented in text format and almost identical except that word2vec includes
number of vectors and its dimension which is only difference regard to GloVe.
Notes
-----
GloVe format (real example can be founded `on Stanford size <https://nlp.stanford.edu/projects/glove/>`_) ::
word1 0.123 0.134 0.532 0.152
word2 0.934 0.412 0.532 0.159
word3 0.334 0.241 0.324 0.188
...
word9 0.334 0.241 0.324 0.188
Word2Vec format (real example can be founded `on w2v old repository <https://code.google.com/archive/p/word2vec/>`_) ::
9 4
word1 0.123 0.134 0.532 0.152
word2 0.934 0.412 0.532 0.159
word3 0.334 0.241 0.324 0.188
...
word9 0.334 0.241 0.324 0.188
How to use
----------
>>> from gensim.test.utils import datapath, get_tmpfile
>>> from gensim.models import KeyedVectors
>>>
>>> glove_file = datapath('test_glove.txt')
>>> tmp_file = get_tmpfile("test_word2vec.txt")
>>>
>>> # call glove2word2vec script
>>> # default way (through CLI): python -m gensim.scripts.glove2word2vec --input <glove_file> --output <w2v_file>
>>> from gensim.scripts.glove2word2vec import glove2word2vec
>>> glove2word2vec(glove_file, tmp_file)
>>>
>>> model = KeyedVectors.load_word2vec_format(tmp_file)
Command line arguments
----------------------
.. program-output:: python -m gensim.scripts.glove2word2vec --help
:ellipsis: 0, -5
"""
import sys
import logging
import argparse
from smart_open import smart_open
logger = logging.getLogger(__name__)
def get_glove_info(glove_file_name):
"""Get number of vectors in provided `glove_file_name` and dimension of vectors.
Parameters
----------
glove_file_name : str
Path to file in GloVe format.
Returns
-------
(int, int)
Number of vectors (lines) of input file and its dimension.
"""
with smart_open(glove_file_name) as f:
num_lines = sum(1 for _ in f)
with smart_open(glove_file_name) as f:
num_dims = len(f.readline().split()) - 1
return num_lines, num_dims
def glove2word2vec(glove_input_file, word2vec_output_file):
"""Convert `glove_input_file` in GloVe format to word2vec format and write it to `word2vec_output_file`.
Parameters
----------
glove_input_file : str
Path to file in GloVe format.
word2vec_output_file: str
Path to output file.
Returns
-------
(int, int)
Number of vectors (lines) of input file and its dimension.
"""
num_lines, num_dims = get_glove_info(glove_input_file)
logger.info("converting %i vectors from %s to %s", num_lines, glove_input_file, word2vec_output_file)
with smart_open(word2vec_output_file, 'wb') as fout:
fout.write("{0} {1}\n".format(num_lines, num_dims).encode('utf-8'))
with smart_open(glove_input_file, 'rb') as fin:
for line in fin:
fout.write(line)
return num_lines, num_dims
if __name__ == "__main__":
logging.basicConfig(format='%(asctime)s - %(module)s - %(levelname)s - %(message)s', level=logging.INFO)
parser = argparse.ArgumentParser(description=__doc__[:-135], formatter_class=argparse.RawDescriptionHelpFormatter)
parser.add_argument("-i", "--input", required=True, help="Path to input file in GloVe format")
parser.add_argument("-o", "--output", required=True, help="Path to output file")
args = parser.parse_args()
logger.info("running %s", ' '.join(sys.argv))
num_lines, num_dims = glove2word2vec(args.input, args.output)
logger.info('Converted model with %i vectors and %i dimensions', num_lines, num_dims)