123 lines
3.8 KiB
Python
123 lines
3.8 KiB
Python
#!/usr/bin/env python
|
|
# -*- coding: utf-8 -*-
|
|
#
|
|
# Copyright (C) 2016 Radim Rehurek <radimrehurek@seznam.cz>
|
|
# Copyright (C) 2016 Manas Ranjan Kar <manasrkar91@gmail.com>
|
|
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
|
|
|
|
|
|
"""This script allows to convert GloVe vectors into the word2vec. Both files are
|
|
presented in text format and are almost identical, except that word2vec starts with
a line holding the number of vectors and their dimension, which is the only difference from GloVe.
|
|
|
|
Notes
|
|
-----
|
|
|
|
GloVe format (a real example can be found `on the Stanford site <https://nlp.stanford.edu/projects/glove/>`_) ::
|
|
|
|
word1 0.123 0.134 0.532 0.152
|
|
word2 0.934 0.412 0.532 0.159
|
|
word3 0.334 0.241 0.324 0.188
|
|
...
|
|
word9 0.334 0.241 0.324 0.188
|
|
|
|
|
|
Word2Vec format (a real example can be found `in the old w2v repository <https://code.google.com/archive/p/word2vec/>`_) ::
|
|
|
|
9 4
|
|
word1 0.123 0.134 0.532 0.152
|
|
word2 0.934 0.412 0.532 0.159
|
|
word3 0.334 0.241 0.324 0.188
|
|
...
|
|
word9 0.334 0.241 0.324 0.188
|
|
|
|
|
|
How to use
|
|
----------
|
|
>>> from gensim.test.utils import datapath, get_tmpfile
|
|
>>> from gensim.models import KeyedVectors
|
|
>>>
|
|
>>> glove_file = datapath('test_glove.txt')
|
|
>>> tmp_file = get_tmpfile("test_word2vec.txt")
|
|
>>>
|
|
>>> # call glove2word2vec script
|
|
>>> # default way (through CLI): python -m gensim.scripts.glove2word2vec --input <glove_file> --output <w2v_file>
|
|
>>> from gensim.scripts.glove2word2vec import glove2word2vec
|
|
>>> glove2word2vec(glove_file, tmp_file)
|
|
>>>
|
|
>>> model = KeyedVectors.load_word2vec_format(tmp_file)
|
|
|
|
|
|
Command line arguments
|
|
----------------------
|
|
|
|
.. program-output:: python -m gensim.scripts.glove2word2vec --help
|
|
:ellipsis: 0, -5
|
|
|
|
"""
|
|
import sys
|
|
import logging
|
|
import argparse
|
|
|
|
from smart_open import smart_open
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
def get_glove_info(glove_file_name):
    """Get number of vectors in provided `glove_file_name` and dimension of vectors.

    The file is read once: the dimension is taken from the first line
    (number of whitespace-separated tokens minus one for the word itself)
    and the remaining lines are counted in the same pass.

    Parameters
    ----------
    glove_file_name : str
        Path to file in GloVe format.

    Returns
    -------
    (int, int)
        Number of vectors (lines) of input file and its dimension.
        An empty file yields ``(0, 0)``.

    """
    with smart_open(glove_file_name) as f:
        first_line = f.readline()
        if not first_line:
            # Empty input: there is no vector to measure, so report zero
            # vectors of dimension zero instead of a negative dimension.
            return 0, 0
        num_dims = len(first_line.split()) - 1
        # The first line was already consumed by `readline`, hence the `1 +`.
        num_lines = 1 + sum(1 for _ in f)
    return num_lines, num_dims
|
|
|
|
|
|
def glove2word2vec(glove_input_file, word2vec_output_file):
    """Rewrite a GloVe text file as a word2vec text file.

    The output consists of one header line ``<number of vectors> <dimension>``
    followed by every line of `glove_input_file`, copied unchanged.

    Parameters
    ----------
    glove_input_file : str
        Path to file in GloVe format.
    word2vec_output_file : str
        Path to output file.

    Returns
    -------
    (int, int)
        Number of vectors (lines) of input file and its dimension.

    """
    vector_count, vector_size = get_glove_info(glove_input_file)
    logger.info("converting %i vectors from %s to %s", vector_count, glove_input_file, word2vec_output_file)

    # Both files are handled as bytes, so the header is encoded up front.
    header = "{0} {1}\n".format(vector_count, vector_size).encode('utf-8')

    with smart_open(word2vec_output_file, 'wb') as target:
        target.write(header)
        with smart_open(glove_input_file, 'rb') as source:
            for row in source:
                target.write(row)

    return vector_count, vector_size
|
|
|
|
|
|
if __name__ == "__main__":
    # Command-line entry point: configure logging, parse the options,
    # run the conversion and report the size of the converted model.
    logging.basicConfig(format='%(asctime)s - %(module)s - %(levelname)s - %(message)s', level=logging.INFO)

    # The last 135 characters of the module docstring are sliced off
    # before the text is used as the argparse description.
    description = __doc__[:-135]
    cli = argparse.ArgumentParser(description=description, formatter_class=argparse.RawDescriptionHelpFormatter)
    cli.add_argument("-i", "--input", required=True, help="Path to input file in GloVe format")
    cli.add_argument("-o", "--output", required=True, help="Path to output file")
    options = cli.parse_args()

    logger.info("running %s", ' '.join(sys.argv))
    vector_count, vector_size = glove2word2vec(options.input, options.output)
    logger.info('Converted model with %i vectors and %i dimensions', vector_count, vector_size)
|