#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html

"""
USAGE: %(program)s -train CORPUS -output VECTORS -size SIZE -window WINDOW
-cbow CBOW -sample SAMPLE -hs HS -negative NEGATIVE -threads THREADS
-iter ITER -min_count MIN-COUNT -alpha ALPHA -binary BINARY -accuracy FILE

Trains a neural embedding model on text file CORPUS.
Parameters essentially reproduce those used by the original C tool
(see https://code.google.com/archive/p/word2vec/).

Parameters for training:
    -train <file>
        Use text data from <file> to train the model
    -output <file>
        Use <file> to save the resulting word vectors / word clusters
    -size <int>
        Set size of word vectors; default is 100
    -window <int>
        Set max skip length between words; default is 5
    -sample <float>
        Set threshold for occurrence of words. Those that appear with higher
        frequency in the training data will be randomly down-sampled;
        default is 1e-3, useful range is (0, 1e-5)
    -hs <int>
        Use Hierarchical Softmax; default is 0 (not used)
    -negative <int>
        Number of negative examples; default is 5, common values are 3 - 10 (0 = not used)
    -threads <int>
        Use <int> threads (default 3)
    -iter <int>
        Run more training iterations (default 5)
    -min_count <int>
        This will discard words that appear less than <int> times; default is 5
    -alpha <float>
        Set the starting learning rate; default is 0.025 for skip-gram and 0.05 for CBOW
    -binary <int>
        Save the resulting vectors in binary mode; default is 0 (off)
    -cbow <int>
        Use the continuous bag of words model; default is 1 (use 0 for skip-gram model)
    -accuracy <file>
        Compute the accuracy of the resulting model's analogical inference power
        on the questions in <file>. See an example questions file at
        https://code.google.com/p/word2vec/source/browse/trunk/questions-words.txt

Example: python -m gensim.scripts.word2vec_standalone -train data.txt \
-output vec.txt -size 200 -sample 1e-4 -binary 0 -iter 3
"""

import logging
import os.path
import sys
import argparse

from numpy import seterr

from gensim.models.word2vec import Word2Vec, LineSentence  # avoid referencing __main__ in pickle

logger = logging.getLogger(__name__)

if __name__ == "__main__":
    logging.basicConfig(format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s', level=logging.INFO)
    logger.info("running %s", " ".join(sys.argv))

    seterr(all='raise')  # don't ignore numpy errors

    parser = argparse.ArgumentParser()
    parser.add_argument("-train", help="Use text data from file TRAIN to train the model", required=True)
    parser.add_argument("-output", help="Use file OUTPUT to save the resulting word vectors")
    parser.add_argument("-window", help="Set max skip length WINDOW between words; default is 5", type=int, default=5)
    parser.add_argument("-size", help="Set size of word vectors; default is 100", type=int, default=100)
    parser.add_argument(
        "-sample",
        help="Set threshold for occurrence of words. "
             "Those that appear with higher frequency in the training data will be randomly down-sampled; "
             "default is 1e-3, useful range is (0, 1e-5)",
        type=float, default=1e-3
    )
    parser.add_argument(
        "-hs", help="Use Hierarchical Softmax; default is 0 (not used)",
        type=int, default=0, choices=[0, 1]
    )
    parser.add_argument(
        "-negative", help="Number of negative examples; default is 5, common values are 3 - 10 (0 = not used)",
        type=int, default=5
    )
    parser.add_argument("-threads", help="Use THREADS threads (default 3)", type=int, default=3)
    parser.add_argument("-iter", help="Run more training iterations (default 5)", type=int, default=5)
    parser.add_argument(
        "-min_count", help="This will discard words that appear less than MIN_COUNT times; default is 5",
        type=int, default=5
    )
    parser.add_argument(
        "-alpha", help="Set the starting learning rate; default is 0.025 for skip-gram and 0.05 for CBOW",
        type=float
    )
    parser.add_argument(
        "-cbow", help="Use the continuous bag of words model; default is 1 (use 0 for skip-gram model)",
        type=int, default=1, choices=[0, 1]
    )
    parser.add_argument(
        "-binary", help="Save the resulting vectors in binary mode; default is 0 (off)",
        type=int, default=0, choices=[0, 1]
    )
    parser.add_argument("-accuracy", help="Use questions from file ACCURACY to evaluate the model")

    args = parser.parse_args()

    # Map the C tool's -cbow flag onto gensim's sg parameter, and pick the
    # matching default learning rate when -alpha was not given explicitly.
    if args.cbow == 0:
        skipgram = 1
        if not args.alpha:
            args.alpha = 0.025
    else:
        skipgram = 0
        if not args.alpha:
            args.alpha = 0.05

    # Stream the corpus one line at a time; each line is one sentence of
    # whitespace-delimited tokens.
    corpus = LineSentence(args.train)

    model = Word2Vec(
        corpus, size=args.size, min_count=args.min_count, workers=args.threads,
        window=args.window, sample=args.sample, alpha=args.alpha, sg=skipgram,
        hs=args.hs, negative=args.negative, cbow_mean=1, iter=args.iter
    )

    if args.output:
        outfile = args.output
        model.wv.save_word2vec_format(outfile, binary=args.binary)
    else:
        # No -output given: derive a name from the training file and save both
        # the full model and the word vectors.
        outfile = args.train.split('.')[0]
        model.save(outfile + '.model')
        if args.binary == 1:
            model.wv.save_word2vec_format(outfile + '.model.bin', binary=True)
        else:
            model.wv.save_word2vec_format(outfile + '.model.txt', binary=False)

    if args.accuracy:
        # Evaluate analogical inference on the questions file (see docstring).
        questions_file = args.accuracy
        model.accuracy(questions_file)

    logger.info("finished running %s", os.path.basename(sys.argv[0]))