laywerrobot/lib/python3.6/site-packages/gensim/scripts/word2vec_standalone.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
"""
USAGE: %(program)s -train CORPUS -output VECTORS -size SIZE -window WINDOW
-cbow CBOW -sample SAMPLE -hs HS -negative NEGATIVE -threads THREADS -iter ITER
-min_count MIN-COUNT -alpha ALPHA -binary BINARY -accuracy FILE

Trains a neural embedding model on text file CORPUS.

Parameters essentially reproduce those used by the original C tool
(see https://code.google.com/archive/p/word2vec/).
Parameters for training:
    -train <file>
        Use text data from <file> to train the model
    -output <file>
        Use <file> to save the resulting word vectors / word clusters
    -size <int>
        Set size of word vectors; default is 100
    -window <int>
        Set max skip length between words; default is 5
    -sample <float>
        Set threshold for occurrence of words. Those that appear with higher frequency in the training data
        will be randomly down-sampled; default is 1e-3, useful range is (0, 1e-5)
    -hs <int>
        Use Hierarchical Softmax; default is 0 (not used)
    -negative <int>
        Number of negative examples; default is 5, common values are 3 - 10 (0 = not used)
    -threads <int>
        Use <int> threads (default 3)
    -iter <int>
        Run more training iterations (default 5)
    -min_count <int>
        Discard words that appear less than <int> times; default is 5
    -alpha <float>
        Set the starting learning rate; default is 0.025 for skip-gram and 0.05 for CBOW
    -binary <int>
        Save the resulting vectors in binary mode; default is 0 (off)
    -cbow <int>
        Use the continuous bag of words model; default is 1 (use 0 for skip-gram model)
    -accuracy <file>
        Compute the accuracy of the resulting model's analogical inference power on the questions file <file>.
        See an example of a questions file
        at https://code.google.com/p/word2vec/source/browse/trunk/questions-words.txt

Example: python -m gensim.scripts.word2vec_standalone -train data.txt \
    -output vec.txt -size 200 -sample 1e-4 -binary 0 -iter 3
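
Another example, training a skip-gram model with negative sampling and binary
output (illustrative values only; all flags are documented above):

python -m gensim.scripts.word2vec_standalone -train data.txt \
    -output vec.bin -cbow 0 -size 300 -negative 10 -hs 0 -binary 1 -iter 3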
"""
import logging
import os.path
import sys
import argparse
from numpy import seterr
from gensim.models.word2vec import Word2Vec, LineSentence # avoid referencing __main__ in pickle
logger = logging.getLogger(__name__)
if __name__ == "__main__":
    logging.basicConfig(format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s', level=logging.INFO)
    logger.info("running %s", " ".join(sys.argv))
    seterr(all='raise')  # don't ignore numpy errors

    parser = argparse.ArgumentParser()
    parser.add_argument("-train", help="Use text data from file TRAIN to train the model", required=True)
    parser.add_argument("-output", help="Use file OUTPUT to save the resulting word vectors")
    parser.add_argument("-window", help="Set max skip length WINDOW between words; default is 5", type=int, default=5)
    parser.add_argument("-size", help="Set size of word vectors; default is 100", type=int, default=100)
    parser.add_argument(
        "-sample",
        help="Set threshold for occurrence of words. "
             "Those that appear with higher frequency in the training data will be randomly down-sampled; "
             "default is 1e-3, useful range is (0, 1e-5)",
        type=float, default=1e-3
    )
    parser.add_argument(
        "-hs", help="Use Hierarchical Softmax; default is 0 (not used)",
        type=int, default=0, choices=[0, 1]
    )
    parser.add_argument(
        "-negative", help="Number of negative examples; default is 5, common values are 3 - 10 (0 = not used)",
        type=int, default=5
    )
    parser.add_argument("-threads", help="Use THREADS threads (default 3)", type=int, default=3)
    parser.add_argument("-iter", help="Run more training iterations (default 5)", type=int, default=5)
    parser.add_argument(
        "-min_count", help="This will discard words that appear less than MIN_COUNT times; default is 5",
        type=int, default=5
    )
    parser.add_argument(
        "-alpha", help="Set the starting learning rate; default is 0.025 for skip-gram and 0.05 for CBOW",
        type=float
    )
    parser.add_argument(
        "-cbow", help="Use the continuous bag of words model; default is 1 (use 0 for skip-gram model)",
        type=int, default=1, choices=[0, 1]
    )
    parser.add_argument(
        "-binary", help="Save the resulting vectors in binary mode; default is 0 (off)",
        type=int, default=0, choices=[0, 1]
    )
    parser.add_argument("-accuracy", help="Use questions from file ACCURACY to evaluate the model")
    args = parser.parse_args()
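
    # Note: -alpha deliberately has no argparse default; the effective default
    # depends on the model type and is filled in below (0.025 for skip-gram,
    # 0.05 for CBOW), matching the docstring and the original C tool.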
    if args.cbow == 0:
        skipgram = 1
        if not args.alpha:
            args.alpha = 0.025
    else:
        skipgram = 0
        if not args.alpha:
            args.alpha = 0.05
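
    # LineSentence streams the training corpus one line at a time; it expects
    # one sentence per line, with words already separated by whitespace.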
    corpus = LineSentence(args.train)

    model = Word2Vec(
        corpus, size=args.size, min_count=args.min_count, workers=args.threads,
        window=args.window, sample=args.sample, alpha=args.alpha, sg=skipgram,
        hs=args.hs, negative=args.negative, cbow_mean=1, iter=args.iter
    )
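
    # The trained vectors live on model.wv (a KeyedVectors instance); e.g.
    # model.wv.most_similar('king') would list its nearest neighbours by cosine
    # similarity (illustrative -- 'king' must appear in the training vocabulary).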

    if args.output:
        outfile = args.output
        model.wv.save_word2vec_format(outfile, binary=args.binary)
    else:
        outfile = args.train.split('.')[0]
        model.save(outfile + '.model')
        if args.binary == 1:
            model.wv.save_word2vec_format(outfile + '.model.bin', binary=True)
        else:
            model.wv.save_word2vec_format(outfile + '.model.txt', binary=False)
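
    # A model saved with model.save() can be reloaded for further training via
    # Word2Vec.load(); vectors written by save_word2vec_format() can be read
    # back (query-only) with gensim.models.KeyedVectors.load_word2vec_format().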

    if args.accuracy:
        questions_file = args.accuracy
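        # The questions file uses the original word2vec analogy format: section
        # headers starting with ':' followed by four-word analogy lines such as
        # "Athens Greece Baghdad Iraq".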
        model.accuracy(questions_file)

    logger.info("finished running %s", os.path.basename(sys.argv[0]))