diff --git a/.gitignore b/.gitignore
index c3ba120f37..09b652c23f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -76,3 +76,5 @@ data
 *.inv
 *.js
 docs/_images/
+*.c
+*.cpp
diff --git a/docs/src/apiref.rst b/docs/src/apiref.rst
index 8f5e8fc61e..464f8f192f 100644
--- a/docs/src/apiref.rst
+++ b/docs/src/apiref.rst
@@ -27,6 +27,7 @@ Modules:
    corpora/textcorpus
    corpora/ucicorpus
    corpora/wikicorpus
+   corpora/utils
    models/ldamodel
    models/ldamulticore
    models/nmf
diff --git a/docs/src/corpora/utils.rst b/docs/src/corpora/utils.rst
new file mode 100644
index 0000000000..14955d2e0f
--- /dev/null
+++ b/docs/src/corpora/utils.rst
@@ -0,0 +1,9 @@
+:mod:`corpora.utils` -- Implements various corpus utilities
+===========================================================
+
+.. automodule:: gensim.corpora.utils
+   :synopsis: Implements various corpus utilities
+   :members:
+   :inherited-members:
+   :undoc-members:
+   :show-inheritance:
diff --git a/gensim/corpora/utils.py b/gensim/corpora/utils.py
new file mode 100644
index 0000000000..71c3360f8a
--- /dev/null
+++ b/gensim/corpora/utils.py
@@ -0,0 +1,282 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+# Author: Gensim Contributors
+# Copyright (C) 2020 RaRe Technologies s.r.o.
+# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
+
+import logging
+import os
+import itertools
+from collections import namedtuple
+
+from gensim import utils
+
+logger = logging.getLogger(__name__)
+
+#: Shared cap, as a count of words/tokens, for texts that should not be arbitrarily long
+MAX_WORDS = 10000
+
+
+class LineSentence(object):
+    def __init__(self, source, max_sentence_length=MAX_WORDS, limit=None):
+        """Iterate over a file that contains sentences: one line = one sentence.
+        Words must be already preprocessed and separated by whitespace.
+
+        Parameters
+        ----------
+        source : string or a file-like object
+            Path to the file on disk, or an already-open file object (must support `seek(0)`).
+        max_sentence_length : int, optional
+            Maximum number of tokens per yielded sentence; longer lines are split into chunks of this size.
+        limit : int or None
+            Clip the file to the first `limit` lines. Do no clipping if `limit is None` (the default).
+
+        Examples
+        --------
+        .. sourcecode:: pycon
+
+            >>> from gensim.test.utils import datapath
+            >>> from gensim.corpora.utils import LineSentence
+            >>> sentences = LineSentence(datapath('lee_background.cor'))
+            >>> for sentence in sentences:
+            ...     pass
+
+        """
+        self.source = source
+        self.max_sentence_length = max_sentence_length
+        self.limit = limit
+
+    def __iter__(self):
+        """Iterate through the lines in the source."""
+        try:
+            # Assume it is a file-like object and try treating it as such
+            # Things that don't have seek will trigger an exception
+            self.source.seek(0)
+            for line in itertools.islice(self.source, self.limit):
+                line = utils.to_unicode(line).split()
+                i = 0
+                while i < len(line):
+                    yield line[i: i + self.max_sentence_length]
+                    i += self.max_sentence_length
+        except AttributeError:
+            # If it didn't work like a file, use it as a string filename
+            with utils.open(self.source, 'rb') as fin:
+                for line in itertools.islice(fin, self.limit):
+                    line = utils.to_unicode(line).split()
+                    i = 0
+                    while i < len(line):
+                        yield line[i: i + self.max_sentence_length]
+                        i += self.max_sentence_length
+
+
+class PathLineSentences(object):
+    def __init__(self, source, max_sentence_length=MAX_WORDS, limit=None):
+        """Like :class:`~gensim.corpora.utils.LineSentence`, but process all files in a directory
+        in alphabetical order by filename.
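For instance, a minimal usage sketch (the directory name here is hypothetical; each yielded sentence is a list of unicode tokens):

.. sourcecode:: pycon

    >>> from gensim.corpora.utils import PathLineSentences
    >>>
    >>> # 'my_corpus_dir' is a hypothetical directory of one-sentence-per-line text files
    >>> sentences = PathLineSentences('my_corpus_dir')
    >>> for sentence in sentences:
    ...     pass  # each sentence is a list of str tokens, at most max_sentence_length long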
+
+        The directory must only contain files that can be read by :class:`gensim.corpora.utils.LineSentence`:
+        .bz2, .gz, and text files. Any file not ending with .bz2 or .gz is assumed to be a text file.
+
+        The format of files (either text, or compressed text files) in the path is one sentence = one line,
+        with words already preprocessed and separated by whitespace.
+
+        Warnings
+        --------
+        Does **not recurse** into subdirectories.
+
+        Parameters
+        ----------
+        source : str
+            Path to the directory.
+        max_sentence_length : int, optional
+            Maximum number of tokens per yielded sentence; longer lines are split into chunks of this size.
+        limit : int or None
+            Read only the first `limit` lines from each file. Read all if limit is None (the default).
+
+        """
+        self.source = source
+        self.max_sentence_length = max_sentence_length
+        self.limit = limit
+
+        if os.path.isfile(self.source):
+            logger.debug('single file given as source, rather than a directory of files')
+            logger.debug('consider using gensim.corpora.utils.LineSentence for a single file')
+            self.input_files = [self.source]  # force code compatibility with list of files
+        elif os.path.isdir(self.source):
+            self.source = os.path.join(self.source, '')  # ensures os-specific slash at end of path
+            logger.info('reading directory %s', self.source)
+            self.input_files = os.listdir(self.source)
+            self.input_files = [self.source + filename for filename in self.input_files]  # make full paths
+            self.input_files.sort()  # makes sure it happens in filename order
+        else:  # not a file or a directory, then we can't do anything with it
+            raise ValueError('input is neither a file nor a path')
+        logger.info('files read into PathLineSentences: %s', '\n'.join(self.input_files))
+
+    def __iter__(self):
+        """Iterate through the files."""
+        for file_name in self.input_files:
+            logger.info('reading file %s', file_name)
+            with utils.open(file_name, 'rb') as fin:
+                for line in itertools.islice(fin, self.limit):
+                    line = utils.to_unicode(line).split()
+                    i = 0
+                    while i < len(line):
+                        yield line[i:i + self.max_sentence_length]
+                        i += self.max_sentence_length
+
+
+class Text8Corpus(object):
+    def __init__(self, fname, max_sentence_length=MAX_WORDS):
+        """Iterate over sentences from the "text8" corpus, unzipped from http://mattmahoney.net/dc/text8.zip."""
+        self.fname = fname
+        self.max_sentence_length = max_sentence_length
+
+    def __iter__(self):
+        # the entire corpus is one gigantic line -- there are no sentence marks at all
+        # so just split the sequence of tokens arbitrarily: 1 sentence = `max_sentence_length` tokens
+        sentence, rest = [], b''
+        with utils.open(self.fname, 'rb') as fin:
+            while True:
+                text = rest + fin.read(8192)  # avoid loading the entire file (=1 line) into RAM
+                if text == rest:  # EOF
+                    words = utils.to_unicode(text).split()
+                    sentence.extend(words)  # return the last chunk of words, too (may be shorter/longer)
+                    if sentence:
+                        yield sentence
+                    break
+                last_token = text.rfind(b' ')  # last token may have been split in two... keep for next iteration
+                words, rest = (utils.to_unicode(text[:last_token]).split(),
+                               text[last_token:].strip()) if last_token >= 0 else ([], text)
+                sentence.extend(words)
+                while len(sentence) >= self.max_sentence_length:
+                    yield sentence[:self.max_sentence_length]
+                    sentence = sentence[self.max_sentence_length:]
+
+
+class TaggedDocument(namedtuple('TaggedDocument', 'words tags')):
+    """Represents a document along with a tag, input document format for :class:`~gensim.models.doc2vec.Doc2Vec`.
+
+    A single document, made up of `words` (a list of unicode string tokens) and `tags` (a list of tokens).
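For example, a minimal sketch of constructing one directly (the tokens and tag are illustrative):

.. sourcecode:: pycon

    >>> from gensim.corpora.utils import TaggedDocument
    >>>
    >>> doc = TaggedDocument(words=['human', 'interface', 'computer'], tags=[0])
    >>> doc.words
    ['human', 'interface', 'computer']
    >>> doc.tags
    [0]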
+    Tags may be one or more unicode string tokens, but typical practice (which will also be the most memory-efficient)
+    is for the tags list to include a unique integer id as the only tag.
+
+    Replaces "sentence as a list of words" from :class:`gensim.models.word2vec.Word2Vec`.
+
+    """
+    def __str__(self):
+        """Human-readable representation of the object's state, used for debugging.
+
+        Returns
+        -------
+        str
+            Human-readable representation of the object's state (words and tags).
+
+        """
+        return '%s(%s, %s)' % (self.__class__.__name__, self.words, self.tags)
+
+
+class TaggedLineDocument(object):
+    def __init__(self, source):
+        """Iterate over a file that contains documents: one line = :class:`~gensim.corpora.utils.TaggedDocument` object.
+
+        Words are expected to be already preprocessed and separated by whitespace. Document tags are constructed
+        automatically from the document line number (each document gets a unique integer tag).
+
+        Parameters
+        ----------
+        source : string or a file-like object
+            Path to the file on disk, or an already-open file object (must support `seek(0)`).
+
+        Examples
+        --------
+        .. sourcecode:: pycon
+
+            >>> from gensim.test.utils import datapath
+            >>> from gensim.corpora.utils import TaggedLineDocument
+            >>>
+            >>> for document in TaggedLineDocument(datapath("head500.noblanks.cor")):
+            ...     pass
+
+        """
+        self.source = source
+
+    def __iter__(self):
+        """Iterate through the lines in the source.
+
+        Yields
+        ------
+        :class:`~gensim.corpora.utils.TaggedDocument`
+            Document from `source` specified in the constructor.
+
+        """
+        try:
+            # Assume it is a file-like object and try treating it as such
+            # Things that don't have seek will trigger an exception
+            self.source.seek(0)
+            for item_no, line in enumerate(self.source):
+                yield TaggedDocument(utils.to_unicode(line).split(), [item_no])
+        except AttributeError:
+            # If it didn't work like a file, use it as a string filename
+            with utils.open(self.source, 'rb') as fin:
+                for item_no, line in enumerate(fin):
+                    yield TaggedDocument(utils.to_unicode(line).split(), [item_no])
+
+
+class BrownCorpus(object):
+    def __init__(self, dirname):
+        """Iterate over sentences from the Brown corpus (part of NLTK data)."""
+        self.dirname = dirname
+
+    def __iter__(self):
+        for fname in os.listdir(self.dirname):
+            fname = os.path.join(self.dirname, fname)
+            if not os.path.isfile(fname):
+                continue
+            with utils.open(fname, 'rb') as fin:
+                for line in fin:
+                    line = utils.to_unicode(line)
+                    # each file line is a single sentence in the Brown corpus
+                    # each token is WORD/POS_TAG
+                    token_tags = [t.split('/') for t in line.split() if len(t.split('/')) == 2]
+                    # ignore words with non-alphabetic tags like ",", "!" etc (punctuation, weird stuff)
+                    words = ["%s/%s" % (token.lower(), tag[:2]) for token, tag in token_tags if tag[:2].isalpha()]
+                    if not words:  # don't bother sending out empty sentences
+                        continue
+                    yield words
+
+
+class TaggedBrownCorpus(object):
+    def __init__(self, dirname):
+        """Reader for the Brown corpus (part of NLTK data).
+
+        Parameters
+        ----------
+        dirname : str
+            Path to folder with Brown corpus.
+
+        """
+        self.dirname = dirname
+
+    def __iter__(self):
+        """Iterate through the corpus.
+
+        Yields
+        ------
+        :class:`~gensim.corpora.utils.TaggedDocument`
+            Document from `source`.
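A usage sketch, assuming a hypothetical local directory 'brown' holding the corpus files:

.. sourcecode:: pycon

    >>> from gensim.corpora.utils import TaggedBrownCorpus
    >>>
    >>> for document in TaggedBrownCorpus('brown'):
    ...     pass  # words are 'token/POS' strings; tags look like ['<fname>_SENT_<line_no>']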
+ + """ + for fname in os.listdir(self.dirname): + fname = os.path.join(self.dirname, fname) + if not os.path.isfile(fname): + continue + with utils.open(fname, 'rb') as fin: + for item_no, line in enumerate(fin): + line = utils.to_unicode(line) + # each file line is a single document in the Brown corpus + # each token is WORD/POS_TAG + token_tags = [t.split('/') for t in line.split() if len(t.split('/')) == 2] + # ignore words with non-alphabetic tags like ",", "!" etc (punctuation, weird stuff) + words = ["%s/%s" % (token.lower(), tag[:2]) for token, tag in token_tags if tag[:2].isalpha()] + if not words: # don't bother sending out empty documents + continue + yield TaggedDocument(words, ['%s_SENT_%s' % (fname, item_no)]) diff --git a/gensim/models/doc2vec.py b/gensim/models/doc2vec.py index 3c6578a261..195269ea42 100644 --- a/gensim/models/doc2vec.py +++ b/gensim/models/doc2vec.py @@ -35,7 +35,8 @@ .. sourcecode:: pycon >>> from gensim.test.utils import common_texts - >>> from gensim.models.doc2vec import Doc2Vec, TaggedDocument + >>> from gensim.models.doc2vec import Doc2Vec + >>> from gensim.corpora.utils import TaggedDocument >>> >>> documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(common_texts)] >>> model = Doc2Vec(documents, vector_size=5, window=2, min_count=1, workers=4) @@ -67,7 +68,7 @@ import logging import os -from collections import namedtuple, defaultdict +from collections import defaultdict from collections.abc import Iterable from timeit import default_timer @@ -80,6 +81,9 @@ from gensim.models import Word2Vec, FAST_VERSION # noqa: F401 from gensim.models.keyedvectors import KeyedVectors, pseudorandom_weak_vector +from gensim.corpora.utils import TaggedBrownCorpus +from gensim.corpora.utils import TaggedDocument, TaggedLineDocument + logger = logging.getLogger(__name__) try: @@ -117,28 +121,6 @@ def d2v_train_epoch_dm(model, corpus_file, offset, start_doctag, _cython_vocab, raise NotImplementedError("Training with corpus_file argument is not supported.") -class TaggedDocument(namedtuple('TaggedDocument', 'words tags')): - """Represents a document along with a tag, input document format for :class:`~gensim.models.doc2vec.Doc2Vec`. - - A single document, made up of `words` (a list of unicode string tokens) and `tags` (a list of tokens). - Tags may be one or more unicode string tokens, but typical practice (which will also be the most memory-efficient) - is for the tags list to include a unique integer id as the only tag. - - Replaces "sentence as a list of words" from :class:`gensim.models.word2vec.Word2Vec`. - - """ - def __str__(self): - """Human readable representation of the object's state, used for debugging. - - Returns - ------- - str - Human readable representation of the object's state (words and tags). - - """ - return '%s(%s, %s)' % (self.__class__.__name__, self.words, self.tags) - - @dataclass class Doctag: """A dataclass shape-compatible with keyedvectors.SimpleVocab, extended to record @@ -170,16 +152,16 @@ def __init__(self, documents=None, corpus_file=None, vector_size=100, dm_mean=No Parameters ---------- - documents : iterable of list of :class:`~gensim.models.doc2vec.TaggedDocument`, optional + documents : iterable of list of :class:`~gensim.corpora.utils.TaggedDocument`, optional Input corpus, can be simply a list of elements, but for larger corpora,consider an iterable that streams the documents directly from disk/network. 
If you don't supply `documents` (or `corpus_file`), the model is left uninitialized -- use if you plan to initialize it in some other way. corpus_file : str, optional - Path to a corpus file in :class:`~gensim.models.word2vec.LineSentence` format. + Path to a corpus file in :class:`~gensim.corpora.utils.LineSentence` format. You may use this argument instead of `documents` to get performance boost. Only one of `documents` or `corpus_file` arguments need to be passed (or none of them, in that case, the model is left uninitialized). Documents' tags are assigned automatically and are equal to line number, as in - :class:`~gensim.models.doc2vec.TaggedLineDocument`. + :class:`~gensim.corpora.utils.TaggedLineDocument`. dm : {1,0}, optional Defines the training algorithm. If `dm=1`, 'distributed memory' (PV-DM) is used. Otherwise, `distributed bag of words` (PV-DBOW) is employed. @@ -393,7 +375,7 @@ def _do_train_job(self, job, alpha, inits): Parameters ---------- - job : iterable of list of :class:`~gensim.models.doc2vec.TaggedDocument` + job : iterable of list of :class:`~gensim.corpora.utils.TaggedDocument` The corpus chunk to be used for training this batch. alpha : float Learning rate to be used for training this batch. @@ -448,15 +430,15 @@ def train(self, corpus_iterable=None, corpus_file=None, total_examples=None, tot Parameters ---------- - corpus_iterable : iterable of list of :class:`~gensim.models.doc2vec.TaggedDocument`, optional + corpus_iterable : iterable of list of :class:`~gensim.corpora.utils.TaggedDocument`, optional Can be simply a list of elements, but for larger corpora,consider an iterable that streams the documents directly from disk/network. If you don't supply `documents` (or `corpus_file`), the model is left uninitialized -- use if you plan to initialize it in some other way. corpus_file : str, optional - Path to a corpus file in :class:`~gensim.models.word2vec.LineSentence` format. + Path to a corpus file in :class:`~gensim.corpora.utils.LineSentence` format. You may use this argument instead of `documents` to get performance boost. Only one of `documents` or `corpus_file` arguments need to be passed (not both of them). Documents' tags are assigned automatically - and are equal to line number, as in :class:`~gensim.models.doc2vec.TaggedLineDocument`. + and are equal to line number, as in :class:`~gensim.corpora.utils.TaggedLineDocument`. total_examples : int, optional Count of documents. total_words : int, optional @@ -520,7 +502,7 @@ def _get_offsets_and_start_doctags_for_corpusfile(cls, corpus_file, workers): Parameters ---------- corpus_file : str - Path to a corpus file in :class:`~gensim.models.word2vec.LineSentence` format. + Path to a corpus file in :class:`~gensim.corpora.utils.LineSentence` format. workers : int Number of workers. @@ -558,7 +540,7 @@ def _raw_word_count(self, job): Parameters ---------- - job : iterable of list of :class:`~gensim.models.doc2vec.TaggedDocument` + job : iterable of list of :class:`~gensim.corpora.utils.TaggedDocument` Corpus chunk. Returns @@ -834,15 +816,16 @@ def build_vocab(self, corpus_iterable=None, corpus_file=None, update=False, prog Parameters ---------- - documents : iterable of list of :class:`~gensim.models.doc2vec.TaggedDocument`, optional - Can be simply a list of :class:`~gensim.models.doc2vec.TaggedDocument` elements, but for larger corpora, - consider an iterable that streams the documents directly from disk/network. 
- See :class:`~gensim.models.doc2vec.TaggedBrownCorpus` or :class:`~gensim.models.doc2vec.TaggedLineDocument` + documents : iterable of list of :class:`~gensim.corpora.utils.TaggedDocument`, optional + Can be simply a list of :class:`~gensim.corpora.utils.TaggedDocument` elements, but + for larger corpora, consider an iterable that streams the documents directly from disk/network. + See :class:`~gensim.corpora.utils.TaggedBrownCorpus` or + :class:`~gensim.corpora.utils.TaggedLineDocument` corpus_file : str, optional - Path to a corpus file in :class:`~gensim.models.word2vec.LineSentence` format. + Path to a corpus file in :class:`~gensim.corpora.utils.LineSentence` format. You may use this argument instead of `documents` to get performance boost. Only one of `documents` or `corpus_file` arguments need to be passed (not both of them). Documents' tags are assigned automatically - and are equal to a line number, as in :class:`~gensim.models.doc2vec.TaggedLineDocument`. + and are equal to a line number, as in :class:`~gensim.corpora.utils.TaggedLineDocument`. update : bool If true, the new words in `documents` will be added to model's vocab. progress_per : int @@ -1005,10 +988,10 @@ def scan_vocab(self, corpus_iterable=None, corpus_file=None, progress_per=10000, Parameters ---------- - documents : iterable of :class:`~gensim.models.doc2vec.TaggedDocument`, optional + documents : iterable of :class:`~gensim.corpora.utils.TaggedDocument`, optional The tagged documents used to create the vocabulary. Their tags can be either str tokens or ints (faster). corpus_file : str, optional - Path to a corpus file in :class:`~gensim.models.word2vec.LineSentence` format. + Path to a corpus file in :class:`~gensim.corpora.utils.LineSentence` format. You may use this argument instead of `documents` to get performance boost. Only one of `documents` or `corpus_file` arguments need to be passed (not both of them). progress_per : int @@ -1083,86 +1066,7 @@ class Doc2VecTrainables(utils.SaveLoad): """Obsolete class retained for now as load-compatibility state capture""" -class TaggedBrownCorpus(object): - def __init__(self, dirname): - """Reader for the `Brown corpus (part of NLTK data) `_. - - Parameters - ---------- - dirname : str - Path to folder with Brown corpus. - - """ - self.dirname = dirname - - def __iter__(self): - """Iterate through the corpus. - - Yields - ------ - :class:`~gensim.models.doc2vec.TaggedDocument` - Document from `source`. - - """ - for fname in os.listdir(self.dirname): - fname = os.path.join(self.dirname, fname) - if not os.path.isfile(fname): - continue - with utils.open(fname, 'rb') as fin: - for item_no, line in enumerate(fin): - line = utils.to_unicode(line) - # each file line is a single document in the Brown corpus - # each token is WORD/POS_TAG - token_tags = [t.split('/') for t in line.split() if len(t.split('/')) == 2] - # ignore words with non-alphabetic tags like ",", "!" etc (punctuation, weird stuff) - words = ["%s/%s" % (token.lower(), tag[:2]) for token, tag in token_tags if tag[:2].isalpha()] - if not words: # don't bother sending out empty documents - continue - yield TaggedDocument(words, ['%s_SENT_%s' % (fname, item_no)]) - - -class TaggedLineDocument(object): - def __init__(self, source): - """Iterate over a file that contains documents: one line = :class:`~gensim.models.doc2vec.TaggedDocument` object. - - Words are expected to be already preprocessed and separated by whitespace. 
Document tags are constructed
-        automatically from the document line number (each document gets a unique integer tag).
-
-        Parameters
-        ----------
-        source : string or a file-like object
-            Path to the file on disk, or an already-open file object (must support `seek(0)`).
-
-        Examples
-        --------
-        .. sourcecode:: pycon
-
-            >>> from gensim.test.utils import datapath
-            >>> from gensim.models.doc2vec import TaggedLineDocument
-            >>>
-            >>> for document in TaggedLineDocument(datapath("head500.noblanks.cor")):
-            ...     pass
-
-        """
-        self.source = source
-
-    def __iter__(self):
-        """Iterate through the lines in the source.
-
-        Yields
-        ------
-        :class:`~gensim.models.doc2vec.TaggedDocument`
-            Document from `source` specified in the constructor.
-
-        """
-        try:
-            # Assume it is a file-like object and try treating it as such
-            # Things that don't have seek will trigger an exception
-            self.source.seek(0)
-            for item_no, line in enumerate(self.source):
-                yield TaggedDocument(utils.to_unicode(line).split(), [item_no])
-        except AttributeError:
-            # If it didn't work like a file, use it as a string filename
-            with utils.open(self.source, 'rb') as fin:
-                for item_no, line in enumerate(fin):
-                    yield TaggedDocument(utils.to_unicode(line).split(), [item_no])
+# Aliases of these classes, so that code relying on their original location keeps working
+TaggedBrownCorpus = TaggedBrownCorpus
+TaggedDocument = TaggedDocument
+TaggedLineDocument = TaggedLineDocument
diff --git a/gensim/models/fasttext.py b/gensim/models/fasttext.py
index 64a21aafa7..03032c7de8 100644
--- a/gensim/models/fasttext.py
+++ b/gensim/models/fasttext.py
@@ -76,7 +76,7 @@
 This is OK for smaller datasets, but for larger datasets, we recommend streaming the file,
 for example from disk or the network.
 In Gensim, we refer to such datasets as "corpora" (singular "corpus"), and keep them
-in the format described in :class:`~gensim.models.word2vec.LineSentence`.
+in the format described in :class:`~gensim.corpora.utils.LineSentence`.
 Passing a corpus is simple:
 .. sourcecode:: pycon
@@ -264,10 +264,10 @@
 from gensim.models.keyedvectors import KeyedVectors
 from gensim import utils
 from gensim.utils import deprecated
+from gensim.corpora.utils import MAX_WORDS
 try:
     from gensim.models.fasttext_inner import (  # noqa: F401
         train_batch_any,
-        MAX_WORDS_IN_BATCH,
         compute_ngrams,
         compute_ngrams_bytes,
         ft_hash_bytes,
@@ -286,7 +286,7 @@ def __init__(self, sentences=None, corpus_file=None, sg=0, hs=0, vector_size=100
                  window=5, min_count=5, max_vocab_size=None, word_ngrams=1, sample=1e-3, seed=1,
                  workers=3, min_alpha=0.0001, negative=5, ns_exponent=0.75, cbow_mean=1,
                  hashfxn=hash, epochs=5, null_word=0, min_n=3, max_n=6,
-                 sorted_vocab=1, bucket=2000000, trim_rule=None, batch_words=MAX_WORDS_IN_BATCH, callbacks=(),
+                 sorted_vocab=1, bucket=2000000, trim_rule=None, batch_words=MAX_WORDS, callbacks=(),
                  max_final_vocab=None):
         """Train, use and evaluate word representations learned using the method
         described in `Enriching Word Vectors with Subword Information `_,
@@ -301,12 +301,12 @@ def __init__(self, sentences=None, corpus_file=None, sg=0, hs=0, vector_size=100
         sentences : iterable of list of str, optional
             Can be simply a list of lists of tokens, but for larger corpora,
             consider an iterable that streams the sentences directly from disk/network.
-            See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus'
-            or :class:`~gensim.models.word2vec.LineSentence` in :mod:`~gensim.models.word2vec` module for such
+            See :class:`~gensim.corpora.utils.BrownCorpus`, :class:`~gensim.corpora.utils.Text8Corpus`
+            or :class:`~gensim.corpora.utils.LineSentence` in :mod:`~gensim.corpora.utils` module for such
             examples. If you don't supply `sentences`, the model is left uninitialized -- use if you plan
             to initialize it in some other way.
         corpus_file : str, optional
-            Path to a corpus file in :class:`~gensim.models.word2vec.LineSentence` format.
+            Path to a corpus file in :class:`~gensim.corpora.utils.LineSentence` format.
             You may use this argument instead of `sentences` to get performance boost. Only one of `sentences` or
             `corpus_file` arguments need to be passed (or none of them, in that case, the model is left
             uninitialized).
@@ -495,10 +495,11 @@ def build_vocab(self, corpus_iterable=None, corpus_file=None, update=False, prog
         corpus_iterable : iterable of list of str, optional
             Can be simply a list of lists of tokens, but for larger corpora,
             consider an iterable that streams the sentences directly from disk/network.
-            See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus`
-            or :class:`~gensim.models.word2vec.LineSentence` in :mod:`~gensim.models.word2vec` module for such examples.
+            See :class:`~gensim.corpora.utils.BrownCorpus`, :class:`~gensim.corpora.utils.Text8Corpus`
+            or :class:`~gensim.corpora.utils.LineSentence` in :mod:`~gensim.corpora.utils`
+            module for such examples.
         corpus_file : str, optional
-            Path to a corpus file in :class:`~gensim.models.word2vec.LineSentence` format.
+            Path to a corpus file in :class:`~gensim.corpora.utils.LineSentence` format.
             You may use this argument instead of `sentences` to get performance boost. Only one of `sentences` or
             `corpus_file` arguments need to be passed (not both of them).
         update : bool
@@ -620,8 +621,9 @@ def _do_train_job(self, sentences, alpha, inits):
         sentences : iterable of list of str
             Can be simply a list of lists of tokens, but for larger corpora,
             consider an iterable that streams the sentences directly from disk/network.
-            See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus`
-            or :class:`~gensim.models.word2vec.LineSentence` in :mod:`~gensim.models.word2vec` module for such examples.
+            See :class:`~gensim.corpora.utils.BrownCorpus`, :class:`~gensim.corpora.utils.Text8Corpus`
+            or :class:`~gensim.corpora.utils.LineSentence` in :mod:`~gensim.corpora.utils`
+            module for such examples.
         alpha : float
             The current learning rate.
         inits : tuple of (:class:`numpy.ndarray`, :class:`numpy.ndarray`)
@@ -659,10 +661,12 @@ def train(self, corpus_iterable=None, corpus_file=None, total_examples=None, tot
         sentences : iterable of list of str, optional
             The `sentences` iterable can be simply a list of lists of tokens, but for larger corpora,
             consider an iterable that streams the sentences directly from disk/network.
-            See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus`
-            or :class:`~gensim.models.word2vec.LineSentence` in :mod:`~gensim.models.word2vec` module for such examples.
+            See :class:`~gensim.corpora.utils.BrownCorpus`,
+            :class:`~gensim.corpora.utils.Text8Corpus`
+            or :class:`~gensim.corpora.utils.LineSentence` in :mod:`~gensim.corpora.utils`
+            module for such examples.
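As a hedged sketch of that format (file name and contents hypothetical), each line of such a file is one whitespace-separated, preprocessed sentence:

.. sourcecode:: pycon

    >>> from gensim.corpora.utils import LineSentence
    >>>
    >>> # 'corpus.txt' is a hypothetical file with one preprocessed sentence per line
    >>> for sentence in LineSentence('corpus.txt'):
    ...     pass  # each sentence is a list of str tokens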
corpus_file : str, optional - Path to a corpus file in :class:`~gensim.models.word2vec.LineSentence` format. + Path to a corpus file in :class:`~gensim.corpora.utils.LineSentence` format. If you use this argument instead of `sentences`, you must provide `total_words` argument as well. Only one of `sentences` or `corpus_file` arguments need to be passed (not both of them). total_examples : int diff --git a/gensim/models/phrases.py b/gensim/models/phrases.py index 959604a4fc..fb5a341577 100644 --- a/gensim/models/phrases.py +++ b/gensim/models/phrases.py @@ -20,7 +20,7 @@ .. sourcecode:: pycon >>> from gensim.test.utils import datapath - >>> from gensim.models.word2vec import Text8Corpus + >>> from gensim.corpora.utils import Text8Corpus >>> from gensim.models.phrases import Phrases >>> >>> # Create training corpus. Must be a sequence of sentences (e.g. an iterable or a generator). @@ -311,7 +311,7 @@ def export_phrases(self, sentences): .. sourcecode:: pycon >>> from gensim.test.utils import datapath - >>> from gensim.models.word2vec import Text8Corpus + >>> from gensim.corpora.utils import Text8Corpus >>> from gensim.models.phrases import Phrases >>> >>> sentences = Text8Corpus(datapath('testcorpus.txt')) @@ -423,8 +423,8 @@ def __init__( ---------- sentences : iterable of list of str, optional The `sentences` iterable can be simply a list, but for larger corpora, consider a generator that streams - the sentences directly from disk/network, See :class:`~gensim.models.word2vec.BrownCorpus`, - :class:`~gensim.models.word2vec.Text8Corpus` or :class:`~gensim.models.word2vec.LineSentence` + the sentences directly from disk/network, See :class:`~gensim.corpora.utils.BrownCorpus`, + :class:`~gensim.corpora.utils.Text8Corpus` or :class:`~gensim.corpora.utils.LineSentence` for such examples. min_count : float, optional Ignore all words and bigrams with total collected count lower than this value. @@ -474,7 +474,7 @@ def __init__( .. sourcecode:: pycon >>> from gensim.test.utils import datapath - >>> from gensim.models.word2vec import Text8Corpus + >>> from gensim.corpora.utils import Text8Corpus >>> from gensim.models.phrases import Phrases >>> >>> # Load corpus and train a model. @@ -563,8 +563,8 @@ def _learn_vocab( ---------- sentences : iterable of list of str The `sentences` iterable can be simply a list, but for larger corpora, consider a generator that streams - the sentences directly from disk/network, See :class:`~gensim.models.word2vec.BrownCorpus`, - :class:`~gensim.models.word2vec.Text8Corpus` or :class:`~gensim.models.word2vec.LineSentence` + the sentences directly from disk/network, See :class:`~gensim.corpora.utils.BrownCorpus`, + :class:`~gensim.corpora.utils.Text8Corpus` or :class:`~gensim.corpora.utils.LineSentence` for such examples. max_vocab_size : int Maximum size (number of tokens) of the vocabulary. Used to control pruning of less common words, @@ -628,7 +628,7 @@ def add_vocab(self, sentences): .. sourcecode:: pycon >>> from gensim.test.utils import datapath - >>> from gensim.models.word2vec import Text8Corpus + >>> from gensim.corpora.utils import Text8Corpus >>> from gensim.models.phrases import Phrases >>> >>> # Train a phrase detector from a text corpus. @@ -740,7 +740,7 @@ def __init__(self, phrases_model): .. sourcecode:: pycon >>> from gensim.test.utils import datapath - >>> from gensim.models.word2vec import Text8Corpus + >>> from gensim.corpora.utils import Text8Corpus >>> from gensim.models.phrases import Phrases >>> >>> # Load corpus and train a model. 
@@ -809,7 +809,7 @@ def score_candidate(self, word_a, word_b, in_between): infile = sys.argv[1] from gensim.models import Phrases # noqa:F811 for pickle - from gensim.models.word2vec import Text8Corpus + from gensim.corpora.utils import Text8Corpus sentences = Text8Corpus(infile) bigram = Phrases(sentences, min_count=5, threshold=100) diff --git a/gensim/models/translation_matrix.py b/gensim/models/translation_matrix.py index 528e3d6fa2..c6f4c39894 100644 --- a/gensim/models/translation_matrix.py +++ b/gensim/models/translation_matrix.py @@ -403,7 +403,7 @@ def __init__(self, source_lang_vec, target_lang_vec, tagged_docs=None, random_st Source Doc2Vec model. target_lang_vec : :class:`~gensim.models.doc2vec.Doc2Vec` Target Doc2Vec model. - tagged_docs : list of :class:`~gensim.models.doc2vec.TaggedDocument`, optional. + tagged_docs : list of :class:`~gensim.corpora.utils.TaggedDocument`, optional. Documents that will be used for training, both the source language document vector and target language document vector trained on those tagged documents. random_state : {None, int, array_like}, optional @@ -425,7 +425,7 @@ def train(self, tagged_docs): Parameters ---------- - tagged_docs : list of :class:`~gensim.models.doc2vec.TaggedDocument`, Documents + tagged_docs : list of :class:`~gensim.corpora.utils.TaggedDocument`, Documents that will be used for training, both the source language document vector and target language document vector trained on those tagged documents. diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index 806e087c56..af2fa47c43 100755 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ -50,8 +50,8 @@ Note the ``sentences`` iterable must be *restartable* (not just a generator), to allow the algorithm to stream over your dataset multiple times. For some examples of streamed iterables, -see :class:`~gensim.models.word2vec.BrownCorpus`, -:class:`~gensim.models.word2vec.Text8Corpus` or :class:`~gensim.models.word2vec.LineSentence`. +see :class:`~gensim.corpora.utils.BrownCorpus`, +:class:`~gensim.corpora.utils.Text8Corpus` or :class:`~gensim.corpora.utils.LineSentence`. If you save the model you can continue training it later: @@ -188,7 +188,6 @@ from collections import defaultdict, namedtuple from types import GeneratorType import threading -import itertools import copy from queue import Queue, Empty @@ -199,6 +198,9 @@ from gensim.models.keyedvectors import KeyedVectors, pseudorandom_weak_vector from gensim import utils, matutils +from gensim.corpora.utils import BrownCorpus, Text8Corpus +from gensim.corpora.utils import LineSentence, PathLineSentences +from gensim.corpora.utils import MAX_WORDS logger = logging.getLogger(__name__) @@ -208,7 +210,6 @@ train_batch_cbow, score_sentence_sg, score_sentence_cbow, - MAX_WORDS_IN_BATCH, FAST_VERSION, ) except ImportError: @@ -238,7 +239,7 @@ def __init__( self, sentences=None, corpus_file=None, vector_size=100, alpha=0.025, window=5, min_count=5, max_vocab_size=None, sample=1e-3, seed=1, workers=3, min_alpha=0.0001, sg=0, hs=0, negative=5, ns_exponent=0.75, cbow_mean=1, hashfxn=hash, epochs=5, null_word=0, - trim_rule=None, sorted_vocab=1, batch_words=MAX_WORDS_IN_BATCH, compute_loss=False, callbacks=(), + trim_rule=None, sorted_vocab=1, batch_words=MAX_WORDS, compute_loss=False, callbacks=(), comment=None, max_final_vocab=None, ): """Train, use and evaluate neural networks described in https://code.google.com/p/word2vec/. 
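A brief sketch of how the relocated MAX_WORDS constant plugs into the updated signature (the corpus below is illustrative):

.. sourcecode:: pycon

    >>> from gensim.corpora.utils import MAX_WORDS
    >>> from gensim.models.word2vec import Word2Vec
    >>>
    >>> sentences = [['human', 'interface', 'computer'], ['survey', 'user', 'computer']]
    >>> model = Word2Vec(sentences, vector_size=10, min_count=1, batch_words=MAX_WORDS)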
@@ -259,14 +260,15 @@ def __init__( sentences : iterable of iterables, optional The `sentences` iterable can be simply a list of lists of tokens, but for larger corpora, consider an iterable that streams the sentences directly from disk/network. - See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus` - or :class:`~gensim.models.word2vec.LineSentence` in :mod:`~gensim.models.word2vec` module for such examples. + See :class:`~gensim.corpora.utils.BrownCorpus`, :class:`~gensim.corpora.utils.Text8Corpus` + or :class:`~gensim.corpora.utils.LineSentence` in :mod:`~gensim.corpora.utils` module + for such examples. See also the `tutorial on data streaming in Python `_. If you don't supply `sentences`, the model is left uninitialized -- use if you plan to initialize it in some other way. corpus_file : str, optional - Path to a corpus file in :class:`~gensim.models.word2vec.LineSentence` format. + Path to a corpus file in :class:`~gensim.corpora.utils.LineSentence` format. You may use this argument instead of `sentences` to get performance boost. Only one of `sentences` or `corpus_file` arguments need to be passed (or none of them, in that case, the model is left uninitialized). vector_size : int, optional @@ -450,10 +452,10 @@ def build_vocab( corpus_iterable : iterable of list of str Can be simply a list of lists of tokens, but for larger corpora, consider an iterable that streams the sentences directly from disk/network. - See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus` - or :class:`~gensim.models.word2vec.LineSentence` module for such examples. + See :class:`~gensim.corpora.utils.BrownCorpus`, :class:`~gensim.corpora.utils.Text8Corpus` + or :class:`~gensim.corpora.utils.LineSentence` module for such examples. corpus_file : str, optional - Path to a corpus file in :class:`~gensim.models.word2vec.LineSentence` format. + Path to a corpus file in :class:`~gensim.corpora.utils.LineSentence` format. You may use this argument instead of `sentences` to get performance boost. Only one of `sentences` or `corpus_file` arguments need to be passed (not both of them). update : bool @@ -966,12 +968,13 @@ def train( corpus_iterable : iterable of list of str The `sentences` iterable can be simply a list of lists of tokens, but for larger corpora, consider an iterable that streams the sentences directly from disk/network. - See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus` - or :class:`~gensim.models.word2vec.LineSentence` in :mod:`~gensim.models.word2vec` module for such examples. + See :class:`~gensim.corpora.utils.BrownCorpus`, :class:`~gensim.corpora.utils.Text8Corpus` + or :class:`~gensim.corpora.utils.LineSentence` in :mod:`~gensim.corpora.utils` + module for such examples. See also the `tutorial on data streaming in Python `_. corpus_file : str, optional - Path to a corpus file in :class:`~gensim.models.word2vec.LineSentence` format. + Path to a corpus file in :class:`~gensim.corpora.utils.LineSentence` format. You may use this argument instead of `sentences` to get performance boost. Only one of `sentences` or `corpus_file` arguments need to be passed (not both of them). total_examples : int @@ -1080,7 +1083,7 @@ def _worker_loop_corpusfile( Parameters ---------- corpus_file : str - Path to a corpus file in :class:`~gensim.models.word2vec.LineSentence` format. + Path to a corpus file in :class:`~gensim.corpora.utils.LineSentence` format. 
thread_id : int Thread index starting from 0 to `number of workers - 1`. offset : int @@ -1299,7 +1302,7 @@ def _train_epoch_corpusfile( Parameters ---------- corpus_file : str - Path to a corpus file in :class:`~gensim.models.word2vec.LineSentence` format. + Path to a corpus file in :class:`~gensim.corpora.utils.LineSentence` format. cur_epoch : int, optional The current training epoch, needed to compute the training parameters for each job. For example in many implementations the learning rate would be dropping with the number of epochs. @@ -1669,8 +1672,9 @@ def score(self, sentences, total_sentences=int(1e6), chunksize=100, queue_factor sentences : iterable of list of str The `sentences` iterable can be simply a list of lists of tokens, but for larger corpora, consider an iterable that streams the sentences directly from disk/network. - See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus` - or :class:`~gensim.models.word2vec.LineSentence` in :mod:`~gensim.models.word2vec` module for such examples. + See :class:`~gensim.corpora.utils.BrownCorpus`, :class:`~gensim.corpora.utils.Text8Corpus` + or :class:`~gensim.corpora.utils.LineSentence` in :mod:`~gensim.corpora.utils` + module for such examples. total_sentences : int, optional Count of sentences. chunksize : int, optional @@ -1968,163 +1972,6 @@ def get_latest_training_loss(self): return self.running_training_loss -class BrownCorpus(object): - def __init__(self, dirname): - """Iterate over sentences from the `Brown corpus `_ - (part of `NLTK data `_). - - """ - self.dirname = dirname - - def __iter__(self): - for fname in os.listdir(self.dirname): - fname = os.path.join(self.dirname, fname) - if not os.path.isfile(fname): - continue - with utils.open(fname, 'rb') as fin: - for line in fin: - line = utils.to_unicode(line) - # each file line is a single sentence in the Brown corpus - # each token is WORD/POS_TAG - token_tags = [t.split('/') for t in line.split() if len(t.split('/')) == 2] - # ignore words with non-alphabetic tags like ",", "!" etc (punctuation, weird stuff) - words = ["%s/%s" % (token.lower(), tag[:2]) for token, tag in token_tags if tag[:2].isalpha()] - if not words: # don't bother sending out empty sentences - continue - yield words - - -class Text8Corpus(object): - def __init__(self, fname, max_sentence_length=MAX_WORDS_IN_BATCH): - """Iterate over sentences from the "text8" corpus, unzipped from http://mattmahoney.net/dc/text8.zip.""" - self.fname = fname - self.max_sentence_length = max_sentence_length - - def __iter__(self): - # the entire corpus is one gigantic line -- there are no sentence marks at all - # so just split the sequence of tokens arbitrarily: 1 sentence = 1000 tokens - sentence, rest = [], b'' - with utils.open(self.fname, 'rb') as fin: - while True: - text = rest + fin.read(8192) # avoid loading the entire file (=1 line) into RAM - if text == rest: # EOF - words = utils.to_unicode(text).split() - sentence.extend(words) # return the last chunk of words, too (may be shorter/longer) - if sentence: - yield sentence - break - last_token = text.rfind(b' ') # last token may have been split in two... 
keep for next iteration - words, rest = (utils.to_unicode(text[:last_token]).split(), - text[last_token:].strip()) if last_token >= 0 else ([], text) - sentence.extend(words) - while len(sentence) >= self.max_sentence_length: - yield sentence[:self.max_sentence_length] - sentence = sentence[self.max_sentence_length:] - - -class LineSentence(object): - def __init__(self, source, max_sentence_length=MAX_WORDS_IN_BATCH, limit=None): - """Iterate over a file that contains sentences: one line = one sentence. - Words must be already preprocessed and separated by whitespace. - - Parameters - ---------- - source : string or a file-like object - Path to the file on disk, or an already-open file object (must support `seek(0)`). - limit : int or None - Clip the file to the first `limit` lines. Do no clipping if `limit is None` (the default). - - Examples - -------- - .. sourcecode:: pycon - - >>> from gensim.test.utils import datapath - >>> sentences = LineSentence(datapath('lee_background.cor')) - >>> for sentence in sentences: - ... pass - - """ - self.source = source - self.max_sentence_length = max_sentence_length - self.limit = limit - - def __iter__(self): - """Iterate through the lines in the source.""" - try: - # Assume it is a file-like object and try treating it as such - # Things that don't have seek will trigger an exception - self.source.seek(0) - for line in itertools.islice(self.source, self.limit): - line = utils.to_unicode(line).split() - i = 0 - while i < len(line): - yield line[i: i + self.max_sentence_length] - i += self.max_sentence_length - except AttributeError: - # If it didn't work like a file, use it as a string filename - with utils.open(self.source, 'rb') as fin: - for line in itertools.islice(fin, self.limit): - line = utils.to_unicode(line).split() - i = 0 - while i < len(line): - yield line[i: i + self.max_sentence_length] - i += self.max_sentence_length - - -class PathLineSentences(object): - def __init__(self, source, max_sentence_length=MAX_WORDS_IN_BATCH, limit=None): - """Like :class:`~gensim.models.word2vec.LineSentence`, but process all files in a directory - in alphabetical order by filename. - - The directory must only contain files that can be read by :class:`gensim.models.word2vec.LineSentence`: - .bz2, .gz, and text files. Any file not ending with .bz2 or .gz is assumed to be a text file. - - The format of files (either text, or compressed text files) in the path is one sentence = one line, - with words already preprocessed and separated by whitespace. - - Warnings - -------- - Does **not recurse** into subdirectories. - - Parameters - ---------- - source : str - Path to the directory. - limit : int or None - Read only the first `limit` lines from each file. Read all if limit is None (the default). 
-
-        """
-        self.source = source
-        self.max_sentence_length = max_sentence_length
-        self.limit = limit
-
-        if os.path.isfile(self.source):
-            logger.debug('single file given as source, rather than a directory of files')
-            logger.debug('consider using models.word2vec.LineSentence for a single file')
-            self.input_files = [self.source]  # force code compatibility with list of files
-        elif os.path.isdir(self.source):
-            self.source = os.path.join(self.source, '')  # ensures os-specific slash at end of path
-            logger.info('reading directory %s', self.source)
-            self.input_files = os.listdir(self.source)
-            self.input_files = [self.source + filename for filename in self.input_files]  # make full paths
-            self.input_files.sort()  # makes sure it happens in filename order
-        else:  # not a file or a directory, then we can't do anything with it
-            raise ValueError('input is neither a file nor a path')
-        logger.info('files read into PathLineSentences:%s', '\n'.join(self.input_files))
-
-    def __iter__(self):
-        """iterate through the files"""
-        for file_name in self.input_files:
-            logger.info('reading file %s', file_name)
-            with utils.open(file_name, 'rb') as fin:
-                for line in itertools.islice(fin, self.limit):
-                    line = utils.to_unicode(line).split()
-                    i = 0
-                    while i < len(line):
-                        yield line[i:i + self.max_sentence_length]
-                        i += self.max_sentence_length
-
-
 class Word2VecVocab(utils.SaveLoad):
     """Obsolete class retained for now as load-compatibility state capture."""
     pass
@@ -2196,6 +2043,12 @@ def _assign_binary_codes(wv):
     logger.info("built huffman tree with maximum node depth %i", max_depth)
+# Aliases of these classes, so that code relying on their original location keeps working
+BrownCorpus = BrownCorpus
+Text8Corpus = Text8Corpus
+LineSentence = LineSentence
+PathLineSentences = PathLineSentences
 # Example: ./word2vec.py -train data.txt -output vec.txt -size 200 -window 5 -sample 1e-4 \
 # -negative 5 -hs 0 -binary 0 -cbow 1 -iter 3
 if __name__ == "__main__":
diff --git a/gensim/scripts/word2vec_standalone.py b/gensim/scripts/word2vec_standalone.py
index 57f4d907ba..9adc65cd3a 100644
--- a/gensim/scripts/word2vec_standalone.py
+++ b/gensim/scripts/word2vec_standalone.py
@@ -57,7 +57,8 @@
 import argparse
 from numpy import seterr
-from gensim.models.word2vec import Word2Vec, LineSentence  # avoid referencing __main__ in pickle
+from gensim.models.word2vec import Word2Vec  # avoid referencing __main__ in pickle
+from gensim.corpora.utils import LineSentence
 logger = logging.getLogger(__name__)
diff --git a/gensim/sklearn_api/d2vmodel.py b/gensim/sklearn_api/d2vmodel.py
index 370897bfdb..2a5ebd6622 100644
--- a/gensim/sklearn_api/d2vmodel.py
+++ b/gensim/sklearn_api/d2vmodel.py
@@ -150,7 +150,7 @@ def fit(self, X, y=None):
         Parameters
         ----------
-        X : {iterable of :class:`~gensim.models.doc2vec.TaggedDocument`, iterable of list of str}
+        X : {iterable of :class:`~gensim.corpora.utils.TaggedDocument`, iterable of list of str}
             A collection of tagged documents used for training the model.
         Returns
diff --git a/gensim/sklearn_api/ftmodel.py b/gensim/sklearn_api/ftmodel.py
index 7acd22cfc2..4f0d560f8d 100644
--- a/gensim/sklearn_api/ftmodel.py
+++ b/gensim/sklearn_api/ftmodel.py
@@ -179,8 +179,8 @@ def fit(self, X, y=None):
         X : iterable of iterables of str
             Can be simply a list of lists of tokens, but for larger corpora,
             consider an iterable that streams the sentences directly from disk/network.
- See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus` - or :class:`~gensim.models.word2vec.LineSentence` in :mod:`~gensim.models.word2vec` module for such examples. + See :class:`~gensim.corpora.utils.BrownCorpus`, :class:`~gensim.corpora.utils.Text8Corpus` + or :class:`~gensim.corpora.utils.LineSentence` in :mod:`~gensim.corpora.utils` module for such examples. Returns ------- diff --git a/gensim/sklearn_api/w2vmodel.py b/gensim/sklearn_api/w2vmodel.py index ae64b56e3e..3f3ac9c8f3 100644 --- a/gensim/sklearn_api/w2vmodel.py +++ b/gensim/sklearn_api/w2vmodel.py @@ -134,8 +134,9 @@ def fit(self, X, y=None): X : iterable of iterables of str The input corpus. X can be simply a list of lists of tokens, but for larger corpora, consider an iterable that streams the sentences directly from disk/network. - See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus` - or :class:`~gensim.models.word2vec.LineSentence` in :mod:`~gensim.models.word2vec` module for such examples. + See :class:`~gensim.corpora.utils.BrownCorpus`, :class:`~gensim.corpora.utils.Text8Corpus` + or :class:`~gensim.corpora.utils.LineSentence` in :mod:`~gensim.corpora.utils` + module for such examples. Returns -------
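Finally, a sketch of the backward-compatibility behaviour the module-level aliases above are meant to preserve -- both the old and the new import paths resolve to the same classes:

.. sourcecode:: pycon

    >>> from gensim.corpora.utils import TaggedDocument, LineSentence
    >>> import gensim.models.doc2vec as d2v
    >>> import gensim.models.word2vec as w2v
    >>>
    >>> d2v.TaggedDocument is TaggedDocument and w2v.LineSentence is LineSentence
    True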