diff --git a/.gitignore b/.gitignore
index c3ba120f37..09b652c23f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -76,3 +76,5 @@ data
 *.inv
 *.js
 docs/_images/
+*.c
+*.cpp
diff --git a/docs/src/apiref.rst b/docs/src/apiref.rst
index 8f5e8fc61e..464f8f192f 100644
--- a/docs/src/apiref.rst
+++ b/docs/src/apiref.rst
@@ -27,6 +27,7 @@ Modules:
    corpora/textcorpus
    corpora/ucicorpus
    corpora/wikicorpus
+   corpora/utils
    models/ldamodel
    models/ldamulticore
    models/nmf
diff --git a/docs/src/corpora/utils.rst b/docs/src/corpora/utils.rst
new file mode 100644
index 0000000000..14955d2e0f
--- /dev/null
+++ b/docs/src/corpora/utils.rst
@@ -0,0 +1,9 @@
+:mod:`corpora.utils` -- Implements various corpus utilities
+===========================================================
+
+.. automodule:: gensim.corpora.utils
+   :synopsis: Implements various corpus utilities
+   :members:
+   :inherited-members:
+   :undoc-members:
+   :show-inheritance:
diff --git a/gensim/corpora/utils.py b/gensim/corpora/utils.py
new file mode 100644
index 0000000000..71c3360f8a
--- /dev/null
+++ b/gensim/corpora/utils.py
@@ -0,0 +1,282 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+# Author: Gensim Contributors
+# Copyright (C) 2020 RaRe Technologies s.r.o.
+# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
+
+import logging
+import os
+import itertools
+from collections import namedtuple
+
+from gensim import utils
+
+logger = logging.getLogger(__name__)
+
+#: Shared cap, as a count of words/tokens, for texts that should not be arbitrarily long
+MAX_WORDS = 10000
+
+
+class LineSentence(object):
+    def __init__(self, source, max_sentence_length=MAX_WORDS, limit=None):
+        """Iterate over a file that contains sentences: one line = one sentence.
+        Words must be already preprocessed and separated by whitespace.
+
+        Parameters
+        ----------
+        source : string or a file-like object
+            Path to the file on disk, or an already-open file object (must support `seek(0)`).
+        max_sentence_length : int, optional
+            Maximum number of tokens per yielded sentence; longer lines are split into chunks of this size.
+        limit : int or None
+            Clip the file to the first `limit` lines. Do no clipping if `limit is None` (the default).
+
+        Examples
+        --------
+        .. sourcecode:: pycon
+
+            >>> from gensim.test.utils import datapath
+            >>> from gensim.corpora.utils import LineSentence
+            >>> sentences = LineSentence(datapath('lee_background.cor'))
+            >>> for sentence in sentences:
+            ...     pass
+
+        """
+        self.source = source
+        self.max_sentence_length = max_sentence_length
+        self.limit = limit
+
+    def __iter__(self):
+        """Iterate through the lines in the source."""
+        try:
+            # Assume it is a file-like object and try treating it as such
+            # Things that don't have seek will trigger an exception
+            self.source.seek(0)
+            for line in itertools.islice(self.source, self.limit):
+                line = utils.to_unicode(line).split()
+                i = 0
+                while i < len(line):
+                    yield line[i: i + self.max_sentence_length]
+                    i += self.max_sentence_length
+        except AttributeError:
+            # If it didn't work like a file, use it as a string filename
+            with utils.open(self.source, 'rb') as fin:
+                for line in itertools.islice(fin, self.limit):
+                    line = utils.to_unicode(line).split()
+                    i = 0
+                    while i < len(line):
+                        yield line[i: i + self.max_sentence_length]
+                        i += self.max_sentence_length
+
+
+class PathLineSentences(object):
+    def __init__(self, source, max_sentence_length=MAX_WORDS, limit=None):
+        """Like :class:`~gensim.corpora.utils.LineSentence`, but process all files in a directory
+        in alphabetical order by filename.
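For instance, a minimal usage sketch (the directory name here is hypothetical; each yielded sentence is a list of unicode tokens):

.. sourcecode:: pycon

    >>> from gensim.corpora.utils import PathLineSentences
    >>>
    >>> # 'my_corpus_dir' is a hypothetical directory of one-sentence-per-line text files
    >>> sentences = PathLineSentences('my_corpus_dir')
    >>> for sentence in sentences:
    ...     pass  # each sentence is a list of str tokens, at most max_sentence_length long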
+
+        The directory must only contain files that can be read by :class:`gensim.corpora.utils.LineSentence`:
+        .bz2, .gz, and text files. Any file not ending with .bz2 or .gz is assumed to be a text file.
+
+        The format of files (either text, or compressed text files) in the path is one sentence = one line,
+        with words already preprocessed and separated by whitespace.
+
+        Warnings
+        --------
+        Does **not recurse** into subdirectories.
+
+        Parameters
+        ----------
+        source : str
+            Path to the directory.
+        max_sentence_length : int, optional
+            Maximum number of tokens per yielded sentence; longer lines are split into chunks of this size.
+        limit : int or None
+            Read only the first `limit` lines from each file. Read all if limit is None (the default).
+
+        """
+        self.source = source
+        self.max_sentence_length = max_sentence_length
+        self.limit = limit
+
+        if os.path.isfile(self.source):
+            logger.debug('single file given as source, rather than a directory of files')
+            logger.debug('consider using gensim.corpora.utils.LineSentence for a single file')
+            self.input_files = [self.source]  # force code compatibility with list of files
+        elif os.path.isdir(self.source):
+            self.source = os.path.join(self.source, '')  # ensures os-specific slash at end of path
+            logger.info('reading directory %s', self.source)
+            self.input_files = os.listdir(self.source)
+            self.input_files = [self.source + filename for filename in self.input_files]  # make full paths
+            self.input_files.sort()  # makes sure it happens in filename order
+        else:  # not a file or a directory, then we can't do anything with it
+            raise ValueError('input is neither a file nor a path')
+        logger.info('files read into PathLineSentences: %s', '\n'.join(self.input_files))
+
+    def __iter__(self):
+        """Iterate through the files."""
+        for file_name in self.input_files:
+            logger.info('reading file %s', file_name)
+            with utils.open(file_name, 'rb') as fin:
+                for line in itertools.islice(fin, self.limit):
+                    line = utils.to_unicode(line).split()
+                    i = 0
+                    while i < len(line):
+                        yield line[i:i + self.max_sentence_length]
+                        i += self.max_sentence_length
+
+
+class Text8Corpus(object):
+    def __init__(self, fname, max_sentence_length=MAX_WORDS):
+        """Iterate over sentences from the "text8" corpus, unzipped from http://mattmahoney.net/dc/text8.zip."""
+        self.fname = fname
+        self.max_sentence_length = max_sentence_length
+
+    def __iter__(self):
+        # the entire corpus is one gigantic line -- there are no sentence marks at all
+        # so just split the sequence of tokens arbitrarily: 1 sentence = `max_sentence_length` tokens
+        sentence, rest = [], b''
+        with utils.open(self.fname, 'rb') as fin:
+            while True:
+                text = rest + fin.read(8192)  # avoid loading the entire file (=1 line) into RAM
+                if text == rest:  # EOF
+                    words = utils.to_unicode(text).split()
+                    sentence.extend(words)  # return the last chunk of words, too (may be shorter/longer)
+                    if sentence:
+                        yield sentence
+                    break
+                last_token = text.rfind(b' ')  # last token may have been split in two... keep for next iteration
+                words, rest = (utils.to_unicode(text[:last_token]).split(),
+                               text[last_token:].strip()) if last_token >= 0 else ([], text)
+                sentence.extend(words)
+                while len(sentence) >= self.max_sentence_length:
+                    yield sentence[:self.max_sentence_length]
+                    sentence = sentence[self.max_sentence_length:]
+
+
+class TaggedDocument(namedtuple('TaggedDocument', 'words tags')):
+    """Represents a document along with a tag, input document format for :class:`~gensim.models.doc2vec.Doc2Vec`.
+
+    A single document, made up of `words` (a list of unicode string tokens) and `tags` (a list of tokens).
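For example, a minimal sketch of constructing one directly (the tokens and tag are illustrative):

.. sourcecode:: pycon

    >>> from gensim.corpora.utils import TaggedDocument
    >>>
    >>> doc = TaggedDocument(words=['human', 'interface', 'computer'], tags=[0])
    >>> doc.words
    ['human', 'interface', 'computer']
    >>> doc.tags
    [0]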
+    Tags may be one or more unicode string tokens, but typical practice (which will also be the most memory-efficient)
+    is for the tags list to include a unique integer id as the only tag.
+
+    Replaces "sentence as a list of words" from :class:`gensim.models.word2vec.Word2Vec`.
+
+    """
+    def __str__(self):
+        """Human-readable representation of the object's state, used for debugging.
+
+        Returns
+        -------
+        str
+            Human-readable representation of the object's state (words and tags).
+
+        """
+        return '%s(%s, %s)' % (self.__class__.__name__, self.words, self.tags)
+
+
+class TaggedLineDocument(object):
+    def __init__(self, source):
+        """Iterate over a file that contains documents: one line = :class:`~gensim.corpora.utils.TaggedDocument` object.
+
+        Words are expected to be already preprocessed and separated by whitespace. Document tags are constructed
+        automatically from the document line number (each document gets a unique integer tag).
+
+        Parameters
+        ----------
+        source : string or a file-like object
+            Path to the file on disk, or an already-open file object (must support `seek(0)`).
+
+        Examples
+        --------
+        .. sourcecode:: pycon
+
+            >>> from gensim.test.utils import datapath
+            >>> from gensim.corpora.utils import TaggedLineDocument
+            >>>
+            >>> for document in TaggedLineDocument(datapath("head500.noblanks.cor")):
+            ...     pass
+
+        """
+        self.source = source
+
+    def __iter__(self):
+        """Iterate through the lines in the source.
+
+        Yields
+        ------
+        :class:`~gensim.corpora.utils.TaggedDocument`
+            Document from `source` specified in the constructor.
+
+        """
+        try:
+            # Assume it is a file-like object and try treating it as such
+            # Things that don't have seek will trigger an exception
+            self.source.seek(0)
+            for item_no, line in enumerate(self.source):
+                yield TaggedDocument(utils.to_unicode(line).split(), [item_no])
+        except AttributeError:
+            # If it didn't work like a file, use it as a string filename
+            with utils.open(self.source, 'rb') as fin:
+                for item_no, line in enumerate(fin):
+                    yield TaggedDocument(utils.to_unicode(line).split(), [item_no])
+
+
+class BrownCorpus(object):
+    def __init__(self, dirname):
+        """Iterate over sentences from the Brown corpus (part of NLTK data)."""
+        self.dirname = dirname
+
+    def __iter__(self):
+        for fname in os.listdir(self.dirname):
+            fname = os.path.join(self.dirname, fname)
+            if not os.path.isfile(fname):
+                continue
+            with utils.open(fname, 'rb') as fin:
+                for line in fin:
+                    line = utils.to_unicode(line)
+                    # each file line is a single sentence in the Brown corpus
+                    # each token is WORD/POS_TAG
+                    token_tags = [t.split('/') for t in line.split() if len(t.split('/')) == 2]
+                    # ignore words with non-alphabetic tags like ",", "!" etc (punctuation, weird stuff)
+                    words = ["%s/%s" % (token.lower(), tag[:2]) for token, tag in token_tags if tag[:2].isalpha()]
+                    if not words:  # don't bother sending out empty sentences
+                        continue
+                    yield words
+
+
+class TaggedBrownCorpus(object):
+    def __init__(self, dirname):
+        """Reader for the Brown corpus (part of NLTK data).
+
+        Parameters
+        ----------
+        dirname : str
+            Path to folder with Brown corpus.
+
+        """
+        self.dirname = dirname
+
+    def __iter__(self):
+        """Iterate through the corpus.
+
+        Yields
+        ------
+        :class:`~gensim.corpora.utils.TaggedDocument`
+            Document from `source`.
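A usage sketch, assuming a hypothetical local directory 'brown' holding the corpus files:

.. sourcecode:: pycon

    >>> from gensim.corpora.utils import TaggedBrownCorpus
    >>>
    >>> for document in TaggedBrownCorpus('brown'):
    ...     pass  # words are 'token/POS' strings; tags look like ['<fname>_SENT_<line_no>']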
+ + """ + for fname in os.listdir(self.dirname): + fname = os.path.join(self.dirname, fname) + if not os.path.isfile(fname): + continue + with utils.open(fname, 'rb') as fin: + for item_no, line in enumerate(fin): + line = utils.to_unicode(line) + # each file line is a single document in the Brown corpus + # each token is WORD/POS_TAG + token_tags = [t.split('/') for t in line.split() if len(t.split('/')) == 2] + # ignore words with non-alphabetic tags like ",", "!" etc (punctuation, weird stuff) + words = ["%s/%s" % (token.lower(), tag[:2]) for token, tag in token_tags if tag[:2].isalpha()] + if not words: # don't bother sending out empty documents + continue + yield TaggedDocument(words, ['%s_SENT_%s' % (fname, item_no)]) diff --git a/gensim/models/doc2vec.py b/gensim/models/doc2vec.py index 3c6578a261..195269ea42 100644 --- a/gensim/models/doc2vec.py +++ b/gensim/models/doc2vec.py @@ -35,7 +35,8 @@ .. sourcecode:: pycon >>> from gensim.test.utils import common_texts - >>> from gensim.models.doc2vec import Doc2Vec, TaggedDocument + >>> from gensim.models.doc2vec import Doc2Vec + >>> from gensim.corpora.utils import TaggedDocument >>> >>> documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(common_texts)] >>> model = Doc2Vec(documents, vector_size=5, window=2, min_count=1, workers=4) @@ -67,7 +68,7 @@ import logging import os -from collections import namedtuple, defaultdict +from collections import defaultdict from collections.abc import Iterable from timeit import default_timer @@ -80,6 +81,9 @@ from gensim.models import Word2Vec, FAST_VERSION # noqa: F401 from gensim.models.keyedvectors import KeyedVectors, pseudorandom_weak_vector +from gensim.corpora.utils import TaggedBrownCorpus +from gensim.corpora.utils import TaggedDocument, TaggedLineDocument + logger = logging.getLogger(__name__) try: @@ -117,28 +121,6 @@ def d2v_train_epoch_dm(model, corpus_file, offset, start_doctag, _cython_vocab, raise NotImplementedError("Training with corpus_file argument is not supported.") -class TaggedDocument(namedtuple('TaggedDocument', 'words tags')): - """Represents a document along with a tag, input document format for :class:`~gensim.models.doc2vec.Doc2Vec`. - - A single document, made up of `words` (a list of unicode string tokens) and `tags` (a list of tokens). - Tags may be one or more unicode string tokens, but typical practice (which will also be the most memory-efficient) - is for the tags list to include a unique integer id as the only tag. - - Replaces "sentence as a list of words" from :class:`gensim.models.word2vec.Word2Vec`. - - """ - def __str__(self): - """Human readable representation of the object's state, used for debugging. - - Returns - ------- - str - Human readable representation of the object's state (words and tags). - - """ - return '%s(%s, %s)' % (self.__class__.__name__, self.words, self.tags) - - @dataclass class Doctag: """A dataclass shape-compatible with keyedvectors.SimpleVocab, extended to record @@ -170,16 +152,16 @@ def __init__(self, documents=None, corpus_file=None, vector_size=100, dm_mean=No Parameters ---------- - documents : iterable of list of :class:`~gensim.models.doc2vec.TaggedDocument`, optional + documents : iterable of list of :class:`~gensim.corpora.utils.TaggedDocument`, optional Input corpus, can be simply a list of elements, but for larger corpora,consider an iterable that streams the documents directly from disk/network. 
If you don't supply `documents` (or `corpus_file`), the model is left uninitialized -- use if you plan to initialize it in some other way. corpus_file : str, optional - Path to a corpus file in :class:`~gensim.models.word2vec.LineSentence` format. + Path to a corpus file in :class:`~gensim.corpora.utils.LineSentence` format. You may use this argument instead of `documents` to get performance boost. Only one of `documents` or `corpus_file` arguments need to be passed (or none of them, in that case, the model is left uninitialized). Documents' tags are assigned automatically and are equal to line number, as in - :class:`~gensim.models.doc2vec.TaggedLineDocument`. + :class:`~gensim.corpora.utils.TaggedLineDocument`. dm : {1,0}, optional Defines the training algorithm. If `dm=1`, 'distributed memory' (PV-DM) is used. Otherwise, `distributed bag of words` (PV-DBOW) is employed. @@ -393,7 +375,7 @@ def _do_train_job(self, job, alpha, inits): Parameters ---------- - job : iterable of list of :class:`~gensim.models.doc2vec.TaggedDocument` + job : iterable of list of :class:`~gensim.corpora.utils.TaggedDocument` The corpus chunk to be used for training this batch. alpha : float Learning rate to be used for training this batch. @@ -448,15 +430,15 @@ def train(self, corpus_iterable=None, corpus_file=None, total_examples=None, tot Parameters ---------- - corpus_iterable : iterable of list of :class:`~gensim.models.doc2vec.TaggedDocument`, optional + corpus_iterable : iterable of list of :class:`~gensim.corpora.utils.TaggedDocument`, optional Can be simply a list of elements, but for larger corpora,consider an iterable that streams the documents directly from disk/network. If you don't supply `documents` (or `corpus_file`), the model is left uninitialized -- use if you plan to initialize it in some other way. corpus_file : str, optional - Path to a corpus file in :class:`~gensim.models.word2vec.LineSentence` format. + Path to a corpus file in :class:`~gensim.corpora.utils.LineSentence` format. You may use this argument instead of `documents` to get performance boost. Only one of `documents` or `corpus_file` arguments need to be passed (not both of them). Documents' tags are assigned automatically - and are equal to line number, as in :class:`~gensim.models.doc2vec.TaggedLineDocument`. + and are equal to line number, as in :class:`~gensim.corpora.utils.TaggedLineDocument`. total_examples : int, optional Count of documents. total_words : int, optional @@ -520,7 +502,7 @@ def _get_offsets_and_start_doctags_for_corpusfile(cls, corpus_file, workers): Parameters ---------- corpus_file : str - Path to a corpus file in :class:`~gensim.models.word2vec.LineSentence` format. + Path to a corpus file in :class:`~gensim.corpora.utils.LineSentence` format. workers : int Number of workers. @@ -558,7 +540,7 @@ def _raw_word_count(self, job): Parameters ---------- - job : iterable of list of :class:`~gensim.models.doc2vec.TaggedDocument` + job : iterable of list of :class:`~gensim.corpora.utils.TaggedDocument` Corpus chunk. Returns @@ -834,15 +816,16 @@ def build_vocab(self, corpus_iterable=None, corpus_file=None, update=False, prog Parameters ---------- - documents : iterable of list of :class:`~gensim.models.doc2vec.TaggedDocument`, optional - Can be simply a list of :class:`~gensim.models.doc2vec.TaggedDocument` elements, but for larger corpora, - consider an iterable that streams the documents directly from disk/network. 
- See :class:`~gensim.models.doc2vec.TaggedBrownCorpus` or :class:`~gensim.models.doc2vec.TaggedLineDocument` + documents : iterable of list of :class:`~gensim.corpora.utils.TaggedDocument`, optional + Can be simply a list of :class:`~gensim.corpora.utils.TaggedDocument` elements, but + for larger corpora, consider an iterable that streams the documents directly from disk/network. + See :class:`~gensim.corpora.utils.TaggedBrownCorpus` or + :class:`~gensim.corpora.utils.TaggedLineDocument` corpus_file : str, optional - Path to a corpus file in :class:`~gensim.models.word2vec.LineSentence` format. + Path to a corpus file in :class:`~gensim.corpora.utils.LineSentence` format. You may use this argument instead of `documents` to get performance boost. Only one of `documents` or `corpus_file` arguments need to be passed (not both of them). Documents' tags are assigned automatically - and are equal to a line number, as in :class:`~gensim.models.doc2vec.TaggedLineDocument`. + and are equal to a line number, as in :class:`~gensim.corpora.utils.TaggedLineDocument`. update : bool If true, the new words in `documents` will be added to model's vocab. progress_per : int @@ -1005,10 +988,10 @@ def scan_vocab(self, corpus_iterable=None, corpus_file=None, progress_per=10000, Parameters ---------- - documents : iterable of :class:`~gensim.models.doc2vec.TaggedDocument`, optional + documents : iterable of :class:`~gensim.corpora.utils.TaggedDocument`, optional The tagged documents used to create the vocabulary. Their tags can be either str tokens or ints (faster). corpus_file : str, optional - Path to a corpus file in :class:`~gensim.models.word2vec.LineSentence` format. + Path to a corpus file in :class:`~gensim.corpora.utils.LineSentence` format. You may use this argument instead of `documents` to get performance boost. Only one of `documents` or `corpus_file` arguments need to be passed (not both of them). progress_per : int @@ -1083,86 +1066,7 @@ class Doc2VecTrainables(utils.SaveLoad): """Obsolete class retained for now as load-compatibility state capture""" -class TaggedBrownCorpus(object): - def __init__(self, dirname): - """Reader for the `Brown corpus (part of NLTK data) `_. - - Parameters - ---------- - dirname : str - Path to folder with Brown corpus. - - """ - self.dirname = dirname - - def __iter__(self): - """Iterate through the corpus. - - Yields - ------ - :class:`~gensim.models.doc2vec.TaggedDocument` - Document from `source`. - - """ - for fname in os.listdir(self.dirname): - fname = os.path.join(self.dirname, fname) - if not os.path.isfile(fname): - continue - with utils.open(fname, 'rb') as fin: - for item_no, line in enumerate(fin): - line = utils.to_unicode(line) - # each file line is a single document in the Brown corpus - # each token is WORD/POS_TAG - token_tags = [t.split('/') for t in line.split() if len(t.split('/')) == 2] - # ignore words with non-alphabetic tags like ",", "!" etc (punctuation, weird stuff) - words = ["%s/%s" % (token.lower(), tag[:2]) for token, tag in token_tags if tag[:2].isalpha()] - if not words: # don't bother sending out empty documents - continue - yield TaggedDocument(words, ['%s_SENT_%s' % (fname, item_no)]) - - -class TaggedLineDocument(object): - def __init__(self, source): - """Iterate over a file that contains documents: one line = :class:`~gensim.models.doc2vec.TaggedDocument` object. - - Words are expected to be already preprocessed and separated by whitespace. 
Document tags are constructed
-        automatically from the document line number (each document gets a unique integer tag).
-
-        Parameters
-        ----------
-        source : string or a file-like object
-            Path to the file on disk, or an already-open file object (must support `seek(0)`).
-
-        Examples
-        --------
-        .. sourcecode:: pycon
-
-            >>> from gensim.test.utils import datapath
-            >>> from gensim.models.doc2vec import TaggedLineDocument
-            >>>
-            >>> for document in TaggedLineDocument(datapath("head500.noblanks.cor")):
-            ...     pass
-
-        """
-        self.source = source
-
-    def __iter__(self):
-        """Iterate through the lines in the source.
-
-        Yields
-        ------
-        :class:`~gensim.models.doc2vec.TaggedDocument`
-            Document from `source` specified in the constructor.
-
-        """
-        try:
-            # Assume it is a file-like object and try treating it as such
-            # Things that don't have seek will trigger an exception
-            self.source.seek(0)
-            for item_no, line in enumerate(self.source):
-                yield TaggedDocument(utils.to_unicode(line).split(), [item_no])
-        except AttributeError:
-            # If it didn't work like a file, use it as a string filename
-            with utils.open(self.source, 'rb') as fin:
-                for item_no, line in enumerate(fin):
-                    yield TaggedDocument(utils.to_unicode(line).split(), [item_no])
+# Aliases of these classes, so that code relying on their original location keeps working
+TaggedBrownCorpus = TaggedBrownCorpus
+TaggedDocument = TaggedDocument
+TaggedLineDocument = TaggedLineDocument
diff --git a/gensim/models/fasttext.py b/gensim/models/fasttext.py
index 64a21aafa7..03032c7de8 100644
--- a/gensim/models/fasttext.py
+++ b/gensim/models/fasttext.py
@@ -76,7 +76,7 @@
 This is OK for smaller datasets, but for larger datasets, we recommend streaming the file,
 for example from disk or the network.
 In Gensim, we refer to such datasets as "corpora" (singular "corpus"), and keep them
-in the format described in :class:`~gensim.models.word2vec.LineSentence`.
+in the format described in :class:`~gensim.corpora.utils.LineSentence`.
 Passing a corpus is simple:
 .. sourcecode:: pycon
@@ -264,10 +264,10 @@
 from gensim.models.keyedvectors import KeyedVectors
 from gensim import utils
 from gensim.utils import deprecated
+from gensim.corpora.utils import MAX_WORDS
 try:
     from gensim.models.fasttext_inner import (  # noqa: F401
         train_batch_any,
-        MAX_WORDS_IN_BATCH,
         compute_ngrams,
         compute_ngrams_bytes,
         ft_hash_bytes,
@@ -286,7 +286,7 @@ def __init__(self, sentences=None, corpus_file=None, sg=0, hs=0, vector_size=100
                  window=5, min_count=5, max_vocab_size=None, word_ngrams=1, sample=1e-3, seed=1,
                  workers=3, min_alpha=0.0001, negative=5, ns_exponent=0.75, cbow_mean=1,
                  hashfxn=hash, epochs=5, null_word=0, min_n=3, max_n=6,
-                 sorted_vocab=1, bucket=2000000, trim_rule=None, batch_words=MAX_WORDS_IN_BATCH, callbacks=(),
+                 sorted_vocab=1, bucket=2000000, trim_rule=None, batch_words=MAX_WORDS, callbacks=(),
                  max_final_vocab=None):
         """Train, use and evaluate word representations learned using the method
         described in `Enriching Word Vectors with Subword Information `_,
@@ -301,12 +301,12 @@ def __init__(self, sentences=None, corpus_file=None, sg=0, hs=0, vector_size=100
         sentences : iterable of list of str, optional
             Can be simply a list of lists of tokens, but for larger corpora,
             consider an iterable that streams the sentences directly from disk/network.
-            See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus'
-            or :class:`~gensim.models.word2vec.LineSentence` in :mod:`~gensim.models.word2vec` module for such
+            See :class:`~gensim.corpora.utils.BrownCorpus`, :class:`~gensim.corpora.utils.Text8Corpus`
+            or :class:`~gensim.corpora.utils.LineSentence` in :mod:`~gensim.corpora.utils` module for such
             examples. If you don't supply `sentences`, the model is left uninitialized -- use if you plan
             to initialize it in some other way.
         corpus_file : str, optional
-            Path to a corpus file in :class:`~gensim.models.word2vec.LineSentence` format.
+            Path to a corpus file in :class:`~gensim.corpora.utils.LineSentence` format.
             You may use this argument instead of `sentences` to get performance boost. Only one of `sentences` or
             `corpus_file` arguments need to be passed (or none of them, in that case, the model is left
             uninitialized).
@@ -495,10 +495,11 @@ def build_vocab(self, corpus_iterable=None, corpus_file=None, update=False, prog
         corpus_iterable : iterable of list of str, optional
             Can be simply a list of lists of tokens, but for larger corpora,
             consider an iterable that streams the sentences directly from disk/network.
-            See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus`
-            or :class:`~gensim.models.word2vec.LineSentence` in :mod:`~gensim.models.word2vec` module for such examples.
+            See :class:`~gensim.corpora.utils.BrownCorpus`, :class:`~gensim.corpora.utils.Text8Corpus`
+            or :class:`~gensim.corpora.utils.LineSentence` in :mod:`~gensim.corpora.utils`
+            module for such examples.
         corpus_file : str, optional
-            Path to a corpus file in :class:`~gensim.models.word2vec.LineSentence` format.
+            Path to a corpus file in :class:`~gensim.corpora.utils.LineSentence` format.
             You may use this argument instead of `sentences` to get performance boost. Only one of `sentences` or
             `corpus_file` arguments need to be passed (not both of them).
         update : bool
@@ -620,8 +621,9 @@ def _do_train_job(self, sentences, alpha, inits):
         sentences : iterable of list of str
             Can be simply a list of lists of tokens, but for larger corpora,
             consider an iterable that streams the sentences directly from disk/network.
-            See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus`
-            or :class:`~gensim.models.word2vec.LineSentence` in :mod:`~gensim.models.word2vec` module for such examples.
+            See :class:`~gensim.corpora.utils.BrownCorpus`, :class:`~gensim.corpora.utils.Text8Corpus`
+            or :class:`~gensim.corpora.utils.LineSentence` in :mod:`~gensim.corpora.utils`
+            module for such examples.
         alpha : float
             The current learning rate.
         inits : tuple of (:class:`numpy.ndarray`, :class:`numpy.ndarray`)
@@ -659,10 +661,12 @@ def train(self, corpus_iterable=None, corpus_file=None, total_examples=None, tot
         sentences : iterable of list of str, optional
             The `sentences` iterable can be simply a list of lists of tokens, but for larger corpora,
             consider an iterable that streams the sentences directly from disk/network.
-            See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus`
-            or :class:`~gensim.models.word2vec.LineSentence` in :mod:`~gensim.models.word2vec` module for such examples.
+            See :class:`~gensim.corpora.utils.BrownCorpus`,
+            :class:`~gensim.corpora.utils.Text8Corpus`
+            or :class:`~gensim.corpora.utils.LineSentence` in :mod:`~gensim.corpora.utils`
+            module for such examples.
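As a hedged sketch of that format (file name and contents hypothetical), each line of such a file is one whitespace-separated, preprocessed sentence:

.. sourcecode:: pycon

    >>> from gensim.corpora.utils import LineSentence
    >>>
    >>> # 'corpus.txt' is a hypothetical file with one preprocessed sentence per line
    >>> for sentence in LineSentence('corpus.txt'):
    ...     pass  # each sentence is a list of str tokens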
corpus_file : str, optional - Path to a corpus file in :class:`~gensim.models.word2vec.LineSentence` format. + Path to a corpus file in :class:`~gensim.corpora.utils.LineSentence` format. If you use this argument instead of `sentences`, you must provide `total_words` argument as well. Only one of `sentences` or `corpus_file` arguments need to be passed (not both of them). total_examples : int diff --git a/gensim/models/phrases.py b/gensim/models/phrases.py index 959604a4fc..fb5a341577 100644 --- a/gensim/models/phrases.py +++ b/gensim/models/phrases.py @@ -20,7 +20,7 @@ .. sourcecode:: pycon >>> from gensim.test.utils import datapath - >>> from gensim.models.word2vec import Text8Corpus + >>> from gensim.corpora.utils import Text8Corpus >>> from gensim.models.phrases import Phrases >>> >>> # Create training corpus. Must be a sequence of sentences (e.g. an iterable or a generator). @@ -311,7 +311,7 @@ def export_phrases(self, sentences): .. sourcecode:: pycon >>> from gensim.test.utils import datapath - >>> from gensim.models.word2vec import Text8Corpus + >>> from gensim.corpora.utils import Text8Corpus >>> from gensim.models.phrases import Phrases >>> >>> sentences = Text8Corpus(datapath('testcorpus.txt')) @@ -423,8 +423,8 @@ def __init__( ---------- sentences : iterable of list of str, optional The `sentences` iterable can be simply a list, but for larger corpora, consider a generator that streams - the sentences directly from disk/network, See :class:`~gensim.models.word2vec.BrownCorpus`, - :class:`~gensim.models.word2vec.Text8Corpus` or :class:`~gensim.models.word2vec.LineSentence` + the sentences directly from disk/network, See :class:`~gensim.corpora.utils.BrownCorpus`, + :class:`~gensim.corpora.utils.Text8Corpus` or :class:`~gensim.corpora.utils.LineSentence` for such examples. min_count : float, optional Ignore all words and bigrams with total collected count lower than this value. @@ -474,7 +474,7 @@ def __init__( .. sourcecode:: pycon >>> from gensim.test.utils import datapath - >>> from gensim.models.word2vec import Text8Corpus + >>> from gensim.corpora.utils import Text8Corpus >>> from gensim.models.phrases import Phrases >>> >>> # Load corpus and train a model. @@ -563,8 +563,8 @@ def _learn_vocab( ---------- sentences : iterable of list of str The `sentences` iterable can be simply a list, but for larger corpora, consider a generator that streams - the sentences directly from disk/network, See :class:`~gensim.models.word2vec.BrownCorpus`, - :class:`~gensim.models.word2vec.Text8Corpus` or :class:`~gensim.models.word2vec.LineSentence` + the sentences directly from disk/network, See :class:`~gensim.corpora.utils.BrownCorpus`, + :class:`~gensim.corpora.utils.Text8Corpus` or :class:`~gensim.corpora.utils.LineSentence` for such examples. max_vocab_size : int Maximum size (number of tokens) of the vocabulary. Used to control pruning of less common words, @@ -628,7 +628,7 @@ def add_vocab(self, sentences): .. sourcecode:: pycon >>> from gensim.test.utils import datapath - >>> from gensim.models.word2vec import Text8Corpus + >>> from gensim.corpora.utils import Text8Corpus >>> from gensim.models.phrases import Phrases >>> >>> # Train a phrase detector from a text corpus. @@ -740,7 +740,7 @@ def __init__(self, phrases_model): .. sourcecode:: pycon >>> from gensim.test.utils import datapath - >>> from gensim.models.word2vec import Text8Corpus + >>> from gensim.corpora.utils import Text8Corpus >>> from gensim.models.phrases import Phrases >>> >>> # Load corpus and train a model. 
@@ -809,7 +809,7 @@ def score_candidate(self, word_a, word_b, in_between): infile = sys.argv[1] from gensim.models import Phrases # noqa:F811 for pickle - from gensim.models.word2vec import Text8Corpus + from gensim.corpora.utils import Text8Corpus sentences = Text8Corpus(infile) bigram = Phrases(sentences, min_count=5, threshold=100) diff --git a/gensim/models/translation_matrix.py b/gensim/models/translation_matrix.py index 528e3d6fa2..c6f4c39894 100644 --- a/gensim/models/translation_matrix.py +++ b/gensim/models/translation_matrix.py @@ -403,7 +403,7 @@ def __init__(self, source_lang_vec, target_lang_vec, tagged_docs=None, random_st Source Doc2Vec model. target_lang_vec : :class:`~gensim.models.doc2vec.Doc2Vec` Target Doc2Vec model. - tagged_docs : list of :class:`~gensim.models.doc2vec.TaggedDocument`, optional. + tagged_docs : list of :class:`~gensim.corpora.utils.TaggedDocument`, optional. Documents that will be used for training, both the source language document vector and target language document vector trained on those tagged documents. random_state : {None, int, array_like}, optional @@ -425,7 +425,7 @@ def train(self, tagged_docs): Parameters ---------- - tagged_docs : list of :class:`~gensim.models.doc2vec.TaggedDocument`, Documents + tagged_docs : list of :class:`~gensim.corpora.utils.TaggedDocument`, Documents that will be used for training, both the source language document vector and target language document vector trained on those tagged documents. diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index 806e087c56..af2fa47c43 100755 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ -50,8 +50,8 @@ Note the ``sentences`` iterable must be *restartable* (not just a generator), to allow the algorithm to stream over your dataset multiple times. For some examples of streamed iterables, -see :class:`~gensim.models.word2vec.BrownCorpus`, -:class:`~gensim.models.word2vec.Text8Corpus` or :class:`~gensim.models.word2vec.LineSentence`. +see :class:`~gensim.corpora.utils.BrownCorpus`, +:class:`~gensim.corpora.utils.Text8Corpus` or :class:`~gensim.corpora.utils.LineSentence`. If you save the model you can continue training it later: @@ -188,7 +188,6 @@ from collections import defaultdict, namedtuple from types import GeneratorType import threading -import itertools import copy from queue import Queue, Empty @@ -199,6 +198,9 @@ from gensim.models.keyedvectors import KeyedVectors, pseudorandom_weak_vector from gensim import utils, matutils +from gensim.corpora.utils import BrownCorpus, Text8Corpus +from gensim.corpora.utils import LineSentence, PathLineSentences +from gensim.corpora.utils import MAX_WORDS logger = logging.getLogger(__name__) @@ -208,7 +210,6 @@ train_batch_cbow, score_sentence_sg, score_sentence_cbow, - MAX_WORDS_IN_BATCH, FAST_VERSION, ) except ImportError: @@ -238,7 +239,7 @@ def __init__( self, sentences=None, corpus_file=None, vector_size=100, alpha=0.025, window=5, min_count=5, max_vocab_size=None, sample=1e-3, seed=1, workers=3, min_alpha=0.0001, sg=0, hs=0, negative=5, ns_exponent=0.75, cbow_mean=1, hashfxn=hash, epochs=5, null_word=0, - trim_rule=None, sorted_vocab=1, batch_words=MAX_WORDS_IN_BATCH, compute_loss=False, callbacks=(), + trim_rule=None, sorted_vocab=1, batch_words=MAX_WORDS, compute_loss=False, callbacks=(), comment=None, max_final_vocab=None, ): """Train, use and evaluate neural networks described in https://code.google.com/p/word2vec/. 
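A brief sketch of how the relocated MAX_WORDS constant plugs into the updated signature (the corpus below is illustrative):

.. sourcecode:: pycon

    >>> from gensim.corpora.utils import MAX_WORDS
    >>> from gensim.models.word2vec import Word2Vec
    >>>
    >>> sentences = [['human', 'interface', 'computer'], ['survey', 'user', 'computer']]
    >>> model = Word2Vec(sentences, vector_size=10, min_count=1, batch_words=MAX_WORDS)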
@@ -259,14 +260,15 @@ def __init__( sentences : iterable of iterables, optional The `sentences` iterable can be simply a list of lists of tokens, but for larger corpora, consider an iterable that streams the sentences directly from disk/network. - See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus` - or :class:`~gensim.models.word2vec.LineSentence` in :mod:`~gensim.models.word2vec` module for such examples. + See :class:`~gensim.corpora.utils.BrownCorpus`, :class:`~gensim.corpora.utils.Text8Corpus` + or :class:`~gensim.corpora.utils.LineSentence` in :mod:`~gensim.corpora.utils` module + for such examples. See also the `tutorial on data streaming in Python `_. If you don't supply `sentences`, the model is left uninitialized -- use if you plan to initialize it in some other way. corpus_file : str, optional - Path to a corpus file in :class:`~gensim.models.word2vec.LineSentence` format. + Path to a corpus file in :class:`~gensim.corpora.utils.LineSentence` format. You may use this argument instead of `sentences` to get performance boost. Only one of `sentences` or `corpus_file` arguments need to be passed (or none of them, in that case, the model is left uninitialized). vector_size : int, optional @@ -450,10 +452,10 @@ def build_vocab( corpus_iterable : iterable of list of str Can be simply a list of lists of tokens, but for larger corpora, consider an iterable that streams the sentences directly from disk/network. - See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus` - or :class:`~gensim.models.word2vec.LineSentence` module for such examples. + See :class:`~gensim.corpora.utils.BrownCorpus`, :class:`~gensim.corpora.utils.Text8Corpus` + or :class:`~gensim.corpora.utils.LineSentence` module for such examples. corpus_file : str, optional - Path to a corpus file in :class:`~gensim.models.word2vec.LineSentence` format. + Path to a corpus file in :class:`~gensim.corpora.utils.LineSentence` format. You may use this argument instead of `sentences` to get performance boost. Only one of `sentences` or `corpus_file` arguments need to be passed (not both of them). update : bool @@ -966,12 +968,13 @@ def train( corpus_iterable : iterable of list of str The `sentences` iterable can be simply a list of lists of tokens, but for larger corpora, consider an iterable that streams the sentences directly from disk/network. - See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus` - or :class:`~gensim.models.word2vec.LineSentence` in :mod:`~gensim.models.word2vec` module for such examples. + See :class:`~gensim.corpora.utils.BrownCorpus`, :class:`~gensim.corpora.utils.Text8Corpus` + or :class:`~gensim.corpora.utils.LineSentence` in :mod:`~gensim.corpora.utils` + module for such examples. See also the `tutorial on data streaming in Python `_. corpus_file : str, optional - Path to a corpus file in :class:`~gensim.models.word2vec.LineSentence` format. + Path to a corpus file in :class:`~gensim.corpora.utils.LineSentence` format. You may use this argument instead of `sentences` to get performance boost. Only one of `sentences` or `corpus_file` arguments need to be passed (not both of them). total_examples : int @@ -1080,7 +1083,7 @@ def _worker_loop_corpusfile( Parameters ---------- corpus_file : str - Path to a corpus file in :class:`~gensim.models.word2vec.LineSentence` format. + Path to a corpus file in :class:`~gensim.corpora.utils.LineSentence` format. 
thread_id : int Thread index starting from 0 to `number of workers - 1`. offset : int @@ -1299,7 +1302,7 @@ def _train_epoch_corpusfile( Parameters ---------- corpus_file : str - Path to a corpus file in :class:`~gensim.models.word2vec.LineSentence` format. + Path to a corpus file in :class:`~gensim.corpora.utils.LineSentence` format. cur_epoch : int, optional The current training epoch, needed to compute the training parameters for each job. For example in many implementations the learning rate would be dropping with the number of epochs. @@ -1669,8 +1672,9 @@ def score(self, sentences, total_sentences=int(1e6), chunksize=100, queue_factor sentences : iterable of list of str The `sentences` iterable can be simply a list of lists of tokens, but for larger corpora, consider an iterable that streams the sentences directly from disk/network. - See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus` - or :class:`~gensim.models.word2vec.LineSentence` in :mod:`~gensim.models.word2vec` module for such examples. + See :class:`~gensim.corpora.utils.BrownCorpus`, :class:`~gensim.corpora.utils.Text8Corpus` + or :class:`~gensim.corpora.utils.LineSentence` in :mod:`~gensim.corpora.utils` + module for such examples. total_sentences : int, optional Count of sentences. chunksize : int, optional @@ -1968,163 +1972,6 @@ def get_latest_training_loss(self): return self.running_training_loss -class BrownCorpus(object): - def __init__(self, dirname): - """Iterate over sentences from the `Brown corpus `_ - (part of `NLTK data `_). - - """ - self.dirname = dirname - - def __iter__(self): - for fname in os.listdir(self.dirname): - fname = os.path.join(self.dirname, fname) - if not os.path.isfile(fname): - continue - with utils.open(fname, 'rb') as fin: - for line in fin: - line = utils.to_unicode(line) - # each file line is a single sentence in the Brown corpus - # each token is WORD/POS_TAG - token_tags = [t.split('/') for t in line.split() if len(t.split('/')) == 2] - # ignore words with non-alphabetic tags like ",", "!" etc (punctuation, weird stuff) - words = ["%s/%s" % (token.lower(), tag[:2]) for token, tag in token_tags if tag[:2].isalpha()] - if not words: # don't bother sending out empty sentences - continue - yield words - - -class Text8Corpus(object): - def __init__(self, fname, max_sentence_length=MAX_WORDS_IN_BATCH): - """Iterate over sentences from the "text8" corpus, unzipped from http://mattmahoney.net/dc/text8.zip.""" - self.fname = fname - self.max_sentence_length = max_sentence_length - - def __iter__(self): - # the entire corpus is one gigantic line -- there are no sentence marks at all - # so just split the sequence of tokens arbitrarily: 1 sentence = 1000 tokens - sentence, rest = [], b'' - with utils.open(self.fname, 'rb') as fin: - while True: - text = rest + fin.read(8192) # avoid loading the entire file (=1 line) into RAM - if text == rest: # EOF - words = utils.to_unicode(text).split() - sentence.extend(words) # return the last chunk of words, too (may be shorter/longer) - if sentence: - yield sentence - break - last_token = text.rfind(b' ') # last token may have been split in two... 
keep for next iteration - words, rest = (utils.to_unicode(text[:last_token]).split(), - text[last_token:].strip()) if last_token >= 0 else ([], text) - sentence.extend(words) - while len(sentence) >= self.max_sentence_length: - yield sentence[:self.max_sentence_length] - sentence = sentence[self.max_sentence_length:] - - -class LineSentence(object): - def __init__(self, source, max_sentence_length=MAX_WORDS_IN_BATCH, limit=None): - """Iterate over a file that contains sentences: one line = one sentence. - Words must be already preprocessed and separated by whitespace. - - Parameters - ---------- - source : string or a file-like object - Path to the file on disk, or an already-open file object (must support `seek(0)`). - limit : int or None - Clip the file to the first `limit` lines. Do no clipping if `limit is None` (the default). - - Examples - -------- - .. sourcecode:: pycon - - >>> from gensim.test.utils import datapath - >>> sentences = LineSentence(datapath('lee_background.cor')) - >>> for sentence in sentences: - ... pass - - """ - self.source = source - self.max_sentence_length = max_sentence_length - self.limit = limit - - def __iter__(self): - """Iterate through the lines in the source.""" - try: - # Assume it is a file-like object and try treating it as such - # Things that don't have seek will trigger an exception - self.source.seek(0) - for line in itertools.islice(self.source, self.limit): - line = utils.to_unicode(line).split() - i = 0 - while i < len(line): - yield line[i: i + self.max_sentence_length] - i += self.max_sentence_length - except AttributeError: - # If it didn't work like a file, use it as a string filename - with utils.open(self.source, 'rb') as fin: - for line in itertools.islice(fin, self.limit): - line = utils.to_unicode(line).split() - i = 0 - while i < len(line): - yield line[i: i + self.max_sentence_length] - i += self.max_sentence_length - - -class PathLineSentences(object): - def __init__(self, source, max_sentence_length=MAX_WORDS_IN_BATCH, limit=None): - """Like :class:`~gensim.models.word2vec.LineSentence`, but process all files in a directory - in alphabetical order by filename. - - The directory must only contain files that can be read by :class:`gensim.models.word2vec.LineSentence`: - .bz2, .gz, and text files. Any file not ending with .bz2 or .gz is assumed to be a text file. - - The format of files (either text, or compressed text files) in the path is one sentence = one line, - with words already preprocessed and separated by whitespace. - - Warnings - -------- - Does **not recurse** into subdirectories. - - Parameters - ---------- - source : str - Path to the directory. - limit : int or None - Read only the first `limit` lines from each file. Read all if limit is None (the default). 
-
-        """
-        self.source = source
-        self.max_sentence_length = max_sentence_length
-        self.limit = limit
-
-        if os.path.isfile(self.source):
-            logger.debug('single file given as source, rather than a directory of files')
-            logger.debug('consider using models.word2vec.LineSentence for a single file')
-            self.input_files = [self.source]  # force code compatibility with list of files
-        elif os.path.isdir(self.source):
-            self.source = os.path.join(self.source, '')  # ensures os-specific slash at end of path
-            logger.info('reading directory %s', self.source)
-            self.input_files = os.listdir(self.source)
-            self.input_files = [self.source + filename for filename in self.input_files]  # make full paths
-            self.input_files.sort()  # makes sure it happens in filename order
-        else:  # not a file or a directory, then we can't do anything with it
-            raise ValueError('input is neither a file nor a path')
-        logger.info('files read into PathLineSentences:%s', '\n'.join(self.input_files))
-
-    def __iter__(self):
-        """iterate through the files"""
-        for file_name in self.input_files:
-            logger.info('reading file %s', file_name)
-            with utils.open(file_name, 'rb') as fin:
-                for line in itertools.islice(fin, self.limit):
-                    line = utils.to_unicode(line).split()
-                    i = 0
-                    while i < len(line):
-                        yield line[i:i + self.max_sentence_length]
-                        i += self.max_sentence_length
-
-
 class Word2VecVocab(utils.SaveLoad):
     """Obsolete class retained for now as load-compatibility state capture."""
     pass
@@ -2196,6 +2043,12 @@ def _assign_binary_codes(wv):
     logger.info("built huffman tree with maximum node depth %i", max_depth)
+# Aliases of these classes, so that code relying on their original location keeps working
+BrownCorpus = BrownCorpus
+Text8Corpus = Text8Corpus
+LineSentence = LineSentence
+PathLineSentences = PathLineSentences
 # Example: ./word2vec.py -train data.txt -output vec.txt -size 200 -window 5 -sample 1e-4 \
 # -negative 5 -hs 0 -binary 0 -cbow 1 -iter 3
 if __name__ == "__main__":
diff --git a/gensim/scripts/word2vec_standalone.py b/gensim/scripts/word2vec_standalone.py
index 57f4d907ba..9adc65cd3a 100644
--- a/gensim/scripts/word2vec_standalone.py
+++ b/gensim/scripts/word2vec_standalone.py
@@ -57,7 +57,8 @@
 import argparse
 from numpy import seterr
-from gensim.models.word2vec import Word2Vec, LineSentence  # avoid referencing __main__ in pickle
+from gensim.models.word2vec import Word2Vec  # avoid referencing __main__ in pickle
+from gensim.corpora.utils import LineSentence
 logger = logging.getLogger(__name__)
diff --git a/gensim/sklearn_api/d2vmodel.py b/gensim/sklearn_api/d2vmodel.py
index 370897bfdb..2a5ebd6622 100644
--- a/gensim/sklearn_api/d2vmodel.py
+++ b/gensim/sklearn_api/d2vmodel.py
@@ -150,7 +150,7 @@ def fit(self, X, y=None):
         Parameters
         ----------
-        X : {iterable of :class:`~gensim.models.doc2vec.TaggedDocument`, iterable of list of str}
+        X : {iterable of :class:`~gensim.corpora.utils.TaggedDocument`, iterable of list of str}
             A collection of tagged documents used for training the model.
         Returns
diff --git a/gensim/sklearn_api/ftmodel.py b/gensim/sklearn_api/ftmodel.py
index 7acd22cfc2..4f0d560f8d 100644
--- a/gensim/sklearn_api/ftmodel.py
+++ b/gensim/sklearn_api/ftmodel.py
@@ -179,8 +179,8 @@ def fit(self, X, y=None):
         X : iterable of iterables of str
             Can be simply a list of lists of tokens, but for larger corpora,
             consider an iterable that streams the sentences directly from disk/network.
- See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus` - or :class:`~gensim.models.word2vec.LineSentence` in :mod:`~gensim.models.word2vec` module for such examples. + See :class:`~gensim.corpora.utils.BrownCorpus`, :class:`~gensim.corpora.utils.Text8Corpus` + or :class:`~gensim.corpora.utils.LineSentence` in :mod:`~gensim.corpora.utils` module for such examples. Returns ------- diff --git a/gensim/sklearn_api/w2vmodel.py b/gensim/sklearn_api/w2vmodel.py index ae64b56e3e..3f3ac9c8f3 100644 --- a/gensim/sklearn_api/w2vmodel.py +++ b/gensim/sklearn_api/w2vmodel.py @@ -134,8 +134,9 @@ def fit(self, X, y=None): X : iterable of iterables of str The input corpus. X can be simply a list of lists of tokens, but for larger corpora, consider an iterable that streams the sentences directly from disk/network. - See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus` - or :class:`~gensim.models.word2vec.LineSentence` in :mod:`~gensim.models.word2vec` module for such examples. + See :class:`~gensim.corpora.utils.BrownCorpus`, :class:`~gensim.corpora.utils.Text8Corpus` + or :class:`~gensim.corpora.utils.LineSentence` in :mod:`~gensim.corpora.utils` + module for such examples. Returns -------
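Finally, a sketch of the backward-compatibility behaviour the module-level aliases above are meant to preserve -- both the old and the new import paths resolve to the same classes:

.. sourcecode:: pycon

    >>> from gensim.corpora.utils import TaggedDocument, LineSentence
    >>> import gensim.models.doc2vec as d2v
    >>> import gensim.models.word2vec as w2v
    >>>
    >>> d2v.TaggedDocument is TaggedDocument and w2v.LineSentence is LineSentence
    True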