fix: rework exclusion parsing to fix #1779

nedbat · nedbat · commit b3794bec9ff9 · 2024-05-15T07:45:42.000-04:00
diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs
diff --git a/CHANGES.rst b/CHANGES.rst
@@ -23,13 +23,17 @@ upgrading your version of coverage.py.
 Unreleased
 ----------
 
+- Fix: nested matches of exclude patterns could exclude too much code, as
+  reported in `issue 1779`_.  This is now fixed.
+
 - In the HTML report, the filter term and "hide covered" checkbox settings are
   remembered between viewings, thanks to `Daniel Diniz <pull 1776_>`_.
 
 - Python 3.13.0b1 is supported.
 
 
 .. _pull 1776: https://github.com/nedbat/coveragepy/pull/1776
+.. _issue 1779: https://github.com/nedbat/coveragepy/issues/1779
 
 
 .. scriv-start-here
diff --git a/coverage/parser.py b/coverage/parser.py
@@ -25,7 +25,7 @@
 from coverage.bytecode import code_objects
 from coverage.debug import short_stack
 from coverage.exceptions import NoSource, NotPython
-from coverage.misc import join_regex, nice_pair
+from coverage.misc import nice_pair
 from coverage.phystokens import generate_tokens
 from coverage.types import TArc, TLineNo
 
@@ -62,8 +62,8 @@ def __init__(
 
         self.exclude = exclude
 
-        # The text lines of the parsed code.
-        self.lines: list[str] = self.text.split("\n")
+        # The parsed AST of the text.
+        self._ast_root: ast.AST | None = None
 
         # The normalized line numbers of the statements in the code. Exclusions
         # are taken into account, and statements are adjusted to their first
@@ -101,19 +101,16 @@ def __init__(
         self._all_arcs: set[TArc] | None = None
         self._missing_arc_fragments: TArcFragments | None = None
 
-    @functools.lru_cache()
-    def lines_matching(self, *regexes: str) -> set[TLineNo]:
-        """Find the lines matching one of a list of regexes.
+    def lines_matching(self, regex: str) -> set[TLineNo]:
+        """Find the lines matching a regex.
 
-        Returns a set of line numbers, the lines that contain a match for one
-        of the regexes in `regexes`.  The entire line needn't match, just a
-        part of it.
+        Returns a set of line numbers, the lines that contain a match for
+        `regex`.  The entire line needn't match, just a part of it.
 
         """
-        combined = join_regex(regexes)
-        regex_c = re.compile(combined)
+        regex_c = re.compile(regex)
         matches = set()
-        for i, ltext in enumerate(self.lines, start=1):
+        for i, ltext in enumerate(self.text.split("\n"), start=1):
             if regex_c.search(ltext):
                 matches.add(self._multiline.get(i, i))
         return matches
@@ -127,26 +124,18 @@ def _raw_parse(self) -> None:
         # Find lines which match an exclusion pattern.
         if self.exclude:
             self.raw_excluded = self.lines_matching(self.exclude)
+            self.excluded = set(self.raw_excluded)
 
-        # Tokenize, to find excluded suites, to find docstrings, and to find
-        # multi-line statements.
-
-        # The last token seen. Start with INDENT to get module docstrings
-        prev_toktype: int = token.INDENT
         # The current number of indents.
         indent: int = 0
         # An exclusion comment will exclude an entire clause at this indent.
         exclude_indent: int = 0
         # Are we currently excluding lines?
         excluding: bool = False
-        # Are we excluding decorators now?
-        excluding_decorators: bool = False
         # The line number of the first line in a multi-line statement.
         first_line: int = 0
         # Is the file empty?
         empty: bool = True
-        # Is this the first token on a line?
-        first_on_line: bool = True
         # Parenthesis (and bracket) nesting level.
         nesting: int = 0
 
@@ -162,42 +151,22 @@ def _raw_parse(self) -> None:
                 indent += 1
             elif toktype == token.DEDENT:
                 indent -= 1
-            elif toktype == token.NAME:
-                if ttext == "class":
-                    # Class definitions look like branches in the bytecode, so
-                    # we need to exclude them.  The simplest way is to note the
-                    # lines with the "class" keyword.
-                    self.raw_classdefs.add(slineno)
             elif toktype == token.OP:
                 if ttext == ":" and nesting == 0:
                     should_exclude = (
-                        self.raw_excluded.intersection(range(first_line, elineno + 1))
-                        or excluding_decorators
+                        self.excluded.intersection(range(first_line, elineno + 1))
                     )
                     if not excluding and should_exclude:
                         # Start excluding a suite.  We trigger off of the colon
                         # token so that the #pragma comment will be recognized on
                         # the same line as the colon.
-                        self.raw_excluded.add(elineno)
+                        self.excluded.add(elineno)
                         exclude_indent = indent
                         excluding = True
-                        excluding_decorators = False
-                elif ttext == "@" and first_on_line:
-                    # A decorator.
-                    if elineno in self.raw_excluded:
-                        excluding_decorators = True
-                    if excluding_decorators:
-                        self.raw_excluded.add(elineno)
                 elif ttext in "([{":
                     nesting += 1
                 elif ttext in ")]}":
                     nesting -= 1
-            elif toktype == token.STRING:
-                if prev_toktype == token.INDENT:
-                    # Strings that are first on an indented line are docstrings.
-                    # (a trick from trace.py in the stdlib.) This works for
-                    # 99.9999% of cases.
-                    self.raw_docstrings.update(range(slineno, elineno+1))
             elif toktype == token.NEWLINE:
                 if first_line and elineno != first_line:
                     # We're at the end of a line, and we've ended on a
@@ -206,7 +175,6 @@ def _raw_parse(self) -> None:
                     for l in range(first_line, elineno+1):
                         self._multiline[l] = first_line
                 first_line = 0
-                first_on_line = True
 
             if ttext.strip() and toktype != tokenize.COMMENT:
                 # A non-white-space token.
@@ -218,10 +186,7 @@ def _raw_parse(self) -> None:
                     if excluding and indent <= exclude_indent:
                         excluding = False
                     if excluding:
-                        self.raw_excluded.add(elineno)
-                    first_on_line = False
-
-            prev_toktype = toktype
+                        self.excluded.add(elineno)
 
         # Find the starts of the executable statements.
         if not empty:
@@ -234,6 +199,34 @@ def _raw_parse(self) -> None:
         if env.PYBEHAVIOR.module_firstline_1 and self._multiline:
             self._multiline[1] = min(self.raw_statements)
 
+        self.excluded = self.first_lines(self.excluded)
+
+        # AST lets us find classes, docstrings, and decorator-affected
+        # functions and classes.
+        assert self._ast_root is not None
+        for node in ast.walk(self._ast_root):
+            # Find class definitions.
+            if isinstance(node, ast.ClassDef):
+                self.raw_classdefs.add(node.lineno)
+            # Find docstrings.
+            if isinstance(node, (ast.ClassDef, ast.FunctionDef, ast.AsyncFunctionDef, ast.Module)):
+                if node.body:
+                    first = node.body[0]
+                    if (
+                        isinstance(first, ast.Expr)
+                        and isinstance(first.value, ast.Constant)
+                        and isinstance(first.value.value, str)
+                    ):
+                        self.raw_docstrings.update(
+                            range(first.lineno, cast(int, first.end_lineno) + 1)
+                        )
+            # Exclusions carry from decorators and signatures to the bodies of
+            # functions and classes.
+            if isinstance(node, (ast.ClassDef, ast.FunctionDef, ast.AsyncFunctionDef)):
+                first_line = min((d.lineno for d in node.decorator_list), default=node.lineno)
+                if self.excluded.intersection(range(first_line, node.lineno + 1)):
+                    self.excluded.update(range(first_line, cast(int, node.end_lineno) + 1))
+
     @functools.lru_cache(maxsize=1000)
     def first_line(self, lineno: TLineNo) -> TLineNo:
         """Return the first line number of the statement including `lineno`."""
@@ -268,19 +261,14 @@ def parse_source(self) -> None:
 
         """
         try:
+            self._ast_root = ast.parse(self.text)
             self._raw_parse()
-        except (tokenize.TokenError, IndentationError, SyntaxError) as err:
-            if hasattr(err, "lineno"):
-                lineno = err.lineno         # IndentationError
-            else:
-                lineno = err.args[1][0]     # TokenError
+        except (IndentationError, SyntaxError) as err:
             raise NotPython(
                 f"Couldn't parse '{self.filename}' as Python source: " +
-                f"{err.args[0]!r} at line {lineno}",
+                f"{err.args[0]!r} at line {err.lineno}",
             ) from err
 
-        self.excluded = self.first_lines(self.raw_excluded)
-
         ignore = self.excluded | self.raw_docstrings
         starts = self.raw_statements - ignore
         self.statements = self.first_lines(starts) - ignore
@@ -303,7 +291,8 @@ def _analyze_ast(self) -> None:
         `_all_arcs` is the set of arcs in the code.
 
         """
-        aaa = AstArcAnalyzer(self.text, self.raw_statements, self._multiline)
+        assert self._ast_root is not None
+        aaa = AstArcAnalyzer(self._ast_root, self.raw_statements, self._multiline)
         aaa.analyze()
 
         self._all_arcs = set()
@@ -403,14 +392,9 @@ def __init__(
             self.code = code
         else:
             assert filename is not None
-            try:
-                self.code = compile(text, filename, "exec", dont_inherit=True)
-            except SyntaxError as synerr:
-                raise NotPython(
-                    "Couldn't parse '%s' as Python source: '%s' at line %d" % (
-                        filename, synerr.msg, synerr.lineno or 0,
-                    ),
-                ) from synerr
+            # We only get here if earlier ast parsing succeeded, so no need to
+            # catch errors.
+            self.code = compile(text, filename, "exec", dont_inherit=True)
 
     def child_parsers(self) -> Iterable[ByteParser]:
         """Iterate over all the code objects nested within this one.
@@ -685,11 +669,11 @@ class AstArcAnalyzer:
 
     def __init__(
         self,
-        text: str,
+        root_node: ast.AST,
         statements: set[TLineNo],
         multiline: dict[TLineNo, TLineNo],
     ) -> None:
-        self.root_node = ast.parse(text)
+        self.root_node = root_node
         # TODO: I think this is happening in too many places.
         self.statements = {multiline.get(l, l) for l in statements}
         self.multiline = multiline
diff --git a/coverage/python.py b/coverage/python.py
@@ -206,8 +206,10 @@ def translate_arcs(self, arcs: Iterable[TArc]) -> set[TArc]:
     def no_branch_lines(self) -> set[TLineNo]:
         assert self.coverage is not None
         no_branch = self.parser.lines_matching(
-            join_regex(self.coverage.config.partial_list),
-            join_regex(self.coverage.config.partial_always_list),
+            join_regex(
+                self.coverage.config.partial_list
+                + self.coverage.config.partial_always_list
+            )
         )
         return no_branch
 
diff --git a/lab/parser.py b/lab/parser.py
@@ -80,7 +80,7 @@ def one_file(self, options, filename):
 
         if options.dis:
             print("Main code:")
-            disassemble(pyparser)
+            disassemble(pyparser.text)
 
         arcs = pyparser.arcs()
 
@@ -96,7 +96,7 @@ def one_file(self, options, filename):
                 exit_counts = pyparser.exit_counts()
 
                 for lineno, ltext in enumerate(pyparser.lines, start=1):
-                    marks = [' ', ' ', ' ', ' ', ' ']
+                    marks = [' '] * 6
                     a = ' '
                     if lineno in pyparser.raw_statements:
                         marks[0] = '-'
@@ -110,7 +110,13 @@ def one_file(self, options, filename):
                     if lineno in pyparser.raw_classdefs:
                         marks[3] = 'C'
                     if lineno in pyparser.raw_excluded:
-                        marks[4] = 'x'
+                        marks[4] = 'X'
+                    elif lineno in pyparser.excluded:
+                        marks[4] = '×'
+                    if lineno in pyparser._multiline.values():
+                        marks[5] = 'o'
+                    elif lineno in pyparser._multiline.keys():
+                        marks[5] = '.'
 
                     if arc_chars:
                         a = arc_chars[lineno].ljust(arc_width)
@@ -173,13 +179,13 @@ def all_code_objects(code):
         yield code
 
 
-def disassemble(pyparser):
+def disassemble(text):
     """Disassemble code, for ad-hoc experimenting."""
 
-    code = compile(pyparser.text, "", "exec", dont_inherit=True)
+    code = compile(text, "", "exec", dont_inherit=True)
     for code_obj in all_code_objects(code):
-        if pyparser.text:
-            srclines = pyparser.text.splitlines()
+        if text:
+            srclines = text.splitlines()
         else:
             srclines = None
         print("\n%s: " % code_obj)
diff --git a/tests/test_coverage.py b/tests/test_coverage.py
@@ -664,12 +664,12 @@ def test_module_docstring(self) -> None:
             [2, 3],
         )
         self.check_coverage("""\
-            # Start with a comment, because it changes the behavior(!?)
+            # Start with a comment, even though it doesn't change the behavior.
             '''I am a module docstring.'''
             a = 3
             b = 4
             """,
-            [2, 3, 4],
+            [3, 4],
         )
 
 
diff --git a/tests/test_parser.py b/tests/test_parser.py

Original file line number	Diff line number	Diff line change
`@@ -664,12 +664,12 @@ def test_module_docstring(self) -> None:`
`664`	`664`	`[2, 3],`
`665`	`665`	`)`
`666`	`666`	`self.check_coverage("""\`
`667`		`- # Start with a comment, because it changes the behavior(!?)`
	`667`	`+ # Start with a comment, even though it doesn't change the behavior.`
`668`	`668`	`'''I am a module docstring.'''`
`669`	`669`	`a = 3`
`670`	`670`	`b = 4`
`671`	`671`	`""",`
`672`		`- [2, 3, 4],`
	`672`	`+ [3, 4],`
`673`	`673`	`)`
`674`	`674`
`675`	`675`