dev/stats/get_important_pr_candidates.py

#!/usr/bin/env python3
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
from __future__ import annotations

import heapq
import logging
import math
import pickle
import re
import textwrap
from datetime import datetime
from functools import cached_property
from typing import TYPE_CHECKING

import pendulum
import rich_click as click
from github import Github, UnknownObjectException
from rich.console import Console

if TYPE_CHECKING:
    from github.PullRequest import PullRequest

logger = logging.getLogger(__name__)

console = Console(width=400, color_system="standard")

option_github_token = click.option(
    "--github-token",
    type=str,
    required=True,
    help=textwrap.dedent(
        """
        A GitHub token is required, and can also be provided by setting the GITHUB_TOKEN env variable.
        Can be generated with:
        https://github.com/settings/tokens/new?description=Read%20issues&scopes=repo:status"""
    ),
    envvar="GITHUB_TOKEN",
)


class PrStat:
    PROVIDER_SCORE = 0.8
    REGULAR_SCORE = 1.0
    REVIEW_INTERACTION_VALUE = 2.0
    COMMENT_INTERACTION_VALUE = 1.0
    REACTION_INTERACTION_VALUE = 0.5

    def __init__(self, g, pull_request: PullRequest):
        self.g = g
        self.pull_request = pull_request
        self.title = pull_request.title
        self._users: set[str] = set()
        self.len_comments: int = 0
        self.comment_reactions: int = 0
        self.issue_nums: list[int] = []
        self.len_issue_comments: int = 0
        self.num_issue_comments: int = 0
        self.num_issue_reactions: int = 0
        self.num_comments: int = 0
        self.num_conv_comments: int = 0
        self.num_protm: int = 0
        self.conv_comment_reactions: int = 0
        self.interaction_score = 1.0

    @property
    def label_score(self) -> float:
        """assigns label score"""
        labels = self.pull_request.labels
        for label in labels:
            if "provider" in label.name:
                return PrStat.PROVIDER_SCORE
        return PrStat.REGULAR_SCORE

    def calc_comments(self):
        """counts reviewer comments, checks for #protm tag, counts rxns"""
        for comment in self.pull_request.get_comments():
            self._users.add(comment.user.login)
            lowercase_body = comment.body.lower()
            if "protm" in lowercase_body:
                self.num_protm += 1
            self.num_comments += 1
            if comment.body is not None:
                self.len_comments += len(comment.body)
            for reaction in comment.get_reactions():
                self._users.add(reaction.user.login)
                self.comment_reactions += 1

    def calc_conv_comments(self):
        """counts conversational comments, checks for #protm tag, counts rxns"""
        for conv_comment in self.pull_request.get_issue_comments():
            self._users.add(conv_comment.user.login)
            lowercase_body = conv_comment.body.lower()
            if "protm" in lowercase_body:
                self.num_protm += 1
            self.num_conv_comments += 1
            for reaction in conv_comment.get_reactions():
                self._users.add(reaction.user.login)
                self.conv_comment_reactions += 1
            if conv_comment.body is not None:
                self.len_issue_comments += len(conv_comment.body)

    @cached_property
    def num_reviews(self) -> int:
        """counts reviews"""
        num_reviews = 0
        for review in self.pull_request.get_reviews():
            self._users.add(review.user.login)
            num_reviews += 1
        return num_reviews

    def issues(self):
        """finds issues in PR"""
        if self.pull_request.body is not None:
            regex = r"(?<=closes: #|elated: #)\d{5}"
            issue_strs = re.findall(regex, self.pull_request.body)
            self.issue_nums = [eval(s) for s in issue_strs]

    def issue_reactions(self):
        """counts reactions to issue comments"""
        if self.issue_nums:
            repo = self.g.get_repo("apache/airflow")
            for num in self.issue_nums:
                try:
                    issue = repo.get_issue(num)
                except UnknownObjectException:
                    continue
                for reaction in issue.get_reactions():
                    self._users.add(reaction.user.login)
                    self.num_issue_reactions += 1
                for issue_comment in issue.get_comments():
                    self.num_issue_comments += 1
                    self._users.add(issue_comment.user.login)
                    if issue_comment.body is not None:
                        self.len_issue_comments += len(issue_comment.body)

    def calc_interaction_score(self):
        """calculates interaction score"""
        interactions = (
            self.num_comments + self.num_conv_comments + self.num_issue_comments
        ) * PrStat.COMMENT_INTERACTION_VALUE
        interactions += (
            self.comment_reactions + self.conv_comment_reactions + self.num_issue_reactions
        ) * PrStat.REACTION_INTERACTION_VALUE
        self.interaction_score += interactions + self.num_reviews * PrStat.REVIEW_INTERACTION_VALUE

    @cached_property
    def num_interacting_users(self) -> int:
        _ = self.interaction_score  # make sure the _users set is populated
        return len(self._users)

    @cached_property
    def num_changed_files(self) -> float:
        return self.pull_request.changed_files

    @cached_property
    def body_length(self) -> int:
        if self.pull_request.body is not None:
            return len(self.pull_request.body)
        else:
            return 0

    @cached_property
    def num_additions(self) -> int:
        return self.pull_request.additions

    @cached_property
    def num_deletions(self) -> int:
        return self.pull_request.deletions

    @property
    def change_score(self) -> float:
        lineactions = self.num_additions + self.num_deletions
        actionsperfile = lineactions / self.num_changed_files
        if self.num_changed_files > 10:
            if actionsperfile > 20:
                return 1.2
            if actionsperfile < 5:
                return 0.7
        return 1.0

    @cached_property
    def comment_length(self) -> int:
        rev_length = 0
        for comment in self.pull_request.get_review_comments():
            if comment.body is not None:
                rev_length += len(comment.body)
        return self.len_comments + self.len_issue_comments + rev_length

    @property
    def length_score(self) -> float:
        score = 1.0
        if self.len_comments > 3000:
            score *= 1.3
        if self.len_comments < 200:
            score *= 0.8
        if self.body_length > 2000:
            score *= 1.4
        if self.body_length < 1000:
            score *= 0.8
        if self.body_length < 20:
            score *= 0.4
        return round(score, 3)

    def adjust_interaction_score(self):
        self.interaction_score *= min(self.num_protm + 1, 3)

    @property
    def score(self):
        #
        # Current principles:
        #
        # Provider and dev-tools PRs should be considered, but should matter 20% less.
        #
        # A review is worth twice as much as a comment, and a comment is worth twice as much as a reaction.
        #
        # If a PR changed more than 20 files, it should matter less the more files there are.
        #
        # If the avg # of changed lines/file is < 5 and there are > 10 files, it should matter 30% less.
        # If the avg # of changed lines/file is > 20 and there are > 10 files, it should matter 20% more.
        #
        # If there are over 3000 characters worth of comments, the PR should matter 30% more.
        # If there are fewer than 200 characters worth of comments, the PR should matter 20% less.
        # If the body contains over 2000 characters, the PR should matter 40% more.
        # If the body contains fewer than 1000 characters, the PR should matter 20% less.
        #
        # Weight PRs with protm tags more heavily:
        # If there is at least one protm tag, multiply the interaction score by the number of tags, up to 3.
        #
        self.calc_comments()
        self.calc_conv_comments()
        self.calc_interaction_score()
        self.adjust_interaction_score()

        return round(
            self.interaction_score
            * self.label_score
            * self.length_score
            * self.change_score
            / (math.log10(self.num_changed_files) if self.num_changed_files > 20 else 1),
            3,
        )

    def __str__(self) -> str:
        if self.num_protm > 0:
            return (
                "[magenta]##Tagged PR## [/]"
                f"Score: {self.score:.2f}: PR{self.pull_request.number}"
                f"by @{self.pull_request.user.login}: "
                f'"{self.pull_request.title}". '
                f"Merged at {self.pull_request.merged_at}: {self.pull_request.html_url}"
            )
        else:
            return (
                f"Score: {self.score:.2f}: PR{self.pull_request.number}"
                f"by @{self.pull_request.user.login}: "
                f'"{self.pull_request.title}". '
                f"Merged at {self.pull_request.merged_at}: {self.pull_request.html_url}"
            )

    def verboseStr(self) -> str:
        if self.num_protm > 0:
            console.print("********************* Tagged with '#protm' *********************", style="magenta")
        return (
            f"-- Created at [bright_blue]{self.pull_request.created_at}[/], "
            f"merged at [bright_blue]{self.pull_request.merged_at}[/]\n"
            f"-- Label score: [green]{self.label_score}[/]\n"
            f"-- Length score: [green]{self.length_score}[/] "
            f"(body length: {self.body_length}, "
            f"comment length: {self.len_comments})\n"
            f"-- Interaction score: [green]{self.interaction_score}[/] "
            f"(users interacting: {self.num_interacting_users}, "
            f"reviews: {self.num_reviews}, "
            f"review comments: {self.num_comments}, "
            f"review reactions: {self.comment_reactions}, "
            f"non-review comments: {self.num_conv_comments}, "
            f"non-review reactions: {self.conv_comment_reactions}, "
            f"issue comments: {self.num_issue_comments}, "
            f"issue reactions: {self.num_issue_reactions})\n"
            f"-- Change score: [green]{self.change_score}[/] "
            f"(changed files: {self.num_changed_files}, "
            f"additions: {self.num_additions}, "
            f"deletions: {self.num_deletions})\n"
            f"-- Overall score: [red]{self.score:.2f}[/]\n"
        )


DAYS_BACK = 5
# Current (or previous during first few days of the next month)
DEFAULT_BEGINNING_OF_MONTH = pendulum.now().subtract(days=DAYS_BACK).start_of("month")
DEFAULT_END_OF_MONTH = DEFAULT_BEGINNING_OF_MONTH.end_of("month").add(days=1)

MAX_PR_CANDIDATES = 500
DEFAULT_TOP_PRS = 10


@click.command()
@option_github_token  # TODO: this should only be required if --load isn't provided
@click.option(
    "--date-start", type=click.DateTime(formats=["%Y-%m-%d"]), default=str(DEFAULT_BEGINNING_OF_MONTH.date())
)
@click.option(
    "--date-end", type=click.DateTime(formats=["%Y-%m-%d"]), default=str(DEFAULT_END_OF_MONTH.date())
)
@click.option("--top-number", type=int, default=DEFAULT_TOP_PRS, help="The number of PRs to select")
@click.option("--save", type=click.File("wb"), help="Save PR data to a pickle file")
@click.option("--load", type=click.File("rb"), help="Load PR data from a file and recalculate scores")
@click.option("--verbose", is_flag="True", help="Print scoring details")
@click.option(
    "--rate-limit",
    is_flag="True",
    help="Print API rate limit reset time using system time, and requests remaining",
)
def main(
    github_token: str,
    date_start: datetime,
    save: click.File(),  # type: ignore
    load: click.File(),  # type: ignore
    date_end: datetime,
    top_number: int,
    verbose: bool,
    rate_limit: bool,
):
    g = Github(github_token)

    if rate_limit:
        r = g.get_rate_limit()
        requests_remaining: int = r.core.remaining
        console.print(
            f"[blue]GitHub API Rate Limit Info\n"
            f"[green]Requests remaining: [red]{requests_remaining}\n"
            f"[green]Reset time: [blue]{r.core.reset.astimezone()}"
        )

    selected_prs: list[PrStat] = []
    if load:
        console.print("Loading PRs from cache and recalculating scores.")
        selected_prs = pickle.load(load, encoding="bytes")
        for issue_num, pr in enumerate(selected_prs, 1):
            console.print(
                f"[green]Loading PR: #{pr.pull_request.number} `{pr.pull_request.title}`.[/]"
                f" Score: {pr.score}."
                f" Url: {pr.pull_request.html_url}"
            )

            if verbose:
                console.print(pr.verboseStr())

    else:
        console.print(f"Finding best candidate PRs between {date_start} and {date_end}.")
        repo = g.get_repo("apache/airflow")
        commits = repo.get_commits(since=date_start, until=date_end)
        pulls: list[PullRequest] = [pull for commit in commits for pull in commit.get_pulls()]
        scores: dict = {}
        for issue_num, pull in enumerate(pulls, 1):
            p = PrStat(g=g, pull_request=pull)  # type: ignore
            scores.update({pull.number: [p.score, pull.title]})
            console.print(
                f"[green]Selecting PR: #{pull.number} `{pull.title}` as candidate.[/]"
                f" Score: {scores[pull.number][0]}."
                f" Url: {pull.html_url}"
            )

            if verbose:
                console.print(p.verboseStr())

            selected_prs.append(p)
            if issue_num == MAX_PR_CANDIDATES:
                console.print(f"[red]Reached {MAX_PR_CANDIDATES}. Stopping")
                break

    console.print(f"Top {top_number} out of {issue_num} PRs:")
    for pr_scored in heapq.nlargest(top_number, scores.items(), key=lambda s: s[1]):
        console.print(f"[green] * PR #{pr_scored[0]}: {pr_scored[1][1]}. Score: [magenta]{pr_scored[1][0]}")

    if save:
        pickle.dump(selected_prs, save)

    if rate_limit:
        r = g.get_rate_limit()
        console.print(
            f"[blue]GitHub API Rate Limit Info\n"
            f"[green]Requests remaining: [red]{r.core.remaining}\n"
            f"[green]Requests made: [red]{requests_remaining - r.core.remaining}\n"
            f"[green]Reset time: [blue]{r.core.reset.astimezone()}"
        )


if __name__ == "__main__":
    main()