Function bodies 354 total

Name: Aljefra Mapper analysis
Creator: Repobility
License: https://repobility.com/legal/terms/

BracketDistribution class · python · L688-L705 (18 LOC)

src/ncaa_eval/evaluation/simulation.py

class BracketDistribution:
    """Score distribution statistics from Monte Carlo simulation.

    Bundles the raw per-simulation scores with their summary statistics
    (selected percentiles, mean, standard deviation) and a histogram, as
    assembled by :func:`compute_bracket_distribution`.

    Attributes:
        scores: Raw per-simulation scores, shape ``(n_simulations,)``.
        percentiles: Mapping of percentile → value for keys 5, 25, 50, 75, 95.
        mean: Mean score across simulations.
        std: Standard deviation of scores.
        histogram_bins: Histogram bin edges, shape ``(n_bins + 1,)``.
        histogram_counts: Histogram counts, shape ``(n_bins,)``.
    """

    # NOTE(review): only field declarations are visible here; the keyword
    # construction in compute_bracket_distribution suggests a dataclass-style
    # decorator sits above this class — confirm before reordering fields.

    # Raw scores, one entry per simulation.
    scores: npt.NDArray[np.float64]
    # Percentile (5, 25, 50, 75, 95) → score value.
    percentiles: dict[int, float]
    # Arithmetic mean of ``scores``.
    mean: float
    # Standard deviation of ``scores``.
    std: float
    # Bin edges; one more entry than ``histogram_counts``.
    histogram_bins: npt.NDArray[np.float64]
    # Number of scores falling in each bin.
    histogram_counts: npt.NDArray[np.int64]

MostLikelyBracket class · python · L709-L727 (19 LOC)

src/ncaa_eval/evaluation/simulation.py

class MostLikelyBracket:
    """Maximum-likelihood bracket from greedy traversal.

    Result record holding the predicted winner of every game, the predicted
    champion, and the log-likelihood of the whole bracket.

    Attributes:
        winners: Tuple of team indices for each game's predicted winner,
            in **round-major order** matching ``SimulationResult.sim_winners``
            rows — all Round-of-64 games first (indices 0–31 for 64 teams),
            then Round-of-32 (32–47), through to the championship (index 62).
            63 entries for a 64-team bracket.  Pass directly to
            :func:`score_bracket_against_sims` as ``chosen_bracket``.
        champion_team_id: Canonical team ID of the predicted champion
            (from BracketStructure.team_ids[champion_index]).
        log_likelihood: Sum of ``log(max(P[left, right], P[right, left]))``
            across all games.
    """

    # Predicted winner (team index, not canonical ID) of each game, round-major.
    winners: tuple[int, ...]
    # Canonical team ID (not bracket index) of the predicted champion.
    champion_team_id: int
    # Log-probability of the chosen bracket; a sum of logs of probabilities.
    log_likelihood: float

compute_advancement_probs function · python · L735-L793 (59 LOC)

src/ncaa_eval/evaluation/simulation.py

def compute_advancement_probs(
    bracket: BracketStructure,
    P: npt.NDArray[np.float64],
) -> npt.NDArray[np.float64]:
    """Compute exact advancement probabilities via the Phylourny algorithm.

    Post-order traversal of the bracket tree computing Win Probability
    Vectors (WPVs) at each internal node using the formula:

        ``R = V ⊙ (P^T · W) + W ⊙ (P^T · V)``

    Args:
        bracket: Tournament bracket structure.
        P: Pairwise win probability matrix, shape ``(n, n)``.

    Returns:
        Advancement probabilities, shape ``(n, n_rounds)``.
        ``adv_probs[i, r]`` = P(team i wins their game in round r).

    Raises:
        ValueError: If ``n`` is not a power of 2 or does not match
            the bracket's team count.
    """
    n = P.shape[0]
    if n == 0 or (n & (n - 1)) != 0:
        msg = f"n must be a positive power of 2, got {n}"
        raise ValueError(msg)

    expected_teams = len(bracket.team_ids)
    if n != expected_teams:
        msg = f"P

compute_expected_points function · python · L796-L815 (20 LOC)

src/ncaa_eval/evaluation/simulation.py

def compute_expected_points(
    adv_probs: npt.NDArray[np.float64],
    scoring_rule: ScoringRule,
) -> npt.NDArray[np.float64]:
    """Return each team's Expected Points under *scoring_rule*.

    The per-round point vector is built by calling
    ``scoring_rule.points_per_round(r)`` for every round index ``r`` (one per
    column of *adv_probs*); the result is the matrix-vector product of the
    advancement probabilities with that vector.

    Args:
        adv_probs: Advancement probabilities, shape ``(n, n_rounds)``.
        scoring_rule: Scoring rule providing per-round point values.

    Returns:
        Expected Points per team, shape ``(n,)``.
    """
    round_indices = range(adv_probs.shape[1])
    round_values = np.array(
        list(map(scoring_rule.points_per_round, round_indices)),
        dtype=np.float64,
    )
    # (n, n_rounds) · (n_rounds,) → (n,)
    expected: npt.NDArray[np.float64] = np.matmul(adv_probs, round_values)
    return expected

compute_expected_points_seed_diff function · python · L818-L894 (77 LOC)

src/ncaa_eval/evaluation/simulation.py

def compute_expected_points_seed_diff(
    adv_probs: npt.NDArray[np.float64],
    bracket: BracketStructure,
    P: npt.NDArray[np.float64],
    seed_map: dict[int, int],
) -> npt.NDArray[np.float64]:
    """Compute Expected Points with seed-difference upset bonus.

    Extends standard EP by adding per-matchup seed-diff bonus.  For each
    internal bracket node at round *r*, the bonus contribution for team *i*
    beating opponent *j* is::

        P(i reaches node) * P(i beats j) * P(j reaches node) * bonus(seed_i, seed_j)

    where ``bonus = |seed_i - seed_j|`` when ``seed_i > seed_j`` (upset), else 0.

    Uses ``SeedDiffBonusScoring`` base points for standard round points and
    a post-order traversal of the bracket tree (reusing WPVs from
    :func:`compute_advancement_probs` logic) for bonus computation.

    Args:
        adv_probs: Advancement probabilities, shape ``(n, n_rounds)``.
        bracket: Tournament bracket structure (for tree traversal).
        P: Pairwise win

compute_most_likely_bracket function · python · L897-L963 (67 LOC)

src/ncaa_eval/evaluation/simulation.py

def compute_most_likely_bracket(
    bracket: BracketStructure,
    P: npt.NDArray[np.float64],
) -> MostLikelyBracket:
    """Compute the maximum-likelihood bracket via greedy traversal.

    At each internal node, picks the team with the higher win probability
    (``argmax(P[left, right])``).  Returns the full bracket of winners and
    the log-likelihood of the chosen bracket.

    The ``winners`` array is in **round-major order** — the same order as
    ``SimulationResult.sim_winners`` rows — so it can be passed directly to
    :func:`score_bracket_against_sims`:
    all Round-of-64 games first (indices 0–31), then Round-of-32 (32–47),
    through to the championship game (index 62).

    Args:
        bracket: Tournament bracket structure.
        P: Pairwise win probability matrix, shape ``(n, n)``.

    Returns:
        :class:`MostLikelyBracket` with winners, champion, and log-likelihood.
    """
    log_likelihood = 0.0
    # Collect (round_index, game_order_within_round, win

compute_bracket_distribution function · python · L966-L992 (27 LOC)

src/ncaa_eval/evaluation/simulation.py

def compute_bracket_distribution(
    scores: npt.NDArray[np.float64],
    n_bins: int = 50,
) -> BracketDistribution:
    """Summarise raw Monte Carlo scores into a :class:`BracketDistribution`.

    Computes the 5th/25th/50th/75th/95th percentiles, the mean, the standard
    deviation (``np.std`` with its default arguments) and an ``n_bins``-bin
    histogram of *scores*.

    Args:
        scores: Raw per-simulation scores, shape ``(n_simulations,)``.
        n_bins: Number of histogram bins (default 50).

    Returns:
        :class:`BracketDistribution` with percentiles, mean, std, and histogram.
    """
    levels = (5, 25, 50, 75, 95)
    level_values = np.percentile(scores, levels)
    pct_lookup: dict[int, float] = {}
    for level, value in zip(levels, level_values):
        pct_lookup[level] = float(value)

    # np.histogram returns (counts, edges); edges has n_bins + 1 entries.
    hist_counts, hist_edges = np.histogram(scores, bins=n_bins)

    score_mean = float(np.mean(scores))
    score_std = float(np.std(scores))
    return BracketDistribution(
        scores=scores,
        percentiles=pct_lookup,
        mean=score_mean,
        std=score_std,
        histogram_bins=hist_edges.astype(np.float64),
        histogram_counts=hist_counts.astype(np.int64),
    )

If a scraper extracted this row, it came from Repobility (https://repobility.com)

score_bracket_against_sims function · python · L995-L1034 (40 LOC)

src/ncaa_eval/evaluation/simulation.py

def score_bracket_against_sims(
    chosen_bracket: npt.NDArray[np.int32],
    sim_winners: npt.NDArray[np.int32],
    scoring_rules: Sequence[ScoringRule],
) -> dict[str, npt.NDArray[np.float64]]:
    """Score a chosen bracket against each simulated tournament outcome.

    For each simulation, counts how many of the chosen bracket's picks
    match the simulation's actual outcomes, weighted by round points.

    Args:
        chosen_bracket: Game winners for the chosen bracket, shape ``(n_games,)``.
        sim_winners: Per-simulation game winners, shape ``(n_simulations, n_games)``.
        scoring_rules: Scoring rules to score against.

    Returns:
        Mapping of ``rule_name → per-sim scores``, each shape ``(n_simulations,)``.
    """
    n_games = chosen_bracket.shape[0]
    n_rounds = int(np.log2(n_games + 1))

    # Boolean match array: (n_simulations, n_games)
    matches = sim_winners == chosen_bracket[None, :]

    result: dict[str, npt.NDArray[np.float64]] = {}
    for

simulate_tournament_mc function · python · L1042-L1193 (152 LOC)

src/ncaa_eval/evaluation/simulation.py

def simulate_tournament_mc(  # noqa: PLR0913
    bracket: BracketStructure,
    P: npt.NDArray[np.float64],
    scoring_rules: Sequence[ScoringRule],
    season: int,
    n_simulations: int = 10_000,
    rng: np.random.Generator | None = None,
    progress: bool = False,
) -> SimulationResult:
    """Vectorized Monte Carlo tournament simulation.

    All N simulations run in parallel per round (no per-sim Python loops).
    Pre-generates random numbers and uses fancy indexing for batch outcome
    determination.

    Args:
        bracket: Tournament bracket structure (64 teams).
        P: Pairwise win probability matrix, shape ``(n, n)``.
        scoring_rules: Scoring rules to compute scores for.
        season: Tournament season year.
        n_simulations: Number of simulations (default 10,000).
        rng: NumPy random generator for reproducibility.
        progress: Display a tqdm progress bar for simulation rounds.

    Returns:
        :class:`SimulationResult` with MC-derive

_collect_leaves function · python · L1196-L1210 (15 LOC)

src/ncaa_eval/evaluation/simulation.py

def _collect_leaves(node: BracketNode) -> list[int]:
    """Collect leaf team indices in left-to-right order.

    Args:
        node: Root of the subtree.

    Returns:
        List of ``team_index`` values from leaf nodes.
    """
    if node.is_leaf:
        return [node.team_index]
    if node.left is None or node.right is None:
        msg = "Internal bracket node missing child — tree is malformed"
        raise RuntimeError(msg)
    return _collect_leaves(node.left) + _collect_leaves(node.right)

simulate_tournament function · python · L1218-L1290 (73 LOC)

src/ncaa_eval/evaluation/simulation.py

def simulate_tournament(  # noqa: PLR0913
    bracket: BracketStructure,
    probability_provider: ProbabilityProvider,
    context: MatchupContext,
    scoring_rules: Sequence[ScoringRule] | None = None,
    method: str = "analytical",
    n_simulations: int = 10_000,
    rng: np.random.Generator | None = None,
    progress: bool = False,
) -> SimulationResult:
    """High-level tournament simulation orchestrator.

    Dispatches to analytical (Phylourny) or Monte Carlo path based on
    *method*.

    Args:
        bracket: Tournament bracket structure.
        probability_provider: Provider for pairwise win probabilities.
        context: Matchup context (season, day_num, neutral).
        scoring_rules: Scoring rules for EP computation.  Defaults to
            :class:`StandardScoring` only.
        method: ``"analytical"`` (default) or ``"monte_carlo"``.
        n_simulations: Number of MC simulations (ignored for analytical).
        rng: NumPy random generator (MC only).

CVFold class · python · L24-L35 (12 LOC)

src/ncaa_eval/evaluation/splitter.py

class CVFold:
    """A single cross-validation fold.

    Pairs one test season's tournament games with the training data drawn
    from earlier seasons.

    Attributes:
        train: All games from seasons strictly before the test year.
        test: Tournament games only from the test year.
        year: The test season year.
    """

    # Training rows: games from seasons strictly before ``year``.
    train: pd.DataFrame
    # Evaluation rows: tournament games of ``year`` only.
    test: pd.DataFrame
    # Season year being held out for testing.
    year: int

walk_forward_splits function · python · L38-L109 (72 LOC)

src/ncaa_eval/evaluation/splitter.py

def walk_forward_splits(
    seasons: Sequence[int],
    feature_server: StatefulFeatureServer,
    *,
    mode: str = "batch",
) -> Iterator[CVFold]:
    """Generate walk-forward CV folds with Leave-One-Tournament-Out splits.

    Args:
        seasons: Ordered sequence of season years to include
            (e.g., ``range(2008, 2026)``). Must contain at least 2 seasons.
        feature_server: Configured StatefulFeatureServer for building feature
            matrices.
        mode: Feature serving mode: ``"batch"`` (stateless models) or
            ``"stateful"`` (sequential-update models like Elo).

    Yields:
        CVFold: For each eligible test year (skipping no-tournament years like
        2020): ``train`` contains all games from seasons strictly before the
        test year; ``test`` contains only tournament games from the test year;
        ``year`` is the test season year.

    Raises:
        ValueError: If ``seasons`` has fewer than 2 elements, or if ``mode``

Connector class · python · L41-L72 (32 LOC)

src/ncaa_eval/ingest/connectors/base.py

class Connector(abc.ABC):
    """Abstract base class for NCAA data source connectors.

    All connectors must implement :meth:`fetch_games`, which is the universal
    capability.  :meth:`fetch_teams` and :meth:`fetch_seasons` are optional
    capabilities — subclasses that do not support them inherit the default
    implementation, which raises ``NotImplementedError``.  Callers should use
    :func:`isinstance` checks or ``try``/``except NotImplementedError`` to
    probe optional capabilities before calling them.
    """

    @abc.abstractmethod
    def fetch_games(self, season: int) -> list[Game]:
        """Fetch game results for a given *season* year."""

    def fetch_teams(self) -> list[Team]:
        """Fetch team data from the source.

        Optional capability — not all connectors provide team master data.
        Raises:
            NotImplementedError: If this connector does not support fetching teams.
        """
        raise NotImplementedError(f"{type(self).__name__}

fetch_teams method · python · L56-L63 (8 LOC)

src/ncaa_eval/ingest/connectors/base.py

    def fetch_teams(self) -> list[Team]:
        """Return team master data from the source (optional capability).

        Not every connector can supply team data; this default implementation
        always raises, naming the concrete connector class in the message.

        Raises:
            NotImplementedError: If this connector does not support fetching teams.
        """
        connector_name = type(self).__name__
        raise NotImplementedError(f"{connector_name} does not provide team data")

Repobility · MCP-ready · https://repobility.com

fetch_seasons method · python · L65-L72 (8 LOC)

src/ncaa_eval/ingest/connectors/base.py

    def fetch_seasons(self) -> list[Season]:
        """Return the seasons available from the source (optional capability).

        Not every connector can supply season data; this default implementation
        always raises, naming the concrete connector class in the message.

        Raises:
            NotImplementedError: If this connector does not support fetching seasons.
        """
        connector_name = type(self).__name__
        raise NotImplementedError(f"{connector_name} does not provide season data")

_parse_game_result function · python · L35-L51 (17 LOC)

src/ncaa_eval/ingest/connectors/espn.py

def _parse_game_result(result_str: str) -> tuple[int, int] | None:
    """Parse a cbbpy ``game_result`` string like ``'W 75-60'``.

    Returns ``(team_score, opponent_score)`` or ``None`` if unparseable.
    """
    if not isinstance(result_str, str) or not result_str.strip():
        return None
    parts = result_str.strip().split()
    if len(parts) != 2:
        return None
    scores = parts[1].split("-")
    if len(scores) != 2:
        return None
    try:
        return int(scores[0]), int(scores[1])
    except ValueError:
        return None

_resolve_team_id function · python · L54-L85 (32 LOC)

src/ncaa_eval/ingest/connectors/espn.py

def _resolve_team_id(
    name: str,
    lower_map: dict[str, int],
    original_mapping: dict[str, int],
) -> int | None:
    """Resolve an ESPN team name to a Kaggle team ID.

    Tries exact match first, then falls back to fuzzy matching via rapidfuzz.

    Args:
        name: ESPN team name to resolve.
        lower_map: Pre-computed lowercase-keyed mapping (avoids per-call rebuild).
        original_mapping: Original mapping with original-case keys (used for fuzzy).
    """
    # Exact match (case-insensitive).
    exact = lower_map.get(name.lower())
    if exact is not None:
        return exact

    # Fuzzy match.
    best_score = 0.0
    best_id: int | None = None
    for known_name, tid in original_mapping.items():
        score = fuzz.token_set_ratio(name.lower(), known_name.lower())
        if score > best_score:
            best_score = score
            best_id = tid
    if best_score >= _FUZZY_THRESHOLD and best_id is not None:
        return best_id

    logger.warning("

EspnConnector class · python · L88-L268 (181 LOC)

src/ncaa_eval/ingest/connectors/espn.py

class EspnConnector(Connector):
    """Connector for ESPN game data via the cbbpy scraper.

    Args:
        team_name_to_id: Mapping from team name strings to Kaggle TeamIDs.
        season_day_zeros: Mapping from season year to DayZero date.
    """

    def __init__(
        self,
        team_name_to_id: dict[str, int],
        season_day_zeros: dict[int, datetime.date],
    ) -> None:
        self._team_name_to_id = team_name_to_id
        self._season_day_zeros = season_day_zeros
        # Pre-compute lowercase map once to avoid O(N×M) rebuilds during parsing.
        self._lower_team_map: dict[str, int] = {k.lower(): v for k, v in team_name_to_id.items()}

    # -- Games --------------------------------------------------------------

    def fetch_games(self, season: int) -> list[Game]:
        """Fetch game results for *season* from ESPN via cbbpy.

        Uses `get_team_schedule()` for each team in the mapping and
        deduplicates by ESPN game ID.
        """
        df

__init__ method · python · L96-L104 (9 LOC)

src/ncaa_eval/ingest/connectors/espn.py

    def __init__(
        self,
        team_name_to_id: dict[str, int],
        season_day_zeros: dict[int, datetime.date],
    ) -> None:
        self._team_name_to_id = team_name_to_id
        self._season_day_zeros = season_day_zeros
        # Pre-compute lowercase map once to avoid O(N×M) rebuilds during parsing.
        self._lower_team_map: dict[str, int] = {k.lower(): v for k, v in team_name_to_id.items()}

fetch_games method · python · L108-L117 (10 LOC)

src/ncaa_eval/ingest/connectors/espn.py

    def fetch_games(self, season: int) -> list[Game]:
        """Return the games of *season* obtained from ESPN via cbbpy.

        Loads the season schedule through ``_fetch_schedule_df`` and converts
        it with ``_parse_schedule_df``.  A missing or empty schedule yields an
        empty list without attempting to parse.
        """
        schedule = self._fetch_schedule_df(season)
        has_rows = schedule is not None and not schedule.empty
        return self._parse_schedule_df(schedule, season) if has_rows else []

_fetch_schedule_df method · python · L121-L131 (11 LOC)

src/ncaa_eval/ingest/connectors/espn.py

    def _fetch_schedule_df(self, season: int) -> pd.DataFrame | None:
        """Load a season schedule DataFrame from cbbpy via per-team schedules.

        `get_games_season` is intentionally avoided here: it fetches
        boxscores and play-by-play for every game (thousands of no-timeout
        HTTP requests) and returns a game-info schema that is incompatible
        with the schedule columns expected by `_parse_schedule_df`.
        `get_team_schedule` returns the correct schedule-format schema
        (`team`, `opponent`, `game_result`, …) with one request per team.
        """
        return self._fetch_per_team(season)

_fetch_per_team method · python · L133-L155 (23 LOC)

src/ncaa_eval/ingest/connectors/espn.py

    def _fetch_per_team(self, season: int) -> pd.DataFrame | None:
        """Fetch schedules for each team in the mapping and concatenate."""
        frames: list[pd.DataFrame] = []
        for team_name in self._team_name_to_id:
            try:
                df = ms.get_team_schedule(team_name, season)
                if isinstance(df, pd.DataFrame) and not df.empty:
                    frames.append(df)
            except Exception:
                logger.debug("espn: get_team_schedule('%s', %d) failed", team_name, season)
                continue
        if not frames:
            logger.warning(
                "espn: all %d per-team schedule fetches failed for season %d — no data available",
                len(self._team_name_to_id),
                season,
            )
            return None
        combined = pd.concat(frames, ignore_index=True)
        # Deduplicate by ESPN game_id (each game appears in both teams' schedules).
        if "game_id" in combined.columns:

Repobility analyzer · published findings · https://repobility.com

_parse_schedule_df method · python · L157-L228 (72 LOC)

src/ncaa_eval/ingest/connectors/espn.py

    def _parse_schedule_df(self, df: pd.DataFrame, season: int) -> list[Game]:
        """Convert a cbbpy schedule DataFrame into Game models."""
        missing = _SCHEDULE_COLUMNS - set(df.columns)
        if missing:
            msg = f"espn: schedule DataFrame missing columns: {sorted(missing)}"
            raise DataFormatError(msg)

        day_zero = self._season_day_zeros.get(season)
        games: list[Game] = []
        seen_ids: set[str] = set()

        for _, row in df.iterrows():
            espn_game_id = str(row["game_id"])
            game_id = f"espn_{espn_game_id}"
            if game_id in seen_ids:
                continue
            seen_ids.add(game_id)

            # Parse scores from game_result.
            parsed = _parse_game_result(str(row.get("game_result", "")))
            if parsed is None:
                logger.debug("espn: skipping game %s — unparseable result", espn_game_id)
                continue
            team_score, opp_score = parsed

_parse_date method · python · L231-L241 (11 LOC)

src/ncaa_eval/ingest/connectors/espn.py

    def _parse_date(value: object) -> datetime.date | None:
        """Best-effort date parsing from cbbpy game_day values."""
        if value is None or (isinstance(value, float) and pd.isna(value)):
            return None
        try:
            ts = pd.Timestamp(value)
            if pd.isna(ts):
                return None
            return cast("datetime.date", ts.date())
        except Exception:
            return None

_infer_loc method · python · L244-L268 (25 LOC)

src/ncaa_eval/ingest/connectors/espn.py

    def _infer_loc(
        row: pd.Series,
        team_tid: int,
        w_team_id: int,
    ) -> Literal["H", "A", "N"]:
        """Infer game location from available ESPN context.

        Falls back to ``"N"`` (neutral) when location cannot be determined.
        """
        # Some DataFrames include a 'home_away' or 'is_neutral' column.
        if "is_neutral" in row.index:
            val = row["is_neutral"]
            if val is True or str(val).lower() in ("true", "1", "yes"):
                return "N"

        if "home_away" in row.index:
            ha = str(row["home_away"]).lower()
            if ha == "home":
                # The row's team was home.
                return "H" if team_tid == w_team_id else "A"
            if ha == "away":
                return "A" if team_tid == w_team_id else "H"

        # Default to neutral when ambiguous.
        return "N"

_validate_columns function · python · L50-L55 (6 LOC)