← back to dhilgart__NCAA_eval

Function bodies 354 total

All specs Real LLM only Function bodies
BracketDistribution class · python · L688-L705 (18 LOC)
src/ncaa_eval/evaluation/simulation.py
class BracketDistribution:
    """Score distribution statistics from Monte Carlo simulation.

    Groups the raw per-simulation scores with the summary statistics derived
    from them.  :func:`compute_bracket_distribution` builds instances by
    passing every field as a keyword argument.

    Attributes:
        scores: Raw per-simulation scores, shape ``(n_simulations,)``.
        percentiles: Mapping of percentile → value for keys 5, 25, 50, 75, 95.
        mean: Mean score across simulations.
        std: Standard deviation of scores.
        histogram_bins: Histogram bin edges, shape ``(n_bins + 1,)``.
        histogram_counts: Histogram counts, shape ``(n_bins,)``.
    """

    # NOTE(review): only annotations are declared here; a decorator above the
    # class (presumably ``@dataclass``) is not visible in this excerpt — confirm.

    # Raw scores, stored alongside the summaries derived from them.
    scores: npt.NDArray[np.float64]
    # Percentile (as an int key, e.g. 5 or 95) → score value at that percentile.
    percentiles: dict[int, float]
    # Arithmetic mean of ``scores``.
    mean: float
    # Standard deviation of ``scores``.
    std: float
    # Bin edges; one more entry than ``histogram_counts``.
    histogram_bins: npt.NDArray[np.float64]
    # Number of scores falling in each bin.
    histogram_counts: npt.NDArray[np.int64]
MostLikelyBracket class · python · L709-L727 (19 LOC)
src/ncaa_eval/evaluation/simulation.py
class MostLikelyBracket:
    """Maximum-likelihood bracket from greedy traversal.

    Result record returned by :func:`compute_most_likely_bracket`.

    Attributes:
        winners: Tuple of team indices for each game's predicted winner,
            in **round-major order** matching ``SimulationResult.sim_winners``
            rows — all Round-of-64 games first (indices 0–31 for 64 teams),
            then Round-of-32 (32–47), through to the championship (index 62).
            63 entries for a 64-team bracket.  Pass directly to
            :func:`score_bracket_against_sims` as ``chosen_bracket``.
        champion_team_id: Canonical team ID of the predicted champion
            (from BracketStructure.team_ids[champion_index]).
        log_likelihood: Sum of ``log(max(P[left, right], P[right, left]))``
            across all games.
    """

    # NOTE(review): only annotations are declared here; a decorator above the
    # class (presumably ``@dataclass``) is not visible in this excerpt — confirm.

    # Predicted winner of every game, as bracket team *indices* (not team IDs).
    winners: tuple[int, ...]
    # Predicted champion, as a canonical team *ID* (not a bracket index).
    champion_team_id: int
    # Log-probability of the whole chosen bracket (sum of per-game logs).
    log_likelihood: float
compute_advancement_probs function · python · L735-L793 (59 LOC)
src/ncaa_eval/evaluation/simulation.py
def compute_advancement_probs(
    bracket: BracketStructure,
    P: npt.NDArray[np.float64],
) -> npt.NDArray[np.float64]:
    """Compute exact advancement probabilities via the Phylourny algorithm.

    Post-order traversal of the bracket tree computing Win Probability
    Vectors (WPVs) at each internal node using the formula:

        ``R = V ⊙ (P^T · W) + W ⊙ (P^T · V)``

    Args:
        bracket: Tournament bracket structure.
        P: Pairwise win probability matrix, shape ``(n, n)``.

    Returns:
        Advancement probabilities, shape ``(n, n_rounds)``.
        ``adv_probs[i, r]`` = P(team i wins their game in round r).

    Raises:
        ValueError: If ``n`` is not a power of 2 or does not match
            the bracket's team count.
    """
    n = P.shape[0]
    if n == 0 or (n & (n - 1)) != 0:
        msg = f"n must be a positive power of 2, got {n}"
        raise ValueError(msg)

    expected_teams = len(bracket.team_ids)
    if n != expected_teams:
        msg = f"P
compute_expected_points function · python · L796-L815 (20 LOC)
src/ncaa_eval/evaluation/simulation.py
def compute_expected_points(
    adv_probs: npt.NDArray[np.float64],
    scoring_rule: ScoringRule,
) -> npt.NDArray[np.float64]:
    """Return each team's Expected Points under *scoring_rule*.

    The per-round point values are collected into a vector and multiplied
    against the advancement-probability matrix, so every team's result is
    the probability-weighted sum of the points on offer in each round.

    Args:
        adv_probs: Advancement probabilities, shape ``(n, n_rounds)``.
        scoring_rule: Scoring rule providing per-round point values via
            ``points_per_round(round_index)``.

    Returns:
        Expected Points per team, shape ``(n,)``.
    """
    round_indices = range(adv_probs.shape[1])
    # One point value per column of ``adv_probs``, in round order.
    round_values = np.array(
        list(map(scoring_rule.points_per_round, round_indices)),
        dtype=np.float64,
    )
    expected: npt.NDArray[np.float64] = np.matmul(adv_probs, round_values)
    return expected
compute_expected_points_seed_diff function · python · L818-L894 (77 LOC)
src/ncaa_eval/evaluation/simulation.py
def compute_expected_points_seed_diff(
    adv_probs: npt.NDArray[np.float64],
    bracket: BracketStructure,
    P: npt.NDArray[np.float64],
    seed_map: dict[int, int],
) -> npt.NDArray[np.float64]:
    """Compute Expected Points with seed-difference upset bonus.

    Extends standard EP by adding per-matchup seed-diff bonus.  For each
    internal bracket node at round *r*, the bonus contribution for team *i*
    beating opponent *j* is::

        P(i reaches node) * P(i beats j) * P(j reaches node) * bonus(seed_i, seed_j)

    where ``bonus = |seed_i - seed_j|`` when ``seed_i > seed_j`` (upset), else 0.

    Uses ``SeedDiffBonusScoring`` base points for standard round points and
    a post-order traversal of the bracket tree (reusing WPVs from
    :func:`compute_advancement_probs` logic) for bonus computation.

    Args:
        adv_probs: Advancement probabilities, shape ``(n, n_rounds)``.
        bracket: Tournament bracket structure (for tree traversal).
        P: Pairwise win
compute_most_likely_bracket function · python · L897-L963 (67 LOC)
src/ncaa_eval/evaluation/simulation.py
def compute_most_likely_bracket(
    bracket: BracketStructure,
    P: npt.NDArray[np.float64],
) -> MostLikelyBracket:
    """Compute the maximum-likelihood bracket via greedy traversal.

    At each internal node, picks the team with the higher win probability
    (``argmax(P[left, right])``).  Returns the full bracket of winners and
    the log-likelihood of the chosen bracket.

    The ``winners`` array is in **round-major order** — the same order as
    ``SimulationResult.sim_winners`` rows — so it can be passed directly to
    :func:`score_bracket_against_sims`:
    all Round-of-64 games first (indices 0–31), then Round-of-32 (32–47),
    through to the championship game (index 62).

    Args:
        bracket: Tournament bracket structure.
        P: Pairwise win probability matrix, shape ``(n, n)``.

    Returns:
        :class:`MostLikelyBracket` with winners, champion, and log-likelihood.
    """
    log_likelihood = 0.0
    # Collect (round_index, game_order_within_round, win
compute_bracket_distribution function · python · L966-L992 (27 LOC)
src/ncaa_eval/evaluation/simulation.py
def compute_bracket_distribution(
    scores: npt.NDArray[np.float64],
    n_bins: int = 50,
) -> BracketDistribution:
    """Summarise raw Monte Carlo bracket scores.

    Computes the 5th, 25th, 50th, 75th and 95th percentiles, the mean, the
    standard deviation (``np.std`` with its default arguments) and an
    ``n_bins``-bin histogram of *scores*.

    Args:
        scores: Raw per-simulation scores, shape ``(n_simulations,)``.
        n_bins: Number of histogram bins (default 50).

    Returns:
        :class:`BracketDistribution` with percentiles, mean, std, and histogram.
    """
    levels = (5, 25, 50, 75, 95)
    level_values = np.percentile(scores, levels)
    hist_counts, hist_edges = np.histogram(scores, bins=n_bins)
    score_mean = float(np.mean(scores))
    score_std = float(np.std(scores))

    return BracketDistribution(
        scores=scores,
        # Plain Python floats keyed by the integer percentile level.
        percentiles=dict(zip(levels, map(float, level_values))),
        mean=score_mean,
        std=score_std,
        histogram_bins=hist_edges.astype(np.float64),
        histogram_counts=hist_counts.astype(np.int64),
    )
If a scraper extracted this row, it came from Repobility (https://repobility.com)
score_bracket_against_sims function · python · L995-L1034 (40 LOC)
src/ncaa_eval/evaluation/simulation.py
def score_bracket_against_sims(
    chosen_bracket: npt.NDArray[np.int32],
    sim_winners: npt.NDArray[np.int32],
    scoring_rules: Sequence[ScoringRule],
) -> dict[str, npt.NDArray[np.float64]]:
    """Score a chosen bracket against each simulated tournament outcome.

    For each simulation, counts how many of the chosen bracket's picks
    match the simulation's actual outcomes, weighted by round points.

    Args:
        chosen_bracket: Game winners for the chosen bracket, shape ``(n_games,)``.
        sim_winners: Per-simulation game winners, shape ``(n_simulations, n_games)``.
        scoring_rules: Scoring rules to score against.

    Returns:
        Mapping of ``rule_name → per-sim scores``, each shape ``(n_simulations,)``.
    """
    n_games = chosen_bracket.shape[0]
    n_rounds = int(np.log2(n_games + 1))

    # Boolean match array: (n_simulations, n_games)
    matches = sim_winners == chosen_bracket[None, :]

    result: dict[str, npt.NDArray[np.float64]] = {}
    for 
simulate_tournament_mc function · python · L1042-L1193 (152 LOC)
src/ncaa_eval/evaluation/simulation.py
def simulate_tournament_mc(  # noqa: PLR0913
    bracket: BracketStructure,
    P: npt.NDArray[np.float64],
    scoring_rules: Sequence[ScoringRule],
    season: int,
    n_simulations: int = 10_000,
    rng: np.random.Generator | None = None,
    progress: bool = False,
) -> SimulationResult:
    """Vectorized Monte Carlo tournament simulation.

    All N simulations run in parallel per round (no per-sim Python loops).
    Pre-generates random numbers and uses fancy indexing for batch outcome
    determination.

    Args:
        bracket: Tournament bracket structure (64 teams).
        P: Pairwise win probability matrix, shape ``(n, n)``.
        scoring_rules: Scoring rules to compute scores for.
        season: Tournament season year.
        n_simulations: Number of simulations (default 10,000).
        rng: NumPy random generator for reproducibility.
        progress: Display a tqdm progress bar for simulation rounds.

    Returns:
        :class:`SimulationResult` with MC-derive
_collect_leaves function · python · L1196-L1210 (15 LOC)
src/ncaa_eval/evaluation/simulation.py
def _collect_leaves(node: BracketNode) -> list[int]:
    """Gather the team indices of every leaf under *node*, left to right.

    Walks the subtree depth-first with an explicit stack, always descending
    into the left child before the right one.

    Args:
        node: Root of the subtree.

    Returns:
        List of ``team_index`` values from leaf nodes.

    Raises:
        RuntimeError: If a non-leaf node is missing either child.
    """
    leaves: list[int] = []
    pending = [node]
    while pending:
        current = pending.pop()
        if current.is_leaf:
            leaves.append(current.team_index)
            continue
        if current.left is None or current.right is None:
            msg = "Internal bracket node missing child — tree is malformed"
            raise RuntimeError(msg)
        # Push right first so the left child is popped (visited) first.
        pending.append(current.right)
        pending.append(current.left)
    return leaves
simulate_tournament function · python · L1218-L1290 (73 LOC)
src/ncaa_eval/evaluation/simulation.py
def simulate_tournament(  # noqa: PLR0913
    bracket: BracketStructure,
    probability_provider: ProbabilityProvider,
    context: MatchupContext,
    scoring_rules: Sequence[ScoringRule] | None = None,
    method: str = "analytical",
    n_simulations: int = 10_000,
    rng: np.random.Generator | None = None,
    progress: bool = False,
) -> SimulationResult:
    """High-level tournament simulation orchestrator.

    Dispatches to analytical (Phylourny) or Monte Carlo path based on
    *method*.

    Args:
        bracket: Tournament bracket structure.
        probability_provider: Provider for pairwise win probabilities.
        context: Matchup context (season, day_num, neutral).
        scoring_rules: Scoring rules for EP computation.  Defaults to
            :class:`StandardScoring` only.
        method: ``"analytical"`` (default) or ``"monte_carlo"``.
        n_simulations: Number of MC simulations (ignored for analytical).
        rng: NumPy random generator (MC only).
       
CVFold class · python · L24-L35 (12 LOC)
src/ncaa_eval/evaluation/splitter.py
class CVFold:
    """A single cross-validation fold.

    One train/test pair as yielded by :func:`walk_forward_splits`.

    Attributes:
        train: All games from seasons strictly before the test year.
        test: Tournament games only from the test year.
        year: The test season year.
    """

    # NOTE(review): only annotations are declared here; a decorator above the
    # class (presumably ``@dataclass``) is not visible in this excerpt — confirm.

    # Training rows: every game from seasons earlier than ``year``.
    train: pd.DataFrame
    # Evaluation rows: tournament games of ``year`` only.
    test: pd.DataFrame
    # Season year being held out for evaluation.
    year: int
walk_forward_splits function · python · L38-L109 (72 LOC)
src/ncaa_eval/evaluation/splitter.py
def walk_forward_splits(
    seasons: Sequence[int],
    feature_server: StatefulFeatureServer,
    *,
    mode: str = "batch",
) -> Iterator[CVFold]:
    """Generate walk-forward CV folds with Leave-One-Tournament-Out splits.

    Args:
        seasons: Ordered sequence of season years to include
            (e.g., ``range(2008, 2026)``). Must contain at least 2 seasons.
        feature_server: Configured StatefulFeatureServer for building feature
            matrices.
        mode: Feature serving mode: ``"batch"`` (stateless models) or
            ``"stateful"`` (sequential-update models like Elo).

    Yields:
        CVFold: For each eligible test year (skipping no-tournament years like
        2020): ``train`` contains all games from seasons strictly before the
        test year; ``test`` contains only tournament games from the test year;
        ``year`` is the test season year.

    Raises:
        ValueError: If ``seasons`` has fewer than 2 elements, or if ``mode``
           
Connector class · python · L41-L72 (32 LOC)
src/ncaa_eval/ingest/connectors/base.py
class Connector(abc.ABC):
    """Abstract base class for NCAA data source connectors.

    All connectors must implement :meth:`fetch_games`, which is the universal
    capability.  :meth:`fetch_teams` and :meth:`fetch_seasons` are optional
    capabilities — subclasses that do not support them inherit the default
    implementation, which raises ``NotImplementedError``.  Callers should use
    :func:`isinstance` checks or ``try``/``except NotImplementedError`` to
    probe optional capabilities before calling them.
    """

    @abc.abstractmethod
    def fetch_games(self, season: int) -> list[Game]:
        """Fetch game results for a given *season* year."""

    def fetch_teams(self) -> list[Team]:
        """Fetch team data from the source.

        Optional capability — not all connectors provide team master data.
        Raises:
            NotImplementedError: If this connector does not support fetching teams.
        """
        raise NotImplementedError(f"{type(self).__name__}
fetch_teams method · python · L56-L63 (8 LOC)
src/ncaa_eval/ingest/connectors/base.py
    def fetch_teams(self) -> list[Team]:
        """Return team master data from the source (optional capability).

        This default implementation only signals that the capability is
        absent; it always raises, naming the concrete connector class.

        Raises:
            NotImplementedError: If this connector does not support fetching teams.
        """
        connector_name = type(self).__name__
        raise NotImplementedError(f"{connector_name} does not provide team data")
Repobility · MCP-ready · https://repobility.com
fetch_seasons method · python · L65-L72 (8 LOC)
src/ncaa_eval/ingest/connectors/base.py
    def fetch_seasons(self) -> list[Season]:
        """Return the seasons available from the source (optional capability).

        This default implementation only signals that the capability is
        absent; it always raises, naming the concrete connector class.

        Raises:
            NotImplementedError: If this connector does not support fetching seasons.
        """
        connector_name = type(self).__name__
        raise NotImplementedError(f"{connector_name} does not provide season data")
_parse_game_result function · python · L35-L51 (17 LOC)
src/ncaa_eval/ingest/connectors/espn.py
def _parse_game_result(result_str: str) -> tuple[int, int] | None:
    """Extract the two scores from a cbbpy ``game_result`` such as ``'W 75-60'``.

    The string must consist of exactly two whitespace-separated tokens, the
    second being two integers joined by a single ``-``.

    Returns:
        The two scores as ``(team_score, opponent_score)``, in the order they
        appear in the string, or ``None`` if the value is not a string or
        does not match the expected layout.
    """
    if not isinstance(result_str, str):
        return None

    # Blank or whitespace-only input yields no tokens and is rejected here.
    tokens = result_str.split()
    if len(tokens) != 2:
        return None

    score_text = tokens[1]
    if score_text.count("-") != 1:
        return None

    first, _, second = score_text.partition("-")
    try:
        return int(first), int(second)
    except ValueError:
        return None
_resolve_team_id function · python · L54-L85 (32 LOC)
src/ncaa_eval/ingest/connectors/espn.py
def _resolve_team_id(
    name: str,
    lower_map: dict[str, int],
    original_mapping: dict[str, int],
) -> int | None:
    """Resolve an ESPN team name to a Kaggle team ID.

    Tries exact match first, then falls back to fuzzy matching via rapidfuzz.

    Args:
        name: ESPN team name to resolve.
        lower_map: Pre-computed lowercase-keyed mapping (avoids per-call rebuild).
        original_mapping: Original mapping with original-case keys (used for fuzzy).
    """
    # Exact match (case-insensitive).
    exact = lower_map.get(name.lower())
    if exact is not None:
        return exact

    # Fuzzy match.
    best_score = 0.0
    best_id: int | None = None
    for known_name, tid in original_mapping.items():
        score = fuzz.token_set_ratio(name.lower(), known_name.lower())
        if score > best_score:
            best_score = score
            best_id = tid
    if best_score >= _FUZZY_THRESHOLD and best_id is not None:
        return best_id

    logger.warning("
EspnConnector class · python · L88-L268 (181 LOC)
src/ncaa_eval/ingest/connectors/espn.py
class EspnConnector(Connector):
    """Connector for ESPN game data via the cbbpy scraper.

    Args:
        team_name_to_id: Mapping from team name strings to Kaggle TeamIDs.
        season_day_zeros: Mapping from season year to DayZero date.
    """

    def __init__(
        self,
        team_name_to_id: dict[str, int],
        season_day_zeros: dict[int, datetime.date],
    ) -> None:
        self._team_name_to_id = team_name_to_id
        self._season_day_zeros = season_day_zeros
        # Pre-compute lowercase map once to avoid O(N×M) rebuilds during parsing.
        self._lower_team_map: dict[str, int] = {k.lower(): v for k, v in team_name_to_id.items()}

    # -- Games --------------------------------------------------------------

    def fetch_games(self, season: int) -> list[Game]:
        """Fetch game results for *season* from ESPN via cbbpy.

        Uses `get_team_schedule()` for each team in the mapping and
        deduplicates by ESPN game ID.
        """
        df 
__init__ method · python · L96-L104 (9 LOC)
src/ncaa_eval/ingest/connectors/espn.py
    def __init__(
        self,
        team_name_to_id: dict[str, int],
        season_day_zeros: dict[int, datetime.date],
    ) -> None:
        self._team_name_to_id = team_name_to_id
        self._season_day_zeros = season_day_zeros
        # Pre-compute lowercase map once to avoid O(N×M) rebuilds during parsing.
        self._lower_team_map: dict[str, int] = {k.lower(): v for k, v in team_name_to_id.items()}
fetch_games method · python · L108-L117 (10 LOC)
src/ncaa_eval/ingest/connectors/espn.py
    def fetch_games(self, season: int) -> list[Game]:
        """Fetch game results for *season* from ESPN via cbbpy.

        Uses `get_team_schedule()` for each team in the mapping and
        deduplicates by ESPN game ID.
        """
        df = self._fetch_schedule_df(season)
        if df is None or df.empty:
            return []
        return self._parse_schedule_df(df, season)
_fetch_schedule_df method · python · L121-L131 (11 LOC)
src/ncaa_eval/ingest/connectors/espn.py
    def _fetch_schedule_df(self, season: int) -> pd.DataFrame | None:
        """Load a season schedule DataFrame from cbbpy via per-team schedules.

        `get_games_season` is intentionally avoided here: it fetches
        boxscores and play-by-play for every game (thousands of no-timeout
        HTTP requests) and returns a game-info schema that is incompatible
        with the schedule columns expected by `_parse_schedule_df`.
        `get_team_schedule` returns the correct schedule-format schema
        (`team`, `opponent`, `game_result`, …) with one request per team.
        """
        return self._fetch_per_team(season)
_fetch_per_team method · python · L133-L155 (23 LOC)
src/ncaa_eval/ingest/connectors/espn.py
    def _fetch_per_team(self, season: int) -> pd.DataFrame | None:
        """Fetch schedules for each team in the mapping and concatenate."""
        frames: list[pd.DataFrame] = []
        for team_name in self._team_name_to_id:
            try:
                df = ms.get_team_schedule(team_name, season)
                if isinstance(df, pd.DataFrame) and not df.empty:
                    frames.append(df)
            except Exception:
                logger.debug("espn: get_team_schedule('%s', %d) failed", team_name, season)
                continue
        if not frames:
            logger.warning(
                "espn: all %d per-team schedule fetches failed for season %d — no data available",
                len(self._team_name_to_id),
                season,
            )
            return None
        combined = pd.concat(frames, ignore_index=True)
        # Deduplicate by ESPN game_id (each game appears in both teams' schedules).
        if "game_id" in combined.columns:
  
Repobility analyzer · published findings · https://repobility.com
_parse_schedule_df method · python · L157-L228 (72 LOC)
src/ncaa_eval/ingest/connectors/espn.py
    def _parse_schedule_df(self, df: pd.DataFrame, season: int) -> list[Game]:
        """Convert a cbbpy schedule DataFrame into Game models."""
        missing = _SCHEDULE_COLUMNS - set(df.columns)
        if missing:
            msg = f"espn: schedule DataFrame missing columns: {sorted(missing)}"
            raise DataFormatError(msg)

        day_zero = self._season_day_zeros.get(season)
        games: list[Game] = []
        seen_ids: set[str] = set()

        for _, row in df.iterrows():
            espn_game_id = str(row["game_id"])
            game_id = f"espn_{espn_game_id}"
            if game_id in seen_ids:
                continue
            seen_ids.add(game_id)

            # Parse scores from game_result.
            parsed = _parse_game_result(str(row.get("game_result", "")))
            if parsed is None:
                logger.debug("espn: skipping game %s — unparseable result", espn_game_id)
                continue
            team_score, opp_score = parsed

     
_parse_date method · python · L231-L241 (11 LOC)
src/ncaa_eval/ingest/connectors/espn.py
    def _parse_date(value: object) -> datetime.date | None:
        """Best-effort date parsing from cbbpy game_day values."""
        if value is None or (isinstance(value, float) and pd.isna(value)):
            return None
        try:
            ts = pd.Timestamp(value)
            if pd.isna(ts):
                return None
            return cast("datetime.date", ts.date())
        except Exception:
            return None
_infer_loc method · python · L244-L268 (25 LOC)
src/ncaa_eval/ingest/connectors/espn.py
    def _infer_loc(
        row: pd.Series,
        team_tid: int,
        w_team_id: int,
    ) -> Literal["H", "A", "N"]:
        """Infer game location from available ESPN context.

        Falls back to ``"N"`` (neutral) when location cannot be determined.
        """
        # Some DataFrames include a 'home_away' or 'is_neutral' column.
        if "is_neutral" in row.index:
            val = row["is_neutral"]
            if val is True or str(val).lower() in ("true", "1", "yes"):
                return "N"

        if "home_away" in row.index:
            ha = str(row["home_away"]).lower()
            if ha == "home":
                # The row's team was home.
                return "H" if team_tid == w_team_id else "A"
            if ha == "away":
                return "A" if team_tid == w_team_id else "H"

        # Default to neutral when ambiguous.
        return "N"
_validate_columns function · python · L50-L55 (6 LOC)
src/ncaa_eval/ingest/connectors/kaggle.py
def _validate_columns(df: pd.DataFrame, expected: set[str], filename: str) -> None:
    """Ensure *df* contains every column named in *expected*.

    Raises:
        DataFormatError: If any expected column is absent; the message lists
            the missing names in sorted order together with *filename*.
    """
    absent = sorted(expected - set(df.columns))
    if not absent:
        return
    raise DataFormatError(f"kaggle: {filename} missing columns: {absent}")
KaggleConnector class · python · L63-L250 (188 LOC)
src/ncaa_eval/ingest/connectors/kaggle.py
class KaggleConnector(Connector):
    """Connector for Kaggle March Machine Learning Mania competition data.

    Args:
        extract_dir: Local directory where CSV files are downloaded/extracted.
        competition: Kaggle competition slug.
    """

    def __init__(
        self,
        extract_dir: Path,
        competition: str = "march-machine-learning-mania-2025",
    ) -> None:
        self._extract_dir = extract_dir
        self._competition = competition
        # Cache DayZero mapping {season_year: date} once loaded.
        self._day_zeros: dict[int, datetime.date] | None = None

    # -- network step -------------------------------------------------------

    def download(self, *, force: bool = False) -> None:
        """Download and extract competition CSV files via the Kaggle API.

        Args:
            force: Re-download even if files already exist.

        Raises:
            AuthenticationError: Credentials missing or invalid.
            NetworkError: Downlo
__init__ method · python · L71-L79 (9 LOC)
src/ncaa_eval/ingest/connectors/kaggle.py
    def __init__(
        self,
        extract_dir: Path,
        competition: str = "march-machine-learning-mania-2025",
    ) -> None:
        self._extract_dir = extract_dir
        self._competition = competition
        # Cache DayZero mapping {season_year: date} once loaded.
        self._day_zeros: dict[int, datetime.date] | None = None
download method · python · L83-L125 (43 LOC)
src/ncaa_eval/ingest/connectors/kaggle.py
    def download(self, *, force: bool = False) -> None:
        """Download and extract competition CSV files via the Kaggle API.

        Args:
            force: Re-download even if files already exist.

        Raises:
            AuthenticationError: Credentials missing or invalid.
            NetworkError: Download failed due to connection issues.
        """
        try:
            from kaggle.api.kaggle_api_extended import KaggleApi  # type: ignore[import-untyped]
        except ImportError as exc:
            msg = "kaggle: the 'kaggle' package is required. Install it with: pip install kaggle"
            raise ConnectorError(msg) from exc

        api = KaggleApi()
        try:
            api.authenticate()
        except Exception as exc:
            msg = (
                "kaggle: credentials not found. "
                "Save your API token to ~/.kaggle/access_token (see README for setup instructions)."
            )
            raise AuthenticationError(msg) from exc

 
_read_csv method · python · L129-L144 (16 LOC)
src/ncaa_eval/ingest/connectors/kaggle.py
    def _read_csv(self, filename: str) -> pd.DataFrame:
        """Read a CSV file from the extract directory.

        Raises:
            DataFormatError: File not found or unreadable.
        """
        path = self._extract_dir / filename
        if not path.exists():
            msg = f"kaggle: file not found: {path}"
            raise DataFormatError(msg)
        try:
            df: pd.DataFrame = pd.read_csv(path)
        except Exception as exc:
            msg = f"kaggle: failed to parse {filename}: {exc}"
            raise DataFormatError(msg) from exc
        return df
Repobility · code-quality intelligence platform · https://repobility.com
load_day_zeros method · python · L146-L160 (15 LOC)
src/ncaa_eval/ingest/connectors/kaggle.py
    def load_day_zeros(self) -> dict[int, datetime.date]:
        """Load and cache the season → DayZero mapping.

        Returns:
            Mapping of season year to the date of Day 0 for that season.
        """
        if self._day_zeros is not None:
            return self._day_zeros
        df = self._read_csv("MSeasons.csv")
        _validate_columns(df, _SEASONS_COLUMNS, "MSeasons.csv")
        mapping: dict[int, datetime.date] = {}
        for _, row in df.iterrows():
            mapping[int(row["Season"])] = datetime.datetime.strptime(str(row["DayZero"]), "%m/%d/%Y").date()
        self._day_zeros = mapping
        return mapping
fetch_teams method · python · L164-L168 (5 LOC)
src/ncaa_eval/ingest/connectors/kaggle.py
    def fetch_teams(self) -> list[Team]:
        """Build one Team model per row of ``MTeams.csv``."""
        teams_df = self._read_csv("MTeams.csv")
        _validate_columns(teams_df, _TEAMS_COLUMNS, "MTeams.csv")
        ids_and_names = zip(teams_df["TeamID"], teams_df["TeamName"])
        return [Team(team_id=int(tid), team_name=str(name)) for tid, name in ids_and_names]
fetch_team_spellings method · python · L170-L179 (10 LOC)
src/ncaa_eval/ingest/connectors/kaggle.py
    def fetch_team_spellings(self) -> dict[str, int]:
        """Build a lower-cased spelling → TeamID lookup from ``MTeamSpellings.csv``.

        Every alternate spelling listed for a team is included, giving much
        wider coverage than the canonical names in MTeams.csv when resolving
        ESPN team name strings to Kaggle IDs.
        """
        spellings_df = self._read_csv("MTeamSpellings.csv")
        _validate_columns(spellings_df, _SPELLINGS_COLUMNS, "MTeamSpellings.csv")
        lowered_spellings = spellings_df["TeamNameSpelling"].str.lower()
        team_ids = spellings_df["TeamID"].astype(int)
        return dict(zip(lowered_spellings, team_ids))
fetch_games method · python · L181-L196 (16 LOC)
src/ncaa_eval/ingest/connectors/kaggle.py
    def fetch_games(self, season: int) -> list[Game]:
        """Parse regular-season and tournament CSVs into Game models.

        Games from ``MRegularSeasonCompactResults.csv`` have
        ``is_tournament=False``; games from ``MNCAATourneyCompactResults.csv``
        have ``is_tournament=True``.
        """
        day_zeros = self.load_day_zeros()
        games: list[Game] = []
        games.extend(
            self._parse_games_csv("MRegularSeasonCompactResults.csv", season, day_zeros, is_tournament=False)
        )
        games.extend(
            self._parse_games_csv("MNCAATourneyCompactResults.csv", season, day_zeros, is_tournament=True)
        )
        return games
fetch_seasons method · python · L198-L202 (5 LOC)
src/ncaa_eval/ingest/connectors/kaggle.py
    def fetch_seasons(self) -> list[Season]:
        """Build one Season model per row of ``MSeasons.csv``."""
        seasons_df = self._read_csv("MSeasons.csv")
        _validate_columns(seasons_df, _SEASONS_COLUMNS, "MSeasons.csv")
        return [Season(year=int(season_year)) for season_year in seasons_df["Season"]]
_parse_games_csv method · python · L206-L250 (45 LOC)
src/ncaa_eval/ingest/connectors/kaggle.py
    def _parse_games_csv(
        self,
        filename: str,
        season: int,
        day_zeros: dict[int, datetime.date],
        *,
        is_tournament: bool,
    ) -> list[Game]:
        """Parse a single games CSV, filtering to *season*."""
        df = self._read_csv(filename)
        _validate_columns(df, _REGULAR_SEASON_COLUMNS, filename)
        df = df[df["Season"] == season]
        games: list[Game] = []
        for _, row in df.iterrows():
            s = int(row["Season"])
            day_num = int(row["DayNum"])
            w_team_id = int(row["WTeamID"])
            l_team_id = int(row["LTeamID"])

            game_date: datetime.date | None = None
            dz = day_zeros.get(s)
            if dz is not None:
                game_date = dz + datetime.timedelta(days=day_num)

            wloc = str(row["WLoc"])
            if wloc not in ("H", "A", "N"):
                msg = f"kaggle: {filename} has unexpected WLoc value: {wloc!r}"
                raise DataFo
Repository class · python · L28-L53 (26 LOC)
src/ncaa_eval/ingest/repository.py
class Repository(abc.ABC):
    """Abstract base class for NCAA data persistence.

    Declares the storage interface for the three entity kinds (teams, games,
    seasons): one read method and one save method each.  All six methods are
    abstract, so a concrete backend must implement every one of them.
    """

    @abc.abstractmethod
    def get_teams(self) -> list[Team]:
        """Return all stored teams.

        Returns:
            Every stored :class:`Team`.
        """

    @abc.abstractmethod
    def get_games(self, season: int) -> list[Game]:
        """Return all games for a given *season* year.

        Args:
            season: Season year whose games are requested.

        Returns:
            The stored :class:`Game` records for that season.
        """

    @abc.abstractmethod
    def get_seasons(self) -> list[Season]:
        """Return all stored seasons.

        Returns:
            Every stored :class:`Season`.
        """

    @abc.abstractmethod
    def save_teams(self, teams: list[Team]) -> None:
        """Persist a collection of teams (overwrite).

        Args:
            teams: Teams to store.
        """

    @abc.abstractmethod
    def save_games(self, games: list[Game]) -> None:
        """Persist a collection of games (overwrite per season partition).

        Args:
            games: Games to store.
        """

    @abc.abstractmethod
    def save_seasons(self, seasons: list[Season]) -> None:
        """Persist a collection of seasons (overwrite).

        Args:
            seasons: Seasons to store.
        """
_apply_model_defaults function · python · L93-L106 (14 LOC)
src/ncaa_eval/ingest/repository.py
def _apply_model_defaults(df: pd.DataFrame, model: type[Game]) -> None:
    """Fill null values in *df* with non-None Pydantic field defaults.

    When pyarrow unifies schemas across partitions that were written at
    different schema versions, columns present in newer partitions but absent
    in older ones are filled with null.  This helper re-applies the Pydantic
    model defaults so that ``model(**row)`` doesn't receive ``None`` for a
    field that expects a concrete default value.
    """
    sentinel: Any = ...  # PydanticUndefined is represented as Ellipsis
    for name, field_info in model.model_fields.items():
        default = field_info.default
        if name in df.columns and default is not sentinel and default is not None:
            df[name] = df[name].fillna(default)
If a scraper extracted this row, it came from Repobility (https://repobility.com)
ParquetRepository class · python · L109-L206 (98 LOC)
src/ncaa_eval/ingest/repository.py
class ParquetRepository(Repository):
    """Repository implementation backed by Parquet files.

    Directory layout::

        {base_path}/
            teams.parquet
            seasons.parquet
            games/
                season={year}/
                    data.parquet
    """

    def __init__(self, base_path: Path) -> None:
        self._base_path = base_path

    # -- reads ---------------------------------------------------------------

    def get_teams(self) -> list[Team]:
        path = self._base_path / "teams.parquet"
        if not path.exists():
            return []
        df = pd.read_parquet(path, engine="pyarrow")
        return [Team(**row) for row in df.to_dict(orient="records")]

    def get_games(self, season: int) -> list[Game]:
        games_dir = self._base_path / "games"
        if not games_dir.exists():
            return []

        dataset = ds.dataset(
            games_dir,
            format="parquet",
            partitioning=ds.partitioning(pa.s
get_teams method · python · L127-L132 (6 LOC)
src/ncaa_eval/ingest/repository.py
    def get_teams(self) -> list[Team]:
        """Load every persisted team from ``teams.parquet``.

        Returns:
            One ``Team`` per stored row, or an empty list when the file has
            not been written yet.
        """
        teams_file = self._base_path / "teams.parquet"
        if teams_file.exists():
            frame = pd.read_parquet(teams_file, engine="pyarrow")
            return [Team(**record) for record in frame.to_dict(orient="records")]
        return []
get_games method · python · L134-L155 (22 LOC)
src/ncaa_eval/ingest/repository.py
    def get_games(self, season: int) -> list[Game]:
        """Load the games stored under the hive partition for *season*.

        Args:
            season: Season year used to filter the partitioned dataset.

        Returns:
            One ``Game`` per stored row; an empty list when the ``games``
            directory is missing or the filter matches no rows.
        """
        root = self._base_path / "games"
        if not root.exists():
            return []

        hive = ds.partitioning(pa.schema([("season", pa.int64())]), flavor="hive")
        season_table = ds.dataset(root, format="parquet", partitioning=hive).to_table(
            filter=ds.field("season") == season
        )
        if season_table.num_rows == 0:
            return []

        frame = season_table.to_pandas()
        # Partitions written at different schema versions are unified by
        # pyarrow, which leaves nulls where an older file lacks a newer column.
        # Restore the model's non-None defaults so constructing Game rows does
        # not trip over those nulls.
        _apply_model_defaults(frame, Game)
        return [Game(**record) for record in frame.to_dict(orient="records")]
get_seasons method · python · L157-L162 (6 LOC)
src/ncaa_eval/ingest/repository.py
    def get_seasons(self) -> list[Season]:
        """Load every persisted season from ``seasons.parquet``.

        Returns:
            One ``Season`` per stored row, or an empty list when the file has
            not been written yet.
        """
        seasons_file = self._base_path / "seasons.parquet"
        if seasons_file.exists():
            records = pd.read_parquet(seasons_file, engine="pyarrow").to_dict(orient="records")
            return [Season(**record) for record in records]
        return []
save_teams method · python · L166-L174 (9 LOC)
src/ncaa_eval/ingest/repository.py
    def save_teams(self, teams: list[Team]) -> None:
        """Write *teams* to ``teams.parquet`` under the base path.

        An empty list is a no-op: nothing is created or written.

        Args:
            teams: Teams to store; one column is written per ``_TEAM_SCHEMA``
                field, read from the matching attribute of each team.
        """
        if teams:
            self._base_path.mkdir(parents=True, exist_ok=True)
            columns: dict[str, list[object]] = {}
            for column in _TEAM_SCHEMA.names:
                columns[column] = [getattr(team, column) for team in teams]
            team_table = pa.Table.from_pydict(columns, schema=_TEAM_SCHEMA)
            pq.write_table(team_table, self._base_path / "teams.parquet")
save_games method · python · L176-L196 (21 LOC)
src/ncaa_eval/ingest/repository.py
    def save_games(self, games: list[Game]) -> None:
        """Write *games* to one Parquet file per season partition.

        Games are grouped by ``season`` and each group is written to
        ``games/season={year}/data.parquet`` under the base path, replacing
        any file already at that location.  An empty list is a no-op.

        Args:
            games: Games to store; they may span several seasons.
        """
        if not games:
            return

        games_dir = self._base_path / "games"

        # The partition column lives in the directory name (pyarrow hive
        # partitioning), so it is dropped from the file schema.  The schema is
        # the same for every season, so build it once rather than per group.
        write_schema = pa.schema([f for f in _GAME_SCHEMA if f.name != "season"])
        column_names = write_schema.names

        # Group games by season for partitioned writes.
        by_season: dict[int, list[Game]] = {}
        for game in games:
            by_season.setdefault(game.season, []).append(game)

        for season_year, season_games in by_season.items():
            partition_dir = games_dir / f"season={season_year}"
            partition_dir.mkdir(parents=True, exist_ok=True)

            data = {name: [getattr(g, name) for g in season_games] for name in column_names}
            table = pa.Table.from_pydict(data, schema=write_schema)
            pq.write_table(table, partition_dir / "data.parquet")
save_seasons method · python · L198-L206 (9 LOC)
src/ncaa_eval/ingest/repository.py
    def save_seasons(self, seasons: list[Season]) -> None:
        """Write *seasons* to ``seasons.parquet`` under the base path.

        An empty list is a no-op: nothing is created or written.

        Args:
            seasons: Seasons to store; one column is written per
                ``_SEASON_SCHEMA`` field, read from the matching attribute.
        """
        if seasons:
            self._base_path.mkdir(parents=True, exist_ok=True)
            columns: dict[str, list[object]] = {}
            for column in _SEASON_SCHEMA.names:
                columns[column] = [getattr(entry, column) for entry in seasons]
            season_table = pa.Table.from_pydict(columns, schema=_SEASON_SCHEMA)
            pq.write_table(season_table, self._base_path / "seasons.parquet")
Team class · python · L17-L24 (8 LOC)
src/ncaa_eval/ingest/schema.py
class Team(BaseModel):
    """A college basketball team.

    Each field declares a PascalCase alias; ``populate_by_name=True`` lets the
    model also be built from the snake_case field names.
    """

    # Accept the field names as well as the aliases declared below.
    model_config = ConfigDict(populate_by_name=True)

    # Required identifier, constrained to be >= 1 (alias ``TeamID``).
    team_id: int = Field(..., ge=1, alias="TeamID")
    # Required, non-empty name (alias ``TeamName``).
    team_name: str = Field(..., min_length=1, alias="TeamName")
    # Optional; defaults to the empty string (alias ``CanonicalName``).
    canonical_name: str = Field(default="", alias="CanonicalName")
Repobility · MCP-ready · https://repobility.com
Season class · python · L27-L32 (6 LOC)
src/ncaa_eval/ingest/schema.py
class Season(BaseModel):
    """A single NCAA basketball season (identified by calendar year).

    ``populate_by_name=True`` lets the model be built from either the field
    name ``year`` or its alias ``Year``.
    """

    # Accept the field name as well as the alias declared below.
    model_config = ConfigDict(populate_by_name=True)

    # Required year, constrained to be >= 1985 (alias ``Year``).
    year: int = Field(..., ge=1985, alias="Year")
Game class · python · L35-L60 (26 LOC)
src/ncaa_eval/ingest/schema.py
class Game(BaseModel):
    """A single NCAA basketball game result."""

    model_config = ConfigDict(populate_by_name=True)

    game_id: str = Field(..., min_length=1, alias="GameID")
    season: int = Field(..., ge=1985, alias="Season")
    day_num: int = Field(..., ge=0, alias="DayNum")
    date: datetime.date | None = Field(default=None, alias="Date")
    w_team_id: int = Field(..., ge=1, alias="WTeamID")
    l_team_id: int = Field(..., ge=1, alias="LTeamID")
    w_score: int = Field(..., ge=0, alias="WScore")
    l_score: int = Field(..., ge=0, alias="LScore")
    loc: Literal["H", "A", "N"] = Field(..., alias="Loc")
    num_ot: int = Field(default=0, ge=0, alias="NumOT")
    is_tournament: bool = Field(default=False, alias="IsTournament")

    @model_validator(mode="after")
    def _check_game_integrity(self) -> Game:
        if self.w_score <= self.l_score:
            msg = f"w_score ({self.w_score}) must be greater than l_score ({self.l_score})"
            raise ValueError(m
_check_game_integrity method · python · L53-L60 (8 LOC)
src/ncaa_eval/ingest/schema.py
    def _check_game_integrity(self) -> Game:
        if self.w_score <= self.l_score:
            msg = f"w_score ({self.w_score}) must be greater than l_score ({self.l_score})"
            raise ValueError(msg)
        if self.w_team_id == self.l_team_id:
            msg = f"w_team_id and l_team_id must differ (both are {self.w_team_id})"
            raise ValueError(msg)
        return self
‹ prev · page 3 / 8 · next ›