Function bodies 354 total
BracketDistribution class · python · L688-L705 (18 LOC)src/ncaa_eval/evaluation/simulation.py
class BracketDistribution:
    """Score distribution statistics from Monte Carlo simulation.

    Bundles the raw per-simulation scores with the summary statistics that
    :func:`compute_bracket_distribution` derives from them.

    Attributes:
        scores: Raw per-simulation scores, shape ``(n_simulations,)``.
        percentiles: Mapping of percentile → value for keys 5, 25, 50, 75, 95.
        mean: Mean score across simulations.
        std: Standard deviation of scores.
        histogram_bins: Histogram bin edges, shape ``(n_bins + 1,)``.
        histogram_counts: Histogram counts, shape ``(n_bins,)``.
    """

    # The array handed to compute_bracket_distribution is stored as-is (no copy).
    scores: npt.NDArray[np.float64]
    # Keyed by the integer percentile level; values are plain Python floats.
    percentiles: dict[int, float]
    mean: float
    # compute_bracket_distribution fills this from np.std with default arguments.
    std: float
    # Edges and counts come from a single np.histogram call, so
    # len(histogram_bins) == len(histogram_counts) + 1.
    histogram_bins: npt.NDArray[np.float64]
    histogram_counts: npt.NDArray[np.int64]

# MostLikelyBracket class · python · L709-L727 (19 LOC)src/ncaa_eval/evaluation/simulation.py
class MostLikelyBracket:
    """Maximum-likelihood bracket from greedy traversal.

    Result container returned by :func:`compute_most_likely_bracket`.

    Attributes:
        winners: Tuple of team indices for each game's predicted winner,
            in **round-major order** matching ``SimulationResult.sim_winners``
            rows — all Round-of-64 games first (indices 0–31 for 64 teams),
            then Round-of-32 (32–47), through to the championship (index 62).
            63 entries for a 64-team bracket. Pass directly to
            :func:`score_bracket_against_sims` as ``chosen_bracket``.
        champion_team_id: Canonical team ID of the predicted champion
            (from BracketStructure.team_ids[champion_index]).
        log_likelihood: Sum of ``log(max(P[left, right], P[right, left]))``
            across all games.
    """

    # Entries are bracket-local team indices, not canonical team IDs.
    winners: tuple[int, ...]
    # Unlike ``winners``, this is a canonical ID rather than a bracket index.
    champion_team_id: int
    log_likelihood: float

# compute_advancement_probs function · python · L735-L793 (59 LOC)src/ncaa_eval/evaluation/simulation.py
def compute_advancement_probs(
bracket: BracketStructure,
P: npt.NDArray[np.float64],
) -> npt.NDArray[np.float64]:
"""Compute exact advancement probabilities via the Phylourny algorithm.
Post-order traversal of the bracket tree computing Win Probability
Vectors (WPVs) at each internal node using the formula:
``R = V ⊙ (P^T · W) + W ⊙ (P^T · V)``
Args:
bracket: Tournament bracket structure.
P: Pairwise win probability matrix, shape ``(n, n)``.
Returns:
Advancement probabilities, shape ``(n, n_rounds)``.
``adv_probs[i, r]`` = P(team i wins their game in round r).
Raises:
ValueError: If ``n`` is not a power of 2 or does not match
the bracket's team count.
"""
n = P.shape[0]
if n == 0 or (n & (n - 1)) != 0:
msg = f"n must be a positive power of 2, got {n}"
raise ValueError(msg)
expected_teams = len(bracket.team_ids)
if n != expected_teams:
msg = f"Pcompute_expected_points function · python · L796-L815 (20 LOC)src/ncaa_eval/evaluation/simulation.py
def compute_expected_points(
    adv_probs: npt.NDArray[np.float64],
    scoring_rule: ScoringRule,
) -> npt.NDArray[np.float64]:
    """Turn advancement probabilities into Expected Points per team.

    Each round's point value is looked up once from *scoring_rule*; the
    per-team expectation is then the probability-weighted sum over rounds.

    Args:
        adv_probs: Advancement probabilities, shape ``(n, n_rounds)``.
        scoring_rule: Scoring rule providing per-round point values.

    Returns:
        Expected Points per team, shape ``(n,)``.
    """
    round_count = adv_probs.shape[1]
    # One point value per round column of ``adv_probs``.
    per_round_points = np.empty(round_count, dtype=np.float64)
    for round_idx in range(round_count):
        per_round_points[round_idx] = scoring_rule.points_per_round(round_idx)
    expected: npt.NDArray[np.float64] = adv_probs @ per_round_points
    return expected

# compute_expected_points_seed_diff function · python · L818-L894 (77 LOC)src/ncaa_eval/evaluation/simulation.py
def compute_expected_points_seed_diff(
adv_probs: npt.NDArray[np.float64],
bracket: BracketStructure,
P: npt.NDArray[np.float64],
seed_map: dict[int, int],
) -> npt.NDArray[np.float64]:
"""Compute Expected Points with seed-difference upset bonus.
Extends standard EP by adding per-matchup seed-diff bonus. For each
internal bracket node at round *r*, the bonus contribution for team *i*
beating opponent *j* is::
P(i reaches node) * P(i beats j) * P(j reaches node) * bonus(seed_i, seed_j)
where ``bonus = |seed_i - seed_j|`` when ``seed_i > seed_j`` (upset), else 0.
Uses ``SeedDiffBonusScoring`` base points for standard round points and
a post-order traversal of the bracket tree (reusing WPVs from
:func:`compute_advancement_probs` logic) for bonus computation.
Args:
adv_probs: Advancement probabilities, shape ``(n, n_rounds)``.
bracket: Tournament bracket structure (for tree traversal).
P: Pairwise wincompute_most_likely_bracket function · python · L897-L963 (67 LOC)src/ncaa_eval/evaluation/simulation.py
def compute_most_likely_bracket(
bracket: BracketStructure,
P: npt.NDArray[np.float64],
) -> MostLikelyBracket:
"""Compute the maximum-likelihood bracket via greedy traversal.
At each internal node, picks the team with the higher win probability
(``argmax(P[left, right])``). Returns the full bracket of winners and
the log-likelihood of the chosen bracket.
The ``winners`` array is in **round-major order** — the same order as
``SimulationResult.sim_winners`` rows — so it can be passed directly to
:func:`score_bracket_against_sims`:
all Round-of-64 games first (indices 0–31), then Round-of-32 (32–47),
through to the championship game (index 62).
Args:
bracket: Tournament bracket structure.
P: Pairwise win probability matrix, shape ``(n, n)``.
Returns:
:class:`MostLikelyBracket` with winners, champion, and log-likelihood.
"""
log_likelihood = 0.0
# Collect (round_index, game_order_within_round, wincompute_bracket_distribution function · python · L966-L992 (27 LOC)src/ncaa_eval/evaluation/simulation.py
def compute_bracket_distribution(
    scores: npt.NDArray[np.float64],
    n_bins: int = 50,
) -> BracketDistribution:
    """Summarise raw Monte Carlo scores as a :class:`BracketDistribution`.

    Args:
        scores: Raw per-simulation scores, shape ``(n_simulations,)``.
        n_bins: Number of histogram bins (default 50).

    Returns:
        :class:`BracketDistribution` with percentiles, mean, std, and histogram.
    """
    levels = (5, 25, 50, 75, 95)
    level_values = np.percentile(scores, levels)
    # Convert NumPy scalars to plain floats so the mapping is JSON-friendly.
    percentiles = dict(zip(levels, map(float, level_values)))

    hist_counts, hist_edges = np.histogram(scores, bins=n_bins)

    score_mean = float(np.mean(scores))
    score_std = float(np.std(scores))
    return BracketDistribution(
        scores=scores,
        percentiles=percentiles,
        mean=score_mean,
        std=score_std,
        histogram_bins=hist_edges.astype(np.float64),
        histogram_counts=hist_counts.astype(np.int64),
    )
score_bracket_against_sims function · python · L995-L1034 (40 LOC)src/ncaa_eval/evaluation/simulation.py
def score_bracket_against_sims(
chosen_bracket: npt.NDArray[np.int32],
sim_winners: npt.NDArray[np.int32],
scoring_rules: Sequence[ScoringRule],
) -> dict[str, npt.NDArray[np.float64]]:
"""Score a chosen bracket against each simulated tournament outcome.
For each simulation, counts how many of the chosen bracket's picks
match the simulation's actual outcomes, weighted by round points.
Args:
chosen_bracket: Game winners for the chosen bracket, shape ``(n_games,)``.
sim_winners: Per-simulation game winners, shape ``(n_simulations, n_games)``.
scoring_rules: Scoring rules to score against.
Returns:
Mapping of ``rule_name → per-sim scores``, each shape ``(n_simulations,)``.
"""
n_games = chosen_bracket.shape[0]
n_rounds = int(np.log2(n_games + 1))
# Boolean match array: (n_simulations, n_games)
matches = sim_winners == chosen_bracket[None, :]
result: dict[str, npt.NDArray[np.float64]] = {}
for simulate_tournament_mc function · python · L1042-L1193 (152 LOC)src/ncaa_eval/evaluation/simulation.py
def simulate_tournament_mc( # noqa: PLR0913
bracket: BracketStructure,
P: npt.NDArray[np.float64],
scoring_rules: Sequence[ScoringRule],
season: int,
n_simulations: int = 10_000,
rng: np.random.Generator | None = None,
progress: bool = False,
) -> SimulationResult:
"""Vectorized Monte Carlo tournament simulation.
All N simulations run in parallel per round (no per-sim Python loops).
Pre-generates random numbers and uses fancy indexing for batch outcome
determination.
Args:
bracket: Tournament bracket structure (64 teams).
P: Pairwise win probability matrix, shape ``(n, n)``.
scoring_rules: Scoring rules to compute scores for.
season: Tournament season year.
n_simulations: Number of simulations (default 10,000).
rng: NumPy random generator for reproducibility.
progress: Display a tqdm progress bar for simulation rounds.
Returns:
:class:`SimulationResult` with MC-derive_collect_leaves function · python · L1196-L1210 (15 LOC)src/ncaa_eval/evaluation/simulation.py
def _collect_leaves(node: BracketNode) -> list[int]:
"""Collect leaf team indices in left-to-right order.
Args:
node: Root of the subtree.
Returns:
List of ``team_index`` values from leaf nodes.
"""
if node.is_leaf:
return [node.team_index]
if node.left is None or node.right is None:
msg = "Internal bracket node missing child — tree is malformed"
raise RuntimeError(msg)
return _collect_leaves(node.left) + _collect_leaves(node.right)simulate_tournament function · python · L1218-L1290 (73 LOC)src/ncaa_eval/evaluation/simulation.py
def simulate_tournament( # noqa: PLR0913
bracket: BracketStructure,
probability_provider: ProbabilityProvider,
context: MatchupContext,
scoring_rules: Sequence[ScoringRule] | None = None,
method: str = "analytical",
n_simulations: int = 10_000,
rng: np.random.Generator | None = None,
progress: bool = False,
) -> SimulationResult:
"""High-level tournament simulation orchestrator.
Dispatches to analytical (Phylourny) or Monte Carlo path based on
*method*.
Args:
bracket: Tournament bracket structure.
probability_provider: Provider for pairwise win probabilities.
context: Matchup context (season, day_num, neutral).
scoring_rules: Scoring rules for EP computation. Defaults to
:class:`StandardScoring` only.
method: ``"analytical"`` (default) or ``"monte_carlo"``.
n_simulations: Number of MC simulations (ignored for analytical).
rng: NumPy random generator (MC only).
CVFold class · python · L24-L35 (12 LOC)src/ncaa_eval/evaluation/splitter.py
class CVFold:
    """A single cross-validation fold.

    One item of the sequence yielded by :func:`walk_forward_splits`.

    Attributes:
        train: All games from seasons strictly before the test year.
        test: Tournament games only from the test year.
        year: The test season year.
    """

    # Training rows: every game from seasons earlier than ``year``.
    train: pd.DataFrame
    # Evaluation rows: tournament games of ``year`` only.
    test: pd.DataFrame
    year: int

# walk_forward_splits function · python · L38-L109 (72 LOC)src/ncaa_eval/evaluation/splitter.py
def walk_forward_splits(
seasons: Sequence[int],
feature_server: StatefulFeatureServer,
*,
mode: str = "batch",
) -> Iterator[CVFold]:
"""Generate walk-forward CV folds with Leave-One-Tournament-Out splits.
Args:
seasons: Ordered sequence of season years to include
(e.g., ``range(2008, 2026)``). Must contain at least 2 seasons.
feature_server: Configured StatefulFeatureServer for building feature
matrices.
mode: Feature serving mode: ``"batch"`` (stateless models) or
``"stateful"`` (sequential-update models like Elo).
Yields:
CVFold: For each eligible test year (skipping no-tournament years like
2020): ``train`` contains all games from seasons strictly before the
test year; ``test`` contains only tournament games from the test year;
``year`` is the test season year.
Raises:
ValueError: If ``seasons`` has fewer than 2 elements, or if ``mode``
Connector class · python · L41-L72 (32 LOC)src/ncaa_eval/ingest/connectors/base.py
class Connector(abc.ABC):
"""Abstract base class for NCAA data source connectors.
All connectors must implement :meth:`fetch_games`, which is the universal
capability. :meth:`fetch_teams` and :meth:`fetch_seasons` are optional
capabilities — subclasses that do not support them inherit the default
implementation, which raises ``NotImplementedError``. Callers should use
:func:`isinstance` checks or ``try``/``except NotImplementedError`` to
probe optional capabilities before calling them.
"""
@abc.abstractmethod
def fetch_games(self, season: int) -> list[Game]:
"""Fetch game results for a given *season* year."""
def fetch_teams(self) -> list[Team]:
"""Fetch team data from the source.
Optional capability — not all connectors provide team master data.
Raises:
NotImplementedError: If this connector does not support fetching teams.
"""
raise NotImplementedError(f"{type(self).__name__}fetch_teams method · python · L56-L63 (8 LOC)src/ncaa_eval/ingest/connectors/base.py
def fetch_teams(self) -> list[Team]:
    """Return team master data, if this connector can supply it.

    This default implementation marks the capability as unsupported;
    connectors that do provide team data override it.

    Raises:
        NotImplementedError: Always, naming the concrete connector class.
    """
    connector_name = type(self).__name__
    raise NotImplementedError(f"{connector_name} does not provide team data")
fetch_seasons method · python · L65-L72 (8 LOC)src/ncaa_eval/ingest/connectors/base.py
def fetch_seasons(self) -> list[Season]:
    """Return the seasons available from the source, if supported.

    This default implementation marks the capability as unsupported;
    connectors that do provide season data override it.

    Raises:
        NotImplementedError: Always, naming the concrete connector class.
    """
    connector_name = type(self).__name__
    raise NotImplementedError(f"{connector_name} does not provide season data")

# _parse_game_result function · python · L35-L51 (17 LOC)src/ncaa_eval/ingest/connectors/espn.py
def _parse_game_result(result_str: str) -> tuple[int, int] | None:
"""Parse a cbbpy ``game_result`` string like ``'W 75-60'``.
Returns ``(team_score, opponent_score)`` or ``None`` if unparseable.
"""
if not isinstance(result_str, str) or not result_str.strip():
return None
parts = result_str.strip().split()
if len(parts) != 2:
return None
scores = parts[1].split("-")
if len(scores) != 2:
return None
try:
return int(scores[0]), int(scores[1])
except ValueError:
return None_resolve_team_id function · python · L54-L85 (32 LOC)src/ncaa_eval/ingest/connectors/espn.py
def _resolve_team_id(
name: str,
lower_map: dict[str, int],
original_mapping: dict[str, int],
) -> int | None:
"""Resolve an ESPN team name to a Kaggle team ID.
Tries exact match first, then falls back to fuzzy matching via rapidfuzz.
Args:
name: ESPN team name to resolve.
lower_map: Pre-computed lowercase-keyed mapping (avoids per-call rebuild).
original_mapping: Original mapping with original-case keys (used for fuzzy).
"""
# Exact match (case-insensitive).
exact = lower_map.get(name.lower())
if exact is not None:
return exact
# Fuzzy match.
best_score = 0.0
best_id: int | None = None
for known_name, tid in original_mapping.items():
score = fuzz.token_set_ratio(name.lower(), known_name.lower())
if score > best_score:
best_score = score
best_id = tid
if best_score >= _FUZZY_THRESHOLD and best_id is not None:
return best_id
logger.warning("EspnConnector class · python · L88-L268 (181 LOC)src/ncaa_eval/ingest/connectors/espn.py
class EspnConnector(Connector):
"""Connector for ESPN game data via the cbbpy scraper.
Args:
team_name_to_id: Mapping from team name strings to Kaggle TeamIDs.
season_day_zeros: Mapping from season year to DayZero date.
"""
def __init__(
self,
team_name_to_id: dict[str, int],
season_day_zeros: dict[int, datetime.date],
) -> None:
self._team_name_to_id = team_name_to_id
self._season_day_zeros = season_day_zeros
# Pre-compute lowercase map once to avoid O(N×M) rebuilds during parsing.
self._lower_team_map: dict[str, int] = {k.lower(): v for k, v in team_name_to_id.items()}
# -- Games --------------------------------------------------------------
def fetch_games(self, season: int) -> list[Game]:
"""Fetch game results for *season* from ESPN via cbbpy.
Uses `get_team_schedule()` for each team in the mapping and
deduplicates by ESPN game ID.
"""
df __init__ method · python · L96-L104 (9 LOC)src/ncaa_eval/ingest/connectors/espn.py
def __init__(
self,
team_name_to_id: dict[str, int],
season_day_zeros: dict[int, datetime.date],
) -> None:
self._team_name_to_id = team_name_to_id
self._season_day_zeros = season_day_zeros
# Pre-compute lowercase map once to avoid O(N×M) rebuilds during parsing.
self._lower_team_map: dict[str, int] = {k.lower(): v for k, v in team_name_to_id.items()}fetch_games method · python · L108-L117 (10 LOC)src/ncaa_eval/ingest/connectors/espn.py
def fetch_games(self, season: int) -> list[Game]:
    """Return the games of *season* as fetched from ESPN via cbbpy.

    The schedule frame comes from ``_fetch_schedule_df`` and is converted
    by ``_parse_schedule_df``; a missing or empty frame yields an empty list.
    """
    schedule = self._fetch_schedule_df(season)
    if schedule is not None and not schedule.empty:
        return self._parse_schedule_df(schedule, season)
    return []

# _fetch_schedule_df method · python · L121-L131 (11 LOC)src/ncaa_eval/ingest/connectors/espn.py
def _fetch_schedule_df(self, season: int) -> pd.DataFrame | None:
"""Load a season schedule DataFrame from cbbpy via per-team schedules.
`get_games_season` is intentionally avoided here: it fetches
boxscores and play-by-play for every game (thousands of no-timeout
HTTP requests) and returns a game-info schema that is incompatible
with the schedule columns expected by `_parse_schedule_df`.
`get_team_schedule` returns the correct schedule-format schema
(`team`, `opponent`, `game_result`, …) with one request per team.
"""
return self._fetch_per_team(season)_fetch_per_team method · python · L133-L155 (23 LOC)src/ncaa_eval/ingest/connectors/espn.py
def _fetch_per_team(self, season: int) -> pd.DataFrame | None:
"""Fetch schedules for each team in the mapping and concatenate."""
frames: list[pd.DataFrame] = []
for team_name in self._team_name_to_id:
try:
df = ms.get_team_schedule(team_name, season)
if isinstance(df, pd.DataFrame) and not df.empty:
frames.append(df)
except Exception:
logger.debug("espn: get_team_schedule('%s', %d) failed", team_name, season)
continue
if not frames:
logger.warning(
"espn: all %d per-team schedule fetches failed for season %d — no data available",
len(self._team_name_to_id),
season,
)
return None
combined = pd.concat(frames, ignore_index=True)
# Deduplicate by ESPN game_id (each game appears in both teams' schedules).
if "game_id" in combined.columns:
Repobility analyzer · published findings · https://repobility.com
_parse_schedule_df method · python · L157-L228 (72 LOC)src/ncaa_eval/ingest/connectors/espn.py
def _parse_schedule_df(self, df: pd.DataFrame, season: int) -> list[Game]:
"""Convert a cbbpy schedule DataFrame into Game models."""
missing = _SCHEDULE_COLUMNS - set(df.columns)
if missing:
msg = f"espn: schedule DataFrame missing columns: {sorted(missing)}"
raise DataFormatError(msg)
day_zero = self._season_day_zeros.get(season)
games: list[Game] = []
seen_ids: set[str] = set()
for _, row in df.iterrows():
espn_game_id = str(row["game_id"])
game_id = f"espn_{espn_game_id}"
if game_id in seen_ids:
continue
seen_ids.add(game_id)
# Parse scores from game_result.
parsed = _parse_game_result(str(row.get("game_result", "")))
if parsed is None:
logger.debug("espn: skipping game %s — unparseable result", espn_game_id)
continue
team_score, opp_score = parsed
_parse_date method · python · L231-L241 (11 LOC)src/ncaa_eval/ingest/connectors/espn.py
def _parse_date(value: object) -> datetime.date | None:
"""Best-effort date parsing from cbbpy game_day values."""
if value is None or (isinstance(value, float) and pd.isna(value)):
return None
try:
ts = pd.Timestamp(value)
if pd.isna(ts):
return None
return cast("datetime.date", ts.date())
except Exception:
return None_infer_loc method · python · L244-L268 (25 LOC)src/ncaa_eval/ingest/connectors/espn.py
def _infer_loc(
row: pd.Series,
team_tid: int,
w_team_id: int,
) -> Literal["H", "A", "N"]:
"""Infer game location from available ESPN context.
Falls back to ``"N"`` (neutral) when location cannot be determined.
"""
# Some DataFrames include a 'home_away' or 'is_neutral' column.
if "is_neutral" in row.index:
val = row["is_neutral"]
if val is True or str(val).lower() in ("true", "1", "yes"):
return "N"
if "home_away" in row.index:
ha = str(row["home_away"]).lower()
if ha == "home":
# The row's team was home.
return "H" if team_tid == w_team_id else "A"
if ha == "away":
return "A" if team_tid == w_team_id else "H"
# Default to neutral when ambiguous.
return "N"_validate_columns function · python · L50-L55 (6 LOC)src/ncaa_eval/ingest/connectors/kaggle.py
def _validate_columns(df: pd.DataFrame, expected: set[str], filename: str) -> None:
"""Raise :class:`DataFormatError` if *df* is missing required columns."""
missing = expected - set(df.columns)
if missing:
msg = f"kaggle: {filename} missing columns: {sorted(missing)}"
raise DataFormatError(msg)KaggleConnector class · python · L63-L250 (188 LOC)src/ncaa_eval/ingest/connectors/kaggle.py
class KaggleConnector(Connector):
"""Connector for Kaggle March Machine Learning Mania competition data.
Args:
extract_dir: Local directory where CSV files are downloaded/extracted.
competition: Kaggle competition slug.
"""
def __init__(
self,
extract_dir: Path,
competition: str = "march-machine-learning-mania-2025",
) -> None:
self._extract_dir = extract_dir
self._competition = competition
# Cache DayZero mapping {season_year: date} once loaded.
self._day_zeros: dict[int, datetime.date] | None = None
# -- network step -------------------------------------------------------
def download(self, *, force: bool = False) -> None:
"""Download and extract competition CSV files via the Kaggle API.
Args:
force: Re-download even if files already exist.
Raises:
AuthenticationError: Credentials missing or invalid.
NetworkError: Downlo__init__ method · python · L71-L79 (9 LOC)src/ncaa_eval/ingest/connectors/kaggle.py
def __init__(
self,
extract_dir: Path,
competition: str = "march-machine-learning-mania-2025",
) -> None:
self._extract_dir = extract_dir
self._competition = competition
# Cache DayZero mapping {season_year: date} once loaded.
self._day_zeros: dict[int, datetime.date] | None = Nonedownload method · python · L83-L125 (43 LOC)src/ncaa_eval/ingest/connectors/kaggle.py
def download(self, *, force: bool = False) -> None:
"""Download and extract competition CSV files via the Kaggle API.
Args:
force: Re-download even if files already exist.
Raises:
AuthenticationError: Credentials missing or invalid.
NetworkError: Download failed due to connection issues.
"""
try:
from kaggle.api.kaggle_api_extended import KaggleApi # type: ignore[import-untyped]
except ImportError as exc:
msg = "kaggle: the 'kaggle' package is required. Install it with: pip install kaggle"
raise ConnectorError(msg) from exc
api = KaggleApi()
try:
api.authenticate()
except Exception as exc:
msg = (
"kaggle: credentials not found. "
"Save your API token to ~/.kaggle/access_token (see README for setup instructions)."
)
raise AuthenticationError(msg) from exc
_read_csv method · python · L129-L144 (16 LOC)src/ncaa_eval/ingest/connectors/kaggle.py
def _read_csv(self, filename: str) -> pd.DataFrame:
"""Read a CSV file from the extract directory.
Raises:
DataFormatError: File not found or unreadable.
"""
path = self._extract_dir / filename
if not path.exists():
msg = f"kaggle: file not found: {path}"
raise DataFormatError(msg)
try:
df: pd.DataFrame = pd.read_csv(path)
except Exception as exc:
msg = f"kaggle: failed to parse {filename}: {exc}"
raise DataFormatError(msg) from exc
return dfRepobility · code-quality intelligence platform · https://repobility.com
load_day_zeros method · python · L146-L160 (15 LOC)src/ncaa_eval/ingest/connectors/kaggle.py
def load_day_zeros(self) -> dict[int, datetime.date]:
    """Return the season → DayZero mapping, reading it on first use.

    The mapping is parsed from ``MSeasons.csv`` once and then served from
    the instance cache on every later call.

    Returns:
        Mapping of season year to the date of Day 0 for that season.
    """
    cached = self._day_zeros
    if cached is not None:
        return cached

    seasons_df = self._read_csv("MSeasons.csv")
    _validate_columns(seasons_df, _SEASONS_COLUMNS, "MSeasons.csv")

    date_format = "%m/%d/%Y"
    day_zeros: dict[int, datetime.date] = {
        int(record["Season"]): datetime.datetime.strptime(str(record["DayZero"]), date_format).date()
        for _, record in seasons_df.iterrows()
    }
    self._day_zeros = day_zeros
    return day_zeros

# fetch_teams method · python · L164-L168 (5 LOC)src/ncaa_eval/ingest/connectors/kaggle.py
def fetch_teams(self) -> list[Team]:
    """Build Team models from the rows of ``MTeams.csv``."""
    teams_df = self._read_csv("MTeams.csv")
    _validate_columns(teams_df, _TEAMS_COLUMNS, "MTeams.csv")
    teams: list[Team] = []
    for _, record in teams_df.iterrows():
        team = Team(team_id=int(record["TeamID"]), team_name=str(record["TeamName"]))
        teams.append(team)
    return teams

# fetch_team_spellings method · python · L170-L179 (10 LOC)src/ncaa_eval/ingest/connectors/kaggle.py
def fetch_team_spellings(self) -> dict[str, int]:
    """Build a spelling → TeamID lookup from ``MTeamSpellings.csv``.

    Every alternate spelling is included, lower-cased. This covers far more
    name variants than the canonical names in MTeams.csv, which helps when
    resolving ESPN team name strings to Kaggle IDs.
    """
    spellings_df = self._read_csv("MTeamSpellings.csv")
    _validate_columns(spellings_df, _SPELLINGS_COLUMNS, "MTeamSpellings.csv")
    lowered_names = spellings_df["TeamNameSpelling"].str.lower()
    team_ids = spellings_df["TeamID"].astype(int)
    return dict(zip(lowered_names, team_ids))

# fetch_games method · python · L181-L196 (16 LOC)src/ncaa_eval/ingest/connectors/kaggle.py
def fetch_games(self, season: int) -> list[Game]:
    """Return regular-season games followed by tournament games for *season*.

    Rows from ``MRegularSeasonCompactResults.csv`` are parsed with
    ``is_tournament=False`` and rows from ``MNCAATourneyCompactResults.csv``
    with ``is_tournament=True``.
    """
    day_zeros = self.load_day_zeros()
    regular_season = self._parse_games_csv(
        "MRegularSeasonCompactResults.csv", season, day_zeros, is_tournament=False
    )
    tournament = self._parse_games_csv(
        "MNCAATourneyCompactResults.csv", season, day_zeros, is_tournament=True
    )
    return [*regular_season, *tournament]

# fetch_seasons method · python · L198-L202 (5 LOC)src/ncaa_eval/ingest/connectors/kaggle.py
def fetch_seasons(self) -> list[Season]:
    """Build Season models from the rows of ``MSeasons.csv``."""
    seasons_df = self._read_csv("MSeasons.csv")
    _validate_columns(seasons_df, _SEASONS_COLUMNS, "MSeasons.csv")
    seasons: list[Season] = []
    for _, record in seasons_df.iterrows():
        seasons.append(Season(year=int(record["Season"])))
    return seasons

# _parse_games_csv method · python · L206-L250 (45 LOC)src/ncaa_eval/ingest/connectors/kaggle.py
def _parse_games_csv(
self,
filename: str,
season: int,
day_zeros: dict[int, datetime.date],
*,
is_tournament: bool,
) -> list[Game]:
"""Parse a single games CSV, filtering to *season*."""
df = self._read_csv(filename)
_validate_columns(df, _REGULAR_SEASON_COLUMNS, filename)
df = df[df["Season"] == season]
games: list[Game] = []
for _, row in df.iterrows():
s = int(row["Season"])
day_num = int(row["DayNum"])
w_team_id = int(row["WTeamID"])
l_team_id = int(row["LTeamID"])
game_date: datetime.date | None = None
dz = day_zeros.get(s)
if dz is not None:
game_date = dz + datetime.timedelta(days=day_num)
wloc = str(row["WLoc"])
if wloc not in ("H", "A", "N"):
msg = f"kaggle: {filename} has unexpected WLoc value: {wloc!r}"
raise DataFoRepository class · python · L28-L53 (26 LOC)src/ncaa_eval/ingest/repository.py
class Repository(abc.ABC):
    """Storage interface for NCAA teams, games and seasons.

    Concrete repositories must implement all six methods below; the class
    itself cannot be instantiated.
    """

    @abc.abstractmethod
    def get_teams(self) -> list[Team]:
        """Load every team held by the store."""

    @abc.abstractmethod
    def get_games(self, season: int) -> list[Game]:
        """Load every game belonging to the given *season* year."""

    @abc.abstractmethod
    def get_seasons(self) -> list[Season]:
        """Load every season held by the store."""

    @abc.abstractmethod
    def save_teams(self, teams: list[Team]) -> None:
        """Write *teams* to the store, replacing what was there (overwrite)."""

    @abc.abstractmethod
    def save_games(self, games: list[Game]) -> None:
        """Write *games* to the store (overwrite per season partition)."""

    @abc.abstractmethod
    def save_seasons(self, seasons: list[Season]) -> None:
        """Write *seasons* to the store, replacing what was there (overwrite)."""

# _apply_model_defaults function · python · L93-L106 (14 LOC)src/ncaa_eval/ingest/repository.py
def _apply_model_defaults(df: pd.DataFrame, model: type[Game]) -> None:
"""Fill null values in *df* with non-None Pydantic field defaults.
When pyarrow unifies schemas across partitions that were written at
different schema versions, columns present in newer partitions but absent
in older ones are filled with null. This helper re-applies the Pydantic
model defaults so that ``model(**row)`` doesn't receive ``None`` for a
field that expects a concrete default value.
"""
sentinel: Any = ... # PydanticUndefined is represented as Ellipsis
for name, field_info in model.model_fields.items():
default = field_info.default
if name in df.columns and default is not sentinel and default is not None:
df[name] = df[name].fillna(default)If a scraper extracted this row, it came from Repobility (https://repobility.com)
ParquetRepository class · python · L109-L206 (98 LOC)src/ncaa_eval/ingest/repository.py
class ParquetRepository(Repository):
"""Repository implementation backed by Parquet files.
Directory layout::
{base_path}/
teams.parquet
seasons.parquet
games/
season={year}/
data.parquet
"""
def __init__(self, base_path: Path) -> None:
self._base_path = base_path
# -- reads ---------------------------------------------------------------
def get_teams(self) -> list[Team]:
path = self._base_path / "teams.parquet"
if not path.exists():
return []
df = pd.read_parquet(path, engine="pyarrow")
return [Team(**row) for row in df.to_dict(orient="records")]
def get_games(self, season: int) -> list[Game]:
games_dir = self._base_path / "games"
if not games_dir.exists():
return []
dataset = ds.dataset(
games_dir,
format="parquet",
partitioning=ds.partitioning(pa.sget_teams method · python · L127-L132 (6 LOC)src/ncaa_eval/ingest/repository.py
def get_teams(self) -> list[Team]:
    """Load all teams from ``teams.parquet``; empty list if the file is absent."""
    teams_file = self._base_path / "teams.parquet"
    if teams_file.exists():
        frame = pd.read_parquet(teams_file, engine="pyarrow")
        return [Team(**record) for record in frame.to_dict(orient="records")]
    return []

# get_games method · python · L134-L155 (22 LOC)src/ncaa_eval/ingest/repository.py
def get_games(self, season: int) -> list[Game]:
    """Load the games stored for *season* from the hive-partitioned dataset.

    Returns an empty list when the ``games`` directory does not exist or the
    season has no rows.
    """
    games_root = self._base_path / "games"
    if not games_root.exists():
        return []

    hive_layout = ds.partitioning(pa.schema([("season", pa.int64())]), flavor="hive")
    dataset = ds.dataset(games_root, format="parquet", partitioning=hive_layout)
    season_table = dataset.to_table(filter=ds.field("season") == season)
    if season_table.num_rows == 0:
        return []

    frame = season_table.to_pandas()
    # Schema evolution: partitions written before a column existed come back
    # with nulls in that column once pyarrow unifies the schemas. Restore the
    # model's non-None defaults there so building the models does not choke
    # on unexpected nulls.
    _apply_model_defaults(frame, Game)
    return [Game(**record) for record in frame.to_dict(orient="records")]

# get_seasons method · python · L157-L162 (6 LOC)src/ncaa_eval/ingest/repository.py
def get_seasons(self) -> list[Season]:
    """Load all seasons from ``seasons.parquet``; empty list if the file is absent."""
    seasons_file = self._base_path / "seasons.parquet"
    if seasons_file.exists():
        frame = pd.read_parquet(seasons_file, engine="pyarrow")
        return [Season(**record) for record in frame.to_dict(orient="records")]
    return []

# save_teams method · python · L166-L174 (9 LOC)src/ncaa_eval/ingest/repository.py
def save_teams(self, teams: list[Team]) -> None:
    """Write *teams* to ``teams.parquet``; an empty list writes nothing."""
    if not teams:
        return
    self._base_path.mkdir(parents=True, exist_ok=True)
    # One column per schema field, read off each team by attribute name.
    columns = {}
    for column in _TEAM_SCHEMA.names:
        columns[column] = [getattr(team, column) for team in teams]
    table = pa.Table.from_pydict(columns, schema=_TEAM_SCHEMA)
    pq.write_table(table, self._base_path / "teams.parquet")

# save_games method · python · L176-L196 (21 LOC)src/ncaa_eval/ingest/repository.py
def save_games(self, games: list[Game]) -> None:
    """Write *games* as one ``data.parquet`` per ``season=<year>`` directory.

    An empty list writes nothing.
    """
    if not games:
        return
    games_root = self._base_path / "games"

    # Bucket the games by season so each partition is written in one go.
    by_season: dict[int, list[Game]] = {}
    for game in games:
        if game.season not in by_season:
            by_season[game.season] = []
        by_season[game.season].append(game)

    # The partition column is carried by the directory name (hive layout),
    # so it is left out of the file schema.
    file_schema = pa.schema([f for f in _GAME_SCHEMA if f.name != "season"])

    for year, bucket in by_season.items():
        target_dir = games_root / f"season={year}"
        target_dir.mkdir(parents=True, exist_ok=True)
        columns = {col.name: [getattr(item, col.name) for item in bucket] for col in file_schema}
        table = pa.Table.from_pydict(columns, schema=file_schema)
        pq.write_table(table, target_dir / "data.parquet")

# save_seasons method · python · L198-L206 (9 LOC)src/ncaa_eval/ingest/repository.py
def save_seasons(self, seasons: list[Season]) -> None:
    """Write *seasons* to ``seasons.parquet``; an empty list writes nothing."""
    if not seasons:
        return
    self._base_path.mkdir(parents=True, exist_ok=True)
    # One column per schema field, read off each season by attribute name.
    columns = {}
    for column in _SEASON_SCHEMA.names:
        columns[column] = [getattr(season, column) for season in seasons]
    table = pa.Table.from_pydict(columns, schema=_SEASON_SCHEMA)
    pq.write_table(table, self._base_path / "seasons.parquet")

# Team class · python · L17-L24 (8 LOC)src/ncaa_eval/ingest/schema.py
class Team(BaseModel):
    """A college basketball team.

    Attributes:
        team_id: Integer team identifier, at least 1 (alias ``TeamID``).
        team_name: Non-empty team name (alias ``TeamName``).
        canonical_name: Canonical name; defaults to the empty string
            (alias ``CanonicalName``).
    """

    # populate_by_name: fields may be supplied by field name as well as by alias.
    model_config = ConfigDict(populate_by_name=True)

    team_id: int = Field(..., ge=1, alias="TeamID")
    team_name: str = Field(..., min_length=1, alias="TeamName")
    canonical_name: str = Field(default="", alias="CanonicalName")
Season class · python · L27-L32 (6 LOC)src/ncaa_eval/ingest/schema.py
class Season(BaseModel):
    """A single NCAA basketball season (identified by calendar year).

    Attributes:
        year: Season year, 1985 or later (alias ``Year``).
    """

    # populate_by_name: the field may be supplied by field name as well as by alias.
    model_config = ConfigDict(populate_by_name=True)

    year: int = Field(..., ge=1985, alias="Year")

# Game class · python · L35-L60 (26 LOC)src/ncaa_eval/ingest/schema.py
class Game(BaseModel):
"""A single NCAA basketball game result."""
model_config = ConfigDict(populate_by_name=True)
game_id: str = Field(..., min_length=1, alias="GameID")
season: int = Field(..., ge=1985, alias="Season")
day_num: int = Field(..., ge=0, alias="DayNum")
date: datetime.date | None = Field(default=None, alias="Date")
w_team_id: int = Field(..., ge=1, alias="WTeamID")
l_team_id: int = Field(..., ge=1, alias="LTeamID")
w_score: int = Field(..., ge=0, alias="WScore")
l_score: int = Field(..., ge=0, alias="LScore")
loc: Literal["H", "A", "N"] = Field(..., alias="Loc")
num_ot: int = Field(default=0, ge=0, alias="NumOT")
is_tournament: bool = Field(default=False, alias="IsTournament")
@model_validator(mode="after")
def _check_game_integrity(self) -> Game:
if self.w_score <= self.l_score:
msg = f"w_score ({self.w_score}) must be greater than l_score ({self.l_score})"
raise ValueError(m_check_game_integrity method · python · L53-L60 (8 LOC)src/ncaa_eval/ingest/schema.py
def _check_game_integrity(self) -> Game:
if self.w_score <= self.l_score:
msg = f"w_score ({self.w_score}) must be greater than l_score ({self.l_score})"
raise ValueError(msg)
if self.w_team_id == self.l_team_id:
msg = f"w_team_id and l_team_id must differ (both are {self.w_team_id})"
raise ValueError(msg)
return self