Function bodies 354 total
_build_espn_team_map function · python · L40-L107 (68 LOC)src/ncaa_eval/ingest/sync.py
def _build_espn_team_map(year: int, spellings: dict[str, int]) -> dict[str, int]:
"""Build ESPN location-name → Kaggle TeamID mapping via cbbpy's bundled team map.
cbbpy ships a `mens_team_map.csv` that lists every D-I team per season
with the `location` name ESPN uses internally (e.g. `"UC Santa Barbara"`,
`"Florida Gulf Coast"`). Using these location names as the keys in
`team_name_to_id` means:
* `_fetch_per_team` queries cbbpy with *exact* ESPN names → no wrong
fuzzy match (avoids `"california-santa-barbara"` → `"California"`).
* The schedule DataFrame's `team`/`opponent` columns also use these
location names → `_resolve_team_id` can do direct dict lookups.
Each ESPN location is resolved to a Kaggle ID by exact lookup in the
Kaggle spellings dict (lowercased). A token-set-ratio fuzzy fallback
handles any locations not covered by the spellings.
Falls back to the latest available season in the map if *year* is absent
(SyncResult class · python · L111-L118 (8 LOC)src/ncaa_eval/ingest/sync.py
class SyncResult:
    """Summary of a single source sync operation.

    One instance is created per source sync (``sync_kaggle`` and ``sync_espn``
    each build one and fill in the counters).  All counters default to zero.
    """

    # Name of the data source the result describes ("kaggle" or "espn").
    source: str
    # Number of team records written (sync_kaggle stores ``len(teams)`` here).
    teams_written: int = 0
    # Number of seasons written — presumably season records; TODO confirm.
    seasons_written: int = 0
    # Number of games written.
    games_written: int = 0
    # Number of seasons found already cached — presumably skipped; TODO confirm.
    seasons_cached: int = 0
# SyncEngine class · python · L121-L269 (149 LOC)src/ncaa_eval/ingest/sync.py
class SyncEngine:
"""Orchestrates data sync from external sources into the local repository.
Args:
repository: Repository instance used for reading and writing data.
data_dir: Root directory for local Parquet files and cached CSVs.
"""
def __init__(self, repository: Repository, data_dir: Path) -> None:
self._repo = repository
self._data_dir = data_dir
def _espn_marker(self, year: int) -> Path:
"""Return the path of the ESPN sync marker file for *year*."""
return self._data_dir / f".espn_synced_{year}"
def sync_kaggle(self, force_refresh: bool = False) -> SyncResult:
"""Sync NCAA data from Kaggle with Parquet-level caching.
Downloads CSVs (if not cached) and converts them to Parquet.
Skips individual entities whose Parquet files already exist,
unless *force_refresh* is ``True``.
Args:
force_refresh: Bypass all caches and re-fetch everything.
Returns:__init__ method · python · L129-L131 (3 LOC)src/ncaa_eval/ingest/sync.py
def __init__(self, repository: Repository, data_dir: Path) -> None:
self._repo = repository
self._data_dir = data_dir_espn_marker method · python · L133-L135 (3 LOC)src/ncaa_eval/ingest/sync.py
def _espn_marker(self, year: int) -> Path:
"""Return the path of the ESPN sync marker file for *year*."""
return self._data_dir / f".espn_synced_{year}"sync_kaggle method · python · L137-L187 (51 LOC)src/ncaa_eval/ingest/sync.py
def sync_kaggle(self, force_refresh: bool = False) -> SyncResult:
"""Sync NCAA data from Kaggle with Parquet-level caching.
Downloads CSVs (if not cached) and converts them to Parquet.
Skips individual entities whose Parquet files already exist,
unless *force_refresh* is ``True``.
Args:
force_refresh: Bypass all caches and re-fetch everything.
Returns:
SyncResult summarising teams/seasons/games written and cached.
"""
result = SyncResult(source="kaggle")
connector = KaggleConnector(extract_dir=self._data_dir / "kaggle")
connector.download(force=force_refresh) # CSV-level cache
# Teams: Parquet-level cache
teams_path = self._data_dir / "teams.parquet"
if force_refresh or not teams_path.exists():
teams = connector.fetch_teams()
self._repo.save_teams(teams)
result.teams_written = len(teams)
typer.echo(f"[kasync_espn method · python · L189-L256 (68 LOC)src/ncaa_eval/ingest/sync.py
def sync_espn(self, force_refresh: bool = False) -> SyncResult:
"""Sync the most recent season's games from ESPN.
Requires Kaggle data to be synced first (needs team and season
mappings). Uses a marker-file cache: if ``.espn_synced_{year}``
exists the season is considered up-to-date unless *force_refresh*.
ESPN games are merged with existing Kaggle games for the same
season partition before saving (because ``save_games`` overwrites).
Args:
force_refresh: Bypass marker-file cache and re-fetch from ESPN.
Returns:
SyncResult summarising games written and seasons cached.
Raises:
RuntimeError: Kaggle data has not been synced yet.
"""
result = SyncResult(source="espn")
teams = self._repo.get_teams()
seasons = self._repo.get_seasons()
if not teams or not seasons:
raise RuntimeError(
"ESPN sync requires Kaggle About: code-quality intelligence by Repobility · https://repobility.com
sync_all method · python · L258-L269 (12 LOC)src/ncaa_eval/ingest/sync.py
def sync_all(self, force_refresh: bool = False) -> list[SyncResult]:
    """Run every configured sync: Kaggle first, then ESPN.

    Args:
        force_refresh: Bypass caches for all sources.

    Returns:
        List of SyncResult, one per source, ordered ``[kaggle, espn]``.
    """
    # List displays evaluate left to right, so Kaggle always runs before ESPN.
    return [
        self.sync_kaggle(force_refresh),
        self.sync_espn(force_refresh),
    ]
# ModelConfig class · python · L21-L27 (7 LOC)src/ncaa_eval/model/base.py
class ModelConfig(BaseModel):
    """Base configuration shared by all model implementations.

    Subclasses add model-specific hyperparameters as additional fields.
    """

    # Identifier of the model type.  The config subclasses in this codebase
    # narrow it to a ``Literal`` with a default (e.g. ``"elo"``).
    model_name: str
# Model class · python · L30-L62 (33 LOC)src/ncaa_eval/model/base.py
class Model(abc.ABC):
"""Abstract base class for all NCAA prediction models.
Every model — stateful or stateless — must implement these five
methods so that the training CLI, evaluation engine, and persistence
layer can treat all models uniformly.
"""
@abc.abstractmethod
def fit(self, X: pd.DataFrame, y: pd.Series) -> None:
"""Train the model on feature matrix *X* and labels *y*."""
...
@abc.abstractmethod
def predict_proba(self, X: pd.DataFrame) -> pd.Series:
"""Return P(team_a wins) in [0, 1] for each row of *X*."""
...
@abc.abstractmethod
def save(self, path: Path) -> None:
"""Persist the trained model to *path*."""
...
@classmethod
@abc.abstractmethod
def load(cls, path: Path) -> Self:
"""Load a previously-saved model from *path*."""
...
@abc.abstractmethod
def get_config(self) -> ModelConfig:
"""Return the Pydantic-validated configuration fofit method · python · L39-L41 (3 LOC)src/ncaa_eval/model/base.py
def fit(self, X: pd.DataFrame, y: pd.Series) -> None:
    """Train the model on feature matrix *X* and labels *y*.

    Interface stub on the ``Model`` base class: the body is a bare ``...``
    placeholder, so concrete models supply the training logic.

    Parameters
    ----------
    X : pd.DataFrame
        Feature matrix.
    y : pd.Series
        Labels for the rows of *X*.
    """
    ...
# predict_proba method · python · L44-L46 (3 LOC)src/ncaa_eval/model/base.py
def predict_proba(self, X: pd.DataFrame) -> pd.Series:
    """Return P(team_a wins) in [0, 1] for each row of *X*.

    Interface stub on the ``Model`` base class: the body is a bare ``...``
    placeholder, so concrete models supply the prediction logic.

    Parameters
    ----------
    X : pd.DataFrame
        Feature matrix, one row per matchup to score.

    Returns
    -------
    pd.Series
        One probability per row of *X*.
    """
    ...
# save method · python · L49-L51 (3 LOC)src/ncaa_eval/model/base.py
def save(self, path: Path) -> None:
    """Persist the trained model to *path*.

    Interface stub on the ``Model`` base class: the body is a bare ``...``
    placeholder.  The concrete models in this codebase treat *path* as a
    directory and write their artifacts inside it.
    """
    ...
# load method · python · L55-L57 (3 LOC)src/ncaa_eval/model/base.py
def load(cls, path: Path) -> Self:
    """Load a previously-saved model from *path*.

    Interface stub on the ``Model`` base class (declared there as an abstract
    classmethod): the body is a bare ``...`` placeholder.  Implementations
    return a new instance of ``cls`` — the counterpart of ``save``.
    """
    ...
# get_config method · python · L60-L62 (3 LOC)src/ncaa_eval/model/base.py
def get_config(self) -> ModelConfig:
    """Return the Pydantic-validated configuration for this model.

    Interface stub on the ``Model`` base class: the body is a bare ``...``
    placeholder, so concrete models return their own ``ModelConfig`` subclass.
    """
    ...
# Generated by Repobility's multi-pass static-analysis pipeline (https://repobility.com)
StatefulModel class · python · L71-L198 (128 LOC)src/ncaa_eval/model/base.py
class StatefulModel(Model):
"""Template base for models that process games sequentially.
Concrete methods ``fit`` and ``predict_proba`` are provided as
template methods. Subclasses implement the abstract hooks:
* ``update(game)`` — absorb a single game result
* ``_predict_one(team_a_id, team_b_id)`` — return P(team_a wins)
* ``start_season(season)`` — reset / prepare for a new season
* ``get_state()`` / ``set_state(state)`` — snapshot / restore ratings
"""
# ------------------------------------------------------------------
# Concrete template methods
# ------------------------------------------------------------------
def fit(self, X: pd.DataFrame, y: pd.Series) -> None:
"""Reconstruct games from *X*/*y* and update sequentially."""
games = self._to_games(X, y)
current_season: int | None = None
for game in games:
if game.season != current_season:
self.start_season(game.seasofit method · python · L87-L95 (9 LOC)src/ncaa_eval/model/base.py
def fit(self, X: pd.DataFrame, y: pd.Series) -> None:
    """Replay the games encoded in *X*/*y* through the model, in order.

    ``start_season`` is invoked whenever a game's season differs from the
    season of the game before it; ``update`` is invoked for every game.
    """
    season_in_progress: int | None = None
    for played in self._to_games(X, y):
        entering_new_season = played.season != season_in_progress
        if entering_new_season:
            self.start_season(played.season)
            season_in_progress = played.season
        self.update(played)
# predict_proba method · python · L97-L100 (4 LOC)src/ncaa_eval/model/base.py
def predict_proba(self, X: pd.DataFrame) -> pd.Series:
    """Return P(team_a wins) for every row of *X* via ``_predict_one``.

    Parameters
    ----------
    X : pd.DataFrame
        Must contain ``team_a_id`` and ``team_b_id`` columns; no other column
        is read.

    Returns
    -------
    pd.Series
        Float predictions aligned to ``X.index``.  An empty *X* yields an
        empty ``float64`` Series instead of falling back to pandas' default
        dtype for empty data.
    """
    # Iterating just the two ID columns avoids building a per-row namedtuple
    # of every feature column, which ``itertuples`` would do.
    preds: list[float] = [
        self._predict_one(team_a_id, team_b_id)
        for team_a_id, team_b_id in zip(X["team_a_id"], X["team_b_id"])
    ]
    # An explicit dtype keeps the result numeric even when *X* has no rows.
    return pd.Series(preds, index=X.index, dtype=float)
# _to_games method · python · L107-L169 (63 LOC)src/ncaa_eval/model/base.py
def _to_games(X: pd.DataFrame, y: pd.Series) -> list[Game]:
"""Reconstruct :class:`Game` objects from the feature DataFrame.
Parameters
----------
X : pd.DataFrame
Feature matrix with columns: ``team_a_id``, ``team_b_id``,
``season``, ``day_num``, ``date``, ``loc_encoding``,
``game_id``, ``is_tournament``. Optionally ``w_score``,
``l_score``, ``num_ot``.
y : pd.Series
Binary label — ``1`` (or ``True``) means team_a won.
"""
# Hoist column-existence checks outside the loop (O(1) each, not O(n))
has_scores = "w_score" in X.columns and "l_score" in X.columns
has_num_ot = "num_ot" in X.columns
games: list[Game] = []
for row in X.itertuples():
idx = row.Index
team_a_won = bool(y.loc[idx])
team_a_id = int(row.team_a_id)
team_b_id = int(row.team_b_id)
if team_a_won:
_predict_one method · python · L176-L178 (3 LOC)src/ncaa_eval/model/base.py
def _predict_one(self, team_a_id: int, team_b_id: int) -> float:
    """Return P(team_a wins) given team IDs.

    Abstract hook of ``StatefulModel``: the body is a bare ``...``
    placeholder.  The template ``predict_proba`` calls it once per row with
    that row's ``team_a_id`` and ``team_b_id``.
    """
    ...
# update method · python · L181-L183 (3 LOC)src/ncaa_eval/model/base.py
def update(self, game: Game) -> None:
    """Absorb the result of a single game.

    Abstract hook of ``StatefulModel``: the body is a bare ``...``
    placeholder.  The template ``fit`` calls it once for every reconstructed
    game, in iteration order.
    """
    ...
# start_season method · python · L186-L188 (3 LOC)src/ncaa_eval/model/base.py
def start_season(self, season: int) -> None:
    """Called before the first game of each season.

    Abstract hook of ``StatefulModel``: the body is a bare ``...``
    placeholder.  The template ``fit`` invokes it whenever a game's season
    differs from the previous game's, before that game is passed to
    ``update``; implementations reset or prepare state for the new season.
    """
    ...
# get_state method · python · L191-L193 (3 LOC)src/ncaa_eval/model/base.py
def get_state(self) -> dict[str, Any]:
    """Return a serialisable snapshot of internal ratings.

    Abstract hook of ``StatefulModel``: the body is a bare ``...``
    placeholder.  The snapshot is the counterpart of ``set_state``, which
    restores from it.
    """
    ...
# All rows scored by the Repobility analyzer (https://repobility.com)
set_state method · python · L196-L198 (3 LOC)src/ncaa_eval/model/base.py
def set_state(self, state: dict[str, Any]) -> None:
    """Restore internal ratings from a snapshot.

    Abstract hook of ``StatefulModel``: the body is a bare ``...``
    placeholder.  *state* is expected to have the shape produced by
    ``get_state``.
    """
    ...
# EloModelConfig class · python · L22-L37 (16 LOC)src/ncaa_eval/model/elo.py
class EloModelConfig(ModelConfig):
    """Pydantic configuration for the Elo model.

    Fields and defaults mirror :class:`~ncaa_eval.transform.elo.EloConfig`,
    which is the authority on what each parameter means.  The per-field notes
    below are a reviewer's reading of the names and are marked as assumptions.
    """

    # Fixed discriminator value for this config type.
    model_name: Literal["elo"] = "elo"
    # Presumably the rating given to a team with no history — TODO confirm.
    initial_rating: float = 1500.0
    # Presumably K-factors for early-season, regular-season and tournament
    # games respectively — TODO confirm against EloConfig.
    k_early: float = 56.0
    k_regular: float = 38.0
    k_tournament: float = 47.5
    # Presumably the game count below which ``k_early`` applies — TODO confirm.
    early_game_threshold: int = 20
    # Presumably control margin-of-victory scaling and its cap — TODO confirm.
    margin_exponent: float = 0.85
    max_margin: int = 25
    # Presumably the home-court bonus in Elo points — TODO confirm.
    home_advantage_elo: float = 3.5
    # Presumably the between-season regression toward the mean — TODO confirm.
    mean_reversion_fraction: float = 0.25
# EloModel class · python · L41-L187 (147 LOC)src/ncaa_eval/model/elo.py
class EloModel(StatefulModel):
"""Elo rating model wrapping :class:`EloFeatureEngine`."""
def __init__(self, config: EloModelConfig | None = None) -> None:
self._config = config or EloModelConfig()
self._engine = EloFeatureEngine(self._to_elo_config(self._config))
# ------------------------------------------------------------------
# StatefulModel abstract hooks
# ------------------------------------------------------------------
def update(self, game: Game) -> None:
"""Delegate game processing to the engine."""
self._engine.update_game(
w_team_id=game.w_team_id,
l_team_id=game.l_team_id,
w_score=game.w_score,
l_score=game.l_score,
loc=game.loc,
is_tournament=game.is_tournament,
num_ot=game.num_ot,
)
def start_season(self, season: int) -> None:
"""Delegate season transition to the engine."""
self._engine.start_new_se__init__ method · python · L44-L46 (3 LOC)src/ncaa_eval/model/elo.py
def __init__(self, config: EloModelConfig | None = None) -> None:
    """Create the model together with the rating engine it wraps.

    Parameters
    ----------
    config
        Elo hyperparameters.  A default ``EloModelConfig()`` is used when the
        argument is omitted or falsy.
    """
    resolved = config if config else EloModelConfig()
    self._config = resolved
    engine_config = self._to_elo_config(self._config)
    self._engine = EloFeatureEngine(engine_config)
# update method · python · L52-L62 (11 LOC)src/ncaa_eval/model/elo.py
def update(self, game: Game) -> None:
    """Forward one finished game to the engine's ``update_game``."""
    forwarded_fields = (
        "w_team_id",
        "l_team_id",
        "w_score",
        "l_score",
        "loc",
        "is_tournament",
        "num_ot",
    )
    # Every engine keyword is named after the Game attribute it comes from.
    engine_kwargs = {name: getattr(game, name) for name in forwarded_fields}
    self._engine.update_game(**engine_kwargs)
# start_season method · python · L64-L66 (3 LOC)src/ncaa_eval/model/elo.py
def start_season(self, season: int) -> None:
    """Tell the engine that *season* is starting."""
    engine = self._engine
    engine.start_new_season(season)
# _predict_one method · python · L68-L72 (5 LOC)src/ncaa_eval/model/elo.py
def _predict_one(self, team_a_id: int, team_b_id: int) -> float:
    """Compute P(team_a wins) from the two teams' current engine ratings."""
    rating_of = self._engine.get_rating
    rating_a = rating_of(team_a_id)
    rating_b = rating_of(team_b_id)
    return EloFeatureEngine.expected_score(rating_a, rating_b)
# get_state method · python · L74-L79 (6 LOC)src/ncaa_eval/model/elo.py
def get_state(self) -> dict[str, Any]:
    """Snapshot the engine's ratings and per-team game counts.

    The game counts are copied into a fresh ``dict`` so the snapshot does not
    alias the engine's own mapping.
    """
    engine = self._engine
    snapshot: dict[str, Any] = {}
    snapshot["ratings"] = engine.get_all_ratings()
    snapshot["game_counts"] = dict(engine._game_counts)
    return snapshot
# Want this analysis on your repo? https://repobility.com/scan/
set_state method · python · L81-L116 (36 LOC)src/ncaa_eval/model/elo.py
def set_state(self, state: dict[str, Any]) -> None:
"""Restore ratings and game counts from a snapshot.
Parameters
----------
state
Must contain ``"ratings"`` (``dict[int, float]``) and
``"game_counts"`` (``dict[int, int]``) keys, as returned by
:meth:`get_state`. Keys may be ``int`` or ``str``; string keys
are coerced to ``int`` so that JSON-decoded dicts (where all keys
are strings) work correctly without silent rating loss.
Raises
------
KeyError
If ``"ratings"`` or ``"game_counts"`` keys are absent.
TypeError
If either value is not a ``dict``.
"""
if "ratings" not in state or "game_counts" not in state:
missing = {"ratings", "game_counts"} - state.keys()
msg = f"set_state() state dict missing required keys: {missing}"
raise KeyError(msg)
ratings = state["ratings"]
save method · python · L122-L132 (11 LOC)src/ncaa_eval/model/elo.py
def save(self, path: Path) -> None:
    """Write ``config.json`` and ``state.json`` into the *path* directory.

    The directory, including any missing parents, is created first.
    """
    path.mkdir(parents=True, exist_ok=True)
    config_file = path / "config.json"
    config_file.write_text(self._config.model_dump_json())

    snapshot = self.get_state()
    # JSON object keys are always strings, so stringify the team IDs up front.
    payload = {
        section: {str(team_id): value for team_id, value in snapshot[section].items()}
        for section in ("ratings", "game_counts")
    }
    state_file = path / "state.json"
    state_file.write_text(json.dumps(payload))
# load method · python · L135-L162 (28 LOC)src/ncaa_eval/model/elo.py
def load(cls, path: Path) -> Self:
"""Reconstruct an EloModel from a saved directory.
Raises
------
FileNotFoundError
If either ``config.json`` or ``state.json`` is missing. A missing
file indicates an incomplete :meth:`save` (e.g., interrupted write).
"""
config_path = path / "config.json"
state_path = path / "state.json"
missing = [p for p in (config_path, state_path) if not p.exists()]
if missing:
missing_names = ", ".join(p.name for p in missing)
msg = (
f"Incomplete save at {path!r}: missing {missing_names}. "
"The save may have been interrupted."
)
raise FileNotFoundError(msg)
config = EloModelConfig.model_validate_json(config_path.read_text())
instance = cls(config)
raw = json.loads(state_path.read_text())
state = {
"ratings": {int(k): v for k, v in raw["raget_config method · python · L168-L170 (3 LOC)src/ncaa_eval/model/elo.py
def get_config(self) -> EloModelConfig:
    """Expose the configuration object held by this model."""
    current_config = self._config
    return current_config
# _to_elo_config method · python · L177-L187 (11 LOC)src/ncaa_eval/model/elo.py
def _to_elo_config(config: EloModelConfig) -> EloConfig:
    """Translate the Pydantic config into the ``EloConfig`` the engine expects.

    The accepted argument names are read from ``EloConfig`` at runtime with
    :func:`dataclasses.fields`; every dumped config value whose key matches
    one of those names is forwarded, and all other keys are dropped.  No
    field list is maintained here, but ``EloModelConfig`` must keep its
    fields in sync with ``EloConfig`` for a value to be forwarded.
    """
    accepted = {spec.name for spec in dataclasses.fields(EloConfig)}
    engine_kwargs = {}
    for key, value in config.model_dump().items():
        if key in accepted:
            engine_kwargs[key] = value
    return EloConfig(**engine_kwargs)
# LogisticRegressionConfig class · python · L20-L25 (6 LOC)src/ncaa_eval/model/logistic_regression.py
class LogisticRegressionConfig(ModelConfig):
    """Hyperparameters for the logistic regression test fixture.

    Both hyperparameters are passed straight to sklearn's
    ``LogisticRegression`` by ``LogisticRegressionModel.__init__``.
    """

    # Fixed discriminator value for this config type.
    model_name: Literal["logistic_regression"] = "logistic_regression"
    # Forwarded as ``LogisticRegression(C=...)``.
    C: float = 1.0  # noqa: N815 — sklearn convention
    # Forwarded as ``LogisticRegression(max_iter=...)``.
    max_iter: int = 200
# LogisticRegressionModel class · python · L29-L56 (28 LOC)src/ncaa_eval/model/logistic_regression.py
class LogisticRegressionModel(Model):
"""Thin wrapper around sklearn ``LogisticRegression``."""
def __init__(self, config: LogisticRegressionConfig | None = None) -> None:
self._config = config or LogisticRegressionConfig()
self._clf = LogisticRegression(C=self._config.C, max_iter=self._config.max_iter)
def fit(self, X: pd.DataFrame, y: pd.Series) -> None:
self._clf.fit(X, y)
def predict_proba(self, X: pd.DataFrame) -> pd.Series:
probs = self._clf.predict_proba(X)[:, 1]
return pd.Series(probs, index=X.index)
def save(self, path: Path) -> None:
path.mkdir(parents=True, exist_ok=True)
joblib.dump(self._clf, path / "model.joblib")
(path / "config.json").write_text(self._config.model_dump_json())
@classmethod
def load(cls, path: Path) -> Self:
config = LogisticRegressionConfig.model_validate_json((path / "config.json").read_text())
instance = cls(config)
instance._clf = __init__ method · python · L32-L34 (3 LOC)src/ncaa_eval/model/logistic_regression.py
def __init__(self, config: LogisticRegressionConfig | None = None) -> None:
    """Build the wrapped sklearn classifier from *config*.

    A default ``LogisticRegressionConfig()`` is used when *config* is omitted
    or falsy.
    """
    resolved = config if config else LogisticRegressionConfig()
    self._config = resolved
    hyperparameters = {"C": self._config.C, "max_iter": self._config.max_iter}
    self._clf = LogisticRegression(**hyperparameters)
# About: code-quality intelligence by Repobility · https://repobility.com
predict_proba method · python · L39-L41 (3 LOC)src/ncaa_eval/model/logistic_regression.py
def predict_proba(self, X: pd.DataFrame) -> pd.Series:
    """Return column 1 of the classifier's probability matrix, indexed like *X*."""
    class_probabilities = self._clf.predict_proba(X)
    second_column = class_probabilities[:, 1]
    return pd.Series(second_column, index=X.index)
# save method · python · L43-L46 (4 LOC)src/ncaa_eval/model/logistic_regression.py
def save(self, path: Path) -> None:
    """Write ``model.joblib`` and ``config.json`` into the *path* directory.

    The directory, including any missing parents, is created first.
    """
    path.mkdir(parents=True, exist_ok=True)
    model_file = path / "model.joblib"
    joblib.dump(self._clf, model_file)
    config_file = path / "config.json"
    config_file.write_text(self._config.model_dump_json())
# load method · python · L49-L53 (5 LOC)src/ncaa_eval/model/logistic_regression.py
def load(cls, path: Path) -> Self:
    """Rebuild a model from the ``config.json`` and ``model.joblib`` in *path*."""
    config_text = (path / "config.json").read_text()
    restored = cls(LogisticRegressionConfig.model_validate_json(config_text))
    # NOTE(review): joblib files are pickle-based, so loading one can execute
    # arbitrary code — only point this at directories you trust.
    restored._clf = joblib.load(path / "model.joblib")
    return restored
# register_model function · python · L23-L40 (18 LOC)src/ncaa_eval/model/registry.py
def register_model(name: str) -> Callable[[_T], _T]:
    """Build a class decorator that files a :class:`Model` subclass under *name*.

    Usage::

        @register_model("elo")
        class EloModel(StatefulModel):
            ...

    Applying the decorator raises :class:`ValueError` when *name* is already
    taken; otherwise the class is stored and returned unchanged.
    """

    def decorator(cls: _T) -> _T:
        if name not in _MODEL_REGISTRY:
            _MODEL_REGISTRY[name] = cls
            return cls
        owner = _MODEL_REGISTRY[name].__name__
        msg = f"Model name {name!r} is already registered to {owner}"
        raise ValueError(msg)

    return decorator
# get_model function · python · L43-L52 (10 LOC)src/ncaa_eval/model/registry.py
def get_model(name: str) -> type[Model]:
    """Look up the model class registered under *name*.

    Raises :class:`ModelNotFoundError`, listing the available names, when
    nothing is registered under *name*.
    """
    if name in _MODEL_REGISTRY:
        return _MODEL_REGISTRY[name]
    msg = f"No model registered with name {name!r}. Available: {list_models()}"
    raise ModelNotFoundError(msg) from None
# list_models function · python · L55-L57 (3 LOC)src/ncaa_eval/model/registry.py
def list_models() -> list[str]:
    """List the names of all registered models in sorted order."""
    names = list(_MODEL_REGISTRY)
    names.sort()
    return names
# ModelRun class · python · L39-L49 (11 LOC)src/ncaa_eval/model/tracking.py
class ModelRun(BaseModel):
    """Metadata for a single model training run."""

    # Unique run identifier; ``RunStore.save_run`` uses it as the run's
    # directory name.
    run_id: str
    # Presumably the model's registered name (e.g. "elo") — TODO confirm.
    model_type: str
    # Hyperparameters of the run as a free-form mapping.
    hyperparameters: dict[str, Any]
    # Defaults to the current UTC time when the instance is created.
    timestamp: datetime = Field(default_factory=lambda: datetime.now(timezone.utc))
    # Presumably the commit hash of the code that produced the run — TODO confirm.
    git_hash: str
    # Presumably the first and last season covered by the run — TODO confirm.
    start_year: int
    end_year: int
    # Presumably the number of predictions stored for the run — TODO confirm.
    prediction_count: int
# Prediction class · python · L52-L60 (9 LOC)src/ncaa_eval/model/tracking.py
class Prediction(BaseModel):
    """A single game-level probability prediction."""

    # Identifier of the run the prediction belongs to (presumably matches
    # ``ModelRun.run_id`` — TODO confirm).
    run_id: str
    # Identifier of the predicted game.
    game_id: str
    # Season the game belongs to.
    season: int
    # IDs of the two teams in the matchup.
    team_a_id: int
    team_b_id: int
    # Predicted win probability (presumably for team_a — TODO confirm);
    # validated to lie within [0.0, 1.0].
    pred_win_prob: Annotated[float, Field(ge=0.0, le=1.0)]
# Generated by Repobility's multi-pass static-analysis pipeline (https://repobility.com)
RunStore class · python · L66-L291 (226 LOC)src/ncaa_eval/model/tracking.py
class RunStore:
"""Persist and load model runs and predictions on the local filesystem.
Directory layout::
base_path/
runs/
<run_id>/
run.json # ModelRun metadata
predictions.parquet # Prediction records (PyArrow)
summary.parquet # BacktestResult.summary (year × metrics)
fold_predictions.parquet # CV fold y_true/y_prob per year
model/ # Trained model artifacts
model.ubj # XGBoost native format (XGBoost only)
model.json # Elo ratings (Elo only)
config.json # Model config
feature_names.json # Feature column names used during training
"""
def __init__(self, base_path: Path) -> None:
self._runs_dir = base_path / "runs"
def save_run(self, run: ModelRun, predictions: list[Prediction]) -save_run method · python · L88-L111 (24 LOC)src/ncaa_eval/model/tracking.py
def save_run(self, run: ModelRun, predictions: list[Prediction]) -> None:
    """Write run metadata (JSON) and predictions (Parquet).

    Creates ``<runs_dir>/<run_id>/`` if needed, then writes ``run.json`` and
    ``predictions.parquet`` inside it.  An empty *predictions* list still
    produces a Parquet file that carries the prediction schema with zero rows.
    """
    run_dir = self._runs_dir / run.run_id
    run_dir.mkdir(parents=True, exist_ok=True)

    # Metadata
    (run_dir / "run.json").write_text(run.model_dump_json(indent=2))

    # Predictions
    if predictions:
        rows = [p.model_dump() for p in predictions]
        table = pa.Table.from_pylist(rows, schema=_PREDICTION_SCHEMA)
    else:
        # Schema.empty_table() yields the zero-row table for the schema
        # directly, replacing a hand-built dict of empty typed arrays.
        table = _PREDICTION_SCHEMA.empty_table()
    pq.write_table(table, run_dir / "predictions.parquet")
# load_run method · python · L113-L125 (13 LOC)src/ncaa_eval/model/tracking.py
def load_run(self, run_id: str) -> ModelRun:
    """Load run metadata from JSON.

    Raises
    ------
    FileNotFoundError
        If the run directory or ``run.json`` does not exist.
    """
    run_json = self._runs_dir / run_id / "run.json"
    # EAFP: read directly instead of checking ``exists()`` first, so a file
    # that vanishes between the check and the read still produces the
    # descriptive error below.  NotADirectoryError covers a path component
    # that is a regular file, which ``exists()`` used to report as missing.
    try:
        raw = run_json.read_text()
    except (FileNotFoundError, NotADirectoryError):
        msg = f"No run found with id {run_id!r} at {run_json}"
        raise FileNotFoundError(msg) from None
    return ModelRun.model_validate_json(raw)