Function bodies 268 total

benchmark_dataloader function · python · L192-L241 (50 LOC)

src/cl/datasets/jax_dataloader.py

def benchmark_dataloader(loader, num_batches: int = 100, warmup: int = 10):
    """Benchmark data loading throughput.

    Measures batches/second and identifies bottlenecks.

    Args:
        loader: DataLoader to benchmark (PyTorch or JAX-wrapped)
        num_batches: Number of batches to measure
        warmup: Number of warmup batches (ignored in timing)

    Returns:
        dict with 'batches_per_sec', 'samples_per_sec', 'avg_batch_time_ms'
    """
    import time

    # Warmup
    iter_loader = iter(loader)
    for _ in range(warmup):
        try:
            _ = next(iter_loader)
        except StopIteration:
            iter_loader = iter(loader)
            _ = next(iter_loader)

    # Benchmark
    start_time = time.perf_counter()
    total_samples = 0

    for i in range(num_batches):
        try:
            batch = next(iter_loader)
            # Get batch size
            if isinstance(batch, tuple):
                x = batch[0]
            else:
                x = bat

get_optimal_prefetch_size function · python · L244-L261 (18 LOC)

src/cl/datasets/jax_dataloader.py

def get_optimal_prefetch_size(batch_time_ms: float, data_load_time_ms: float) -> int:
    """Recommend a prefetch queue depth from compute and load timings.

    The recommendation is ``ceil(data_load_time_ms / batch_time_ms) + 1``,
    clamped so that it is never smaller than 2. The intent is to keep enough
    batches queued that the consumer does not stall waiting for data.

    Args:
        batch_time_ms: Time to process one batch on GPU (ms)
        data_load_time_ms: Time to load one batch from disk (ms)

    Returns:
        Recommended prefetch_size (always >= 2). When ``batch_time_ms`` is
        zero or negative the ratio is undefined, so the minimum of 2 is
        returned as a default.
    """
    import math

    floor_size = 2  # smallest queue depth this function ever recommends

    # Without a positive compute time the ratio cannot be formed.
    if batch_time_ms <= 0:
        return floor_size

    # Number of compute steps that elapse while one batch is being loaded,
    # rounded up, plus one extra slot of headroom.
    steps_per_load = math.ceil(data_load_time_ms / batch_time_ms)
    candidate = steps_per_load + 1

    return candidate if candidate > floor_size else floor_size

MNISTDataset.__init__ method · python · L49-L78 (30 LOC)

src/cl/datasets/mnist.py

    def __init__(self, config: Dict[str, Any]):
        """Initialize the MNIST dataset.

        Args:
            config: Configuration dictionary
        """
        super().__init__(config)

        self._n_tasks = config.get('n_task', 5)
        self.rotation_range = config.get('rotation_range', DEFAULT_ROTATION_RANGE)
        self.scaling_range = config.get('scaling_range', DEFAULT_SCALING_RANGE)
        self.train_split = config.get('train_test_split', DEFAULT_TRAIN_TEST_SPLIT)

        # Load MNIST dataset
        print("Loading MNIST dataset")
        my_transforms = transforms.Compose([transforms.ToTensor()])
        self.dataset = torchvision.datasets.MNIST(
            './data', train=True, download=True, transform=my_transforms
        )

        # Extract images and labels
        [self.images, self.labels] = [list(t) for t in zip(*self.dataset)]
        self.images = torch.stack(self.images, dim=0)
        self.labels = np.array(self.labels)

        # Apply debug limit

MNISTDataset._load_task_data method · python · L80-L118 (39 LOC)

src/cl/datasets/mnist.py

    def _load_task_data(self, task_id: int) -> None:
        """Load data for a specific MNIST task with transforms.

        Applies rotation and scaling transforms based on task_id to create
        distribution shift between tasks.

        Args:
            task_id: Task identifier (0-indexed)
        """
        # Set deterministic seed for reproducible task generation
        # Critical for accurate CL metrics - ensures task data is identical during training and evaluation
        np.random.seed(task_id * DEFAULT_PERMUTATION_SEED_MULTIPLIER)

        X = self.images.clone()
        y = self.labels.copy()

        # Apply task-specific transformations
        rot_angle = np.random.random() * self.rotation_range
        scaling_min, scaling_max = self.scaling_range
        scaling = np.random.random() * (scaling_max - scaling_min) + scaling_min

        X = torchvision.transforms.functional.affine(
            X, rot_angle,
            translate=(scaling, scaling),
            scal

PermutedMNISTDataset.__init__ method · python · L157-L186 (30 LOC)

src/cl/datasets/mnist.py

    def __init__(self, config: Dict[str, Any]):
        """Initialize the Permuted MNIST dataset.

        Args:
            config: Configuration dictionary
        """
        super().__init__(config)

        self._n_tasks = config.get('n_task', 10)
        self.train_split = config.get('train_test_split', DEFAULT_TRAIN_TEST_SPLIT)
        self.seed_multiplier = config.get('permutation_seed_multiplier', DEFAULT_PERMUTATION_SEED_MULTIPLIER)
        self.image_size = config.get('image_size', DEFAULT_INPUT_SIZE_MNIST)

        # Load MNIST dataset
        print("Loading Permuted MNIST dataset")
        my_transforms = transforms.Compose([transforms.ToTensor()])
        self.dataset = torchvision.datasets.MNIST(
            './data', train=True, download=True, transform=my_transforms
        )

        # Extract images and labels
        [self.images, self.labels] = [list(t) for t in zip(*self.dataset)]
        self.images = torch.stack(self.images, dim=0)
        self.labels = np.array

PermutedMNISTDataset._load_task_data method · python · L188-L218 (31 LOC)

src/cl/datasets/mnist.py

    def _load_task_data(self, task_id: int) -> None:
        """Load data for a specific permutation task.

        Applies a task-specific permutation to all pixels.

        Args:
            task_id: Task identifier (0-indexed)
        """
        X = self.images.clone()
        y = self.labels.copy()

        # Generate task-specific permutation
        rng = np.random.RandomState(seed=task_id * self.seed_multiplier)
        perm = rng.permutation(self.image_size * self.image_size)

        # Flatten, permute, reshape
        # X shape: (N, 1, 28, 28) -> flatten to (N, 784) -> permute -> reshape back
        X = X.view(X.shape[0], -1)[:, perm].view(X.shape[0], 1, self.image_size, self.image_size)

        # Use deterministic train/test split for reproducibility
        # Use the same RNG state for consistent splits across training and evaluation
        n_samples = X.shape[0]
        n_train = int(self.train_split * n_samples)

        train_idx = rng.randint(0, n_samples, n_train)

generate_sine_data function · python · L30-L75 (46 LOC)

src/cl/datasets/sine.py

def generate_sine_data(delta: float, n_tasks: int = 40, output_path: str = 'data/Incremental_Sine1e^4.p',
                       seed: int = 1) -> str:
    """Generate sine data for continual learning tasks.

    Creates a pickle file containing sine wave data for multiple tasks.
    Each task has gradually increasing frequency and amplitude.

    Args:
        delta: Perturbation value for gradual task drift
        n_tasks: Number of tasks to generate (default: 40)
        output_path: Path to save the pickle file
        seed: Random seed for reproducibility

    Returns:
        Path to the generated pickle file

    Data format per task:
        (y, time, phase, amplitude, frequency) where:
        - y: Sine wave values, shape (n_samples, n_time_points)
        - time: Time points array
        - phase: Phase values, shape (n_samples, 1)
        - amplitude: Amplitude values, shape (n_samples, 1)
        - frequency: Frequency values, shape (n_samples, 1)
    """
    # Added by Cl

Repobility · code-quality intelligence · https://repobility.com

SineDataset.__init__ method · python · L103-L149 (47 LOC)

src/cl/datasets/sine.py

    def __init__(self, config: Dict[str, Any]):
        """Initialize the sine dataset.

        Args:
            config: Configuration dictionary
        """
        super().__init__(config)

        self.delta = config.get('delta', 0.00001)
        self.data_path = config.get('data_path', 'data/Incremental_Sine1e^4.p')
        self._n_tasks = config.get('n_task', 40)
        self.test_size = config.get('test_size', 0.2)

        # Noise parameters for Experiment 3 (increasing noise per task)
        self.noise_enabled = config.get('noise_enabled', False)
        self.noise_scale = config.get('noise_scale', 0.1)
        self.noise_increment = config.get('noise_increment', 0.05)

        # Generate data if file doesn't exist
        if not os.path.exists(self.data_path):
            # Added by Claude: ensure parent directory exists before generating data
            data_dir = os.path.dirname(self.data_path)
            if data_dir and not os.path.exists(data_dir):
                os.

SineDataset._load_task_data method · python · L151-L186 (36 LOC)

src/cl/datasets/sine.py

    def _load_task_data(self, task_id: int) -> None:
        """Load data for a specific sine task.

        Extracts sine data for the given task and creates train/test splits.
        Features: [phase, amplitude, frequency]
        Target: sine wave values (flattened)

        Args:
            task_id: Task identifier (0-indexed)
        """
        if task_id >= self._available_tasks:
            raise ValueError(f"Task {task_id} not available. Max task: {self._available_tasks - 1}")

        # Extract task data
        y, time, phase, amplitude, frequency = self.raw_data['task' + str(task_id)]

        # Create feature matrix: [phase, amplitude, frequency]
        X = np.concatenate([phase, amplitude.reshape([-1, 1]), frequency.reshape([-1, 1])], axis=1)

        # Flatten y if needed (shape: n_samples x n_time_points -> n_samples)
        # For regression, we typically predict all time points
        # But original code treats y as target directly
        y = y.astype(np.float32)

BaseGraphDataset.__init__ method · python · L54-L85 (32 LOC)