Function bodies 261 total
CrossLaggedResultsAnalyzer._load_estimated_results method · python · L116-L129 (14 LOC)src/analysis/analyze_crosslagged_results.py
def _load_estimated_results(self, estimated_dir: Path) -> None:
    """Collect posterior summaries written by the estimated lag model.

    Every sub-directory of ``estimated_dir`` is treated as one variable. When
    it contains ``posterior_summary.csv`` the file is read with pandas and
    stored in ``self.estimated_results`` under the sub-directory name. A
    failure while loading is reported on stdout and that variable is skipped.

    Args:
        estimated_dir: Directory whose sub-directories hold per-variable output.
    """
    for entry in estimated_dir.iterdir():
        # Plain files next to the variable folders are ignored.
        if not entry.is_dir():
            continue
        csv_file = entry / "posterior_summary.csv"
        if not csv_file.exists():
            continue
        try:
            self.estimated_results[entry.name] = pd.read_csv(csv_file)
        except Exception as e:
            # Best effort: one unreadable summary must not stop the others.
            print(f" Error loading {csv_file}: {e}")
# CrossLaggedResultsAnalyzer._load_cumulative_results method · python · L131-L144 (14 LOC)src/analysis/analyze_crosslagged_results.py
def _load_cumulative_results(self, cumulative_dir: Path) -> None:
    """Read the per-variable posterior summaries of the cumulative lag model.

    Each sub-directory of ``cumulative_dir`` that contains a
    ``posterior_summary.csv`` contributes one DataFrame to
    ``self.cumulative_results``, keyed by the sub-directory name. Loading
    errors are printed and the affected variable is left out.

    Args:
        cumulative_dir: Directory whose sub-directories hold per-variable output.
    """
    variable_dirs = (child for child in cumulative_dir.iterdir() if child.is_dir())
    for variable_dir in variable_dirs:
        summary_file = variable_dir / "posterior_summary.csv"
        if summary_file.exists():
            try:
                loaded = pd.read_csv(summary_file)
                self.cumulative_results[variable_dir.name] = loaded
            except Exception as e:
                # Report and carry on with the remaining variables.
                print(f" Error loading {summary_file}: {e}")
# CrossLaggedResultsAnalyzer.create_summary_report method · python · L146-L223 (78 LOC)src/analysis/analyze_crosslagged_results.py
def create_summary_report(self) -> pd.DataFrame:
"""Create comprehensive summary report."""
print("\n" + "=" * 70)
print("CREATING SUMMARY REPORT")
print("=" * 70)
if self.comparison_df is None or self.comparison_df.empty:
print("No comparison data available.")
return pd.DataFrame()
# Create summary statistics
summary_rows = []
for var_name in self.comparison_df['variable'].unique():
df_var = self.comparison_df[self.comparison_df['variable'] == var_name]
# Basic statistics
n_models = len(df_var)
n_fixed = len(df_var[df_var['model_type'] == 'fixed'])
n_estimated = len(df_var[df_var['model_type'] == 'estimated'])
n_cumulative = len(df_var[df_var['model_type'] == 'cumulative'])
# Effect sizes
beta_means = df_var['beta_mean'].dropna()
beta_positive = (beta_means > 0).sum()
betCrossLaggedResultsAnalyzer.create_visualizations method · python · L225-L258 (34 LOC)src/analysis/analyze_crosslagged_results.py
def create_visualizations(self) -> None:
"""Create comprehensive visualizations."""
print("\n" + "=" * 70)
print("CREATING VISUALIZATIONS")
print("=" * 70)
if self.comparison_df is None or self.comparison_df.empty:
print("No data available for visualizations.")
return
# Set style
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette("husl")
# Create visualization directory
viz_dir = self.results_dir / "analysis_visualizations"
viz_dir.mkdir(exist_ok=True)
# 1. Effect size comparison across variables
self._plot_effect_sizes(viz_dir)
# 2. Lag-effect relationship for fixed models
self._plot_lag_effects(viz_dir)
# 3. Model comparison by WAIC
self._plot_model_comparison(viz_dir)
# 4. Credible interval widths
self._plot_ci_widths(viz_dir)
# 5. Posterior distributions for key parameters
selCrossLaggedResultsAnalyzer._plot_effect_sizes method · python · L260-L322 (63 LOC)src/analysis/analyze_crosslagged_results.py
def _plot_effect_sizes(self, viz_dir: Path) -> None:
"""Plot effect sizes across variables and models."""
df = self.comparison_df
plt.figure(figsize=(14, 8))
# Create grouped bar chart
variables = df['variable'].unique()
model_types = df['model_type'].unique()
x_pos = np.arange(len(variables))
bar_width = 0.8 / len(model_types)
for i, model_type in enumerate(model_types):
df_model = df[df['model_type'] == model_type]
# Align by variable
beta_means = []
beta_errors_low = []
beta_errors_high = []
for var_name in variables:
df_var = df_model[df_model['variable'] == var_name]
if not df_var.empty:
# For fixed models, take average across lags
if model_type == 'fixed':
beta_mean = df_var['beta_mean'].mean()
beta_ci_low = dfCrossLaggedResultsAnalyzer._plot_lag_effects method · python · L324-L358 (35 LOC)src/analysis/analyze_crosslagged_results.py
def _plot_lag_effects(self, viz_dir: Path) -> None:
"""Plot lag-effect relationship for fixed models."""
df_fixed = self.comparison_df[self.comparison_df['model_type'] == 'fixed']
if df_fixed.empty:
return
plt.figure(figsize=(12, 8))
# Plot each variable separately
variables = df_fixed['variable'].unique()
for var_name in variables:
df_var = df_fixed[df_fixed['variable'] == var_name]
# Sort by lag
df_var = df_var.sort_values('lag_value')
# Plot with error bars
plt.errorbar(df_var['lag_value'], df_var['beta_mean'],
yerr=[df_var['beta_mean'] - df_var['beta_ci_low'],
df_var['beta_ci_high'] - df_var['beta_mean']],
fmt='o-', capsize=5, capthick=2, linewidth=2,
label=var_name, alpha=0.8, markersize=8)
plt.axhline(y=0, color='black', linestyle='--'CrossLaggedResultsAnalyzer._plot_model_comparison method · python · L360-L401 (42 LOC)src/analysis/analyze_crosslagged_results.py
def _plot_model_comparison(self, viz_dir: Path) -> None:
"""Plot model comparison by WAIC."""
df = self.comparison_df
if df['waic'].isna().all():
return
plt.figure(figsize=(12, 8))
# Create pivot table for WAIC
df_waic = df.pivot_table(
index='variable',
columns='model_type',
values='waic',
aggfunc='first'
)
# Plot WAIC as heatmap
plt.imshow(df_waic.values, cmap='viridis', aspect='auto')
plt.colorbar(label='WAIC (lower is better)')
# Add labels
plt.xticks(range(len(df_waic.columns)), df_waic.columns, rotation=45)
plt.yticks(range(len(df_waic.index)), df_waic.index)
plt.xlabel('Model Type', fontsize=12)
plt.ylabel('Workout Variable', fontsize=12)
plt.title('Model Comparison by WAIC', fontsize=14, fontweight='bold')
# Add text values
for i in range(len(df_waic.index)):
Source: Repobility analyzer · https://repobility.com
CrossLaggedResultsAnalyzer._plot_ci_widths method · python · L403-L426 (24 LOC)src/analysis/analyze_crosslagged_results.py
def _plot_ci_widths(self, viz_dir: Path) -> None:
    """Plot the mean credible-interval width per variable and model type.

    The width is ``beta_ci_high - beta_ci_low``. It is averaged over every
    ``(variable, model_type)`` pair of ``self.comparison_df`` and drawn as a
    grouped bar chart saved to ``viz_dir / "ci_widths_comparison.png"``.

    Args:
        viz_dir: Directory that receives the PNG file.
    """
    df = self.comparison_df
    # NOTE: this writes the column onto the shared comparison frame; it is
    # left that way in case other code reads ``ci_width`` afterwards.
    df['ci_width'] = df['beta_ci_high'] - df['beta_ci_low']
    # Rows are variables, columns are model types.
    df_grouped = df.groupby(['variable', 'model_type'])['ci_width'].mean().unstack()

    # Draw on an explicit Axes. DataFrame.plot without ``ax`` opens a figure
    # of its own, so a separately created figure would never be closed.
    fig, ax = plt.subplots(figsize=(12, 8))
    out_path = viz_dir / "ci_widths_comparison.png"
    try:
        df_grouped.plot(kind='bar', ax=ax)
        ax.set_xlabel('Workout Variable', fontsize=12)
        ax.set_ylabel('95% Credible Interval Width', fontsize=12)
        ax.set_title('Uncertainty in Effect Estimates by Variable and Model',
                     fontsize=14, fontweight='bold')
        ax.legend(title='Model Type')
        ax.grid(True, alpha=0.3, axis='y')
        fig.tight_layout()
        fig.savefig(out_path, dpi=150)
    finally:
        # Release the figure even when plotting or saving fails.
        plt.close(fig)
    print(f" Saved CI widths plot: {out_path}")
# CrossLaggedResultsAnalyzer._plot_posterior_distributions method · python · L428-L437 (10 LOC)src/analysis/analyze_crosslagged_results.py
def _plot_posterior_distributions(self, viz_dir: Path) -> None:
    """Placeholder for posterior distribution plots.

    No figure is produced. When at least one of the fixed, estimated or
    cumulative result collections is non-empty, two notes are printed saying
    that full posterior samples are required. ``viz_dir`` is not used.

    Args:
        viz_dir: Visualization directory (currently unused).
    """
    have_results = bool(
        self.fixed_results or self.estimated_results or self.cumulative_results
    )
    if have_results:
        # Only summary statistics are available here, not the posterior draws.
        print(" Note: Full posterior distribution plots require loading posterior samples.")
        print(" Consider running with --load-samples flag when samples are available.")
# CrossLaggedResultsAnalyzer.generate_html_report method · python · L439-L547 (109 LOC)src/analysis/analyze_crosslagged_results.py
def generate_html_report(self) -> None:
"""Generate HTML report with interactive visualizations."""
print("\n" + "=" * 70)
print("GENERATING HTML REPORT")
print("=" * 70)
if self.comparison_df is None or self.comparison_df.empty:
print("No data available for HTML report.")
return
# Create HTML report
report_path = self.results_dir / "analysis_report.html"
html_content = f"""
<!DOCTYPE html>
<html>
<head>
<title>Cross-Lagged Model Analysis Report</title>
<style>
body {{ font-family: Arial, sans-serif; margin: 40px; }}
h1 {{ color: #333; border-bottom: 2px solid #333; padding-bottom: 10px; }}
h2 {{ color: #555; margin-top: 30px; }}
table {{ border-collapse: collapse; width: 100%; margin: 20px 0; }}
th, td {{ border: 1px solid #ddd; padding: 12px; text-align: left; }}
CrossLaggedResultsAnalyzer.run_analysis method · python · L549-L581 (33 LOC)src/analysis/analyze_crosslagged_results.py
def run_analysis(self, load_individual: bool = False) -> None:
    """Execute every stage of the results analysis in order.

    Stages: load the comparison tables, optionally load the individual model
    results, build the summary report, create the visualizations and generate
    the HTML report. Progress banners are printed before and after.

    Args:
        load_individual: Whether to load individual model results (slower).
    """
    rule = "=" * 70

    def _banner(title: str, lead: str = "") -> None:
        # A title framed by two rules; ``lead`` is prepended to the first rule.
        print(lead + rule)
        print(title)
        print(rule)

    _banner("CROSS-LAGGED RESULTS ANALYSIS")
    print(f"Results directory: {self.results_dir}")
    print(rule)

    self.load_comparison_tables()
    if load_individual:
        self.load_individual_results()

    # The summary frame returned here is not used by the later stages.
    self.create_summary_report()
    self.create_visualizations()
    self.generate_html_report()

    _banner("ANALYSIS COMPLETE", lead="\n")
    print(f"Results saved to: {self.results_dir}")
    print(rule)
# main function · python · L584-L607 (24 LOC)src/analysis/analyze_crosslagged_results.py
def main():
    """Command-line entry point: parse the options and run the analyzer.

    Options:
        --results-dir: Directory containing the cross-lagged model results.
        --load-individual: Flag that also loads the individual model results.
    """
    cli = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Analyze and visualize cross-lagged model results",
    )
    # (flag, keyword arguments) pairs, registered in this order.
    option_table = (
        (
            "--results-dir",
            {
                "type": str,
                "default": "output/full_comparison",
                "help": "Directory containing cross-lagged model results",
            },
        ),
        (
            "--load-individual",
            {
                "action": "store_true",
                "help": "Load individual model results (slower but more detailed)",
            },
        ),
    )
    for flag, settings in option_table:
        cli.add_argument(flag, **settings)
    options = cli.parse_args()

    # Build the analyzer for the chosen directory and run the whole pipeline.
    CrossLaggedResultsAnalyzer(options.results_dir).run_analysis(
        load_individual=options.load_individual
    )
# HealthCrossLaggedAnalyzer.__init__ method · python · L69-L144 (76 LOC)src/analysis/analyze_health_crosslagged.py
def __init__(
self,
workout_vars: List[str],
health_metrics: Optional[List[str]] = None,
categories: Optional[List[str]] = None,
data_dir: str = "data",
output_dir: str = "output/health_crosslagged",
fixed_lags: List[float] = [0, 1, 2, 3, 7],
estimated_lag_prior_mean: float = 3.0,
estimated_lag_prior_sd: float = 2.0,
cumulative_window: int = 7,
cumulative_step: float = 1.0,
chains: int = 4,
iter_warmup: int = 500,
iter_sampling: int = 500,
use_sparse: bool = True,
n_inducing_points: int = 50,
skip_plots: bool = False,
force_refit: bool = False,
no_cache: bool = False,
max_metrics: int = 10,
):
"""Initialize analyzer with configuration.
Args:
workout_vars: List of workout/activity variables to analyze.
health_metrics: Specific health metrics to analyze (if None, uses categories).
HealthCrossLaggedAnalyzer.load_data method · python · L146-L198 (53 LOC)src/analysis/analyze_health_crosslagged.py
def load_data(self) -> None:
"""Load all required data."""
print("\n" + "=" * 70)
print("LOADING DATA")
print("=" * 70)
# Load weight data
print("\nLoading weight data...")
self.weight_df = load_weight_data(self.data_dir)
print(f" Loaded {len(self.weight_df)} weight measurements")
# Load health metrics
print("\nLoading health metrics...")
self.health_df = load_combined_health_data(self.data_dir)
print(f" Loaded {len(self.health_df)} days of health data")
print(f" Available metrics: {len(self.health_df.columns)}")
# Get available metrics
self.available_metrics = get_available_health_metrics()
# Select health metrics to analyze
self.selected_metrics = self._select_health_metrics()
print(f"\nSelected {len(self.selected_metrics)} health metrics for analysis:")
for metric in self.selected_metrics:
print(f" - {metric}"HealthCrossLaggedAnalyzer._select_health_metrics method · python · L200-L233 (34 LOC)src/analysis/analyze_health_crosslagged.py
def _select_health_metrics(self) -> List[str]:
"""Select health metrics to analyze based on user input."""
all_metrics = []
# Get all metrics from categories if specified
if self.categories:
for category in self.categories:
if category in self.available_metrics.get('categories', {}):
all_metrics.extend(self.available_metrics['categories'][category])
# Add specific metrics if provided
if self.health_metrics:
all_metrics.extend(self.health_metrics)
# If neither categories nor specific metrics provided, use all
if not all_metrics:
for category_metrics in self.available_metrics.get('categories', {}).values():
all_metrics.extend(category_metrics)
# Remove duplicates and limit to max_metrics
unique_metrics = list(dict.fromkeys(all_metrics))
# Filter out non-numeric columns and date columns
numeric_metrWant this analysis on your repo? https://repobility.com/scan/
HealthCrossLaggedAnalyzer._merge_data method · python · L235-L280 (46 LOC)src/analysis/analyze_health_crosslagged.py
def _merge_data(self) -> None:
"""Merge weight, workout, and health data."""
print("\nMerging data...")
# Start with weight data
self.merged_df = self.weight_df.copy()
# Add workout aggregates for each variable
for workout_var in self.workout_vars:
if workout_var in self.workout_data:
# Get workout data for this variable
workout_df = self.workout_data[workout_var].copy()
# Rename 'workout_count' to the variable name
workout_df = workout_df.rename(columns={'workout_count': workout_var})
# Merge workout data
self.merged_df = pd.merge(
self.merged_df,
workout_df[['date', workout_var]],
on='date',
how='left'
)
# Fill missing with 0
self.merged_df[workout_var] = self.merged_df[workout_var].fillna(0)
HealthCrossLaggedAnalyzer.run_analysis method · python · L282-L309 (28 LOC)src/analysis/analyze_health_crosslagged.py
def run_analysis(self) -> None:
"""Run cross-lagged analysis for all workout-health metric pairs."""
print("\n" + "=" * 70)
print("RUNNING CROSS-LAGGED ANALYSIS")
print("=" * 70)
total_analyses = len(self.workout_vars) * len(self.selected_metrics)
print(f"\nTotal analyses to run: {total_analyses}")
print(f"Workout variables: {self.workout_vars}")
print(f"Health metrics: {self.selected_metrics}")
analysis_count = 0
for workout_var in self.workout_vars:
for health_metric in self.selected_metrics:
analysis_count += 1
print(f"\n{'='*60}")
print(f"Analysis {analysis_count}/{total_analyses}: {workout_var} → {health_metric}")
print(f"{'='*60}")
try:
result = self._analyze_pair(workout_var, health_metric)
self.results[f"{workout_var}_{health_metric}"] = result
eHealthCrossLaggedAnalyzer._analyze_pair method · python · L311-L406 (96 LOC)src/analysis/analyze_health_crosslagged.py
def _analyze_pair(self, workout_var: str, health_metric: str) -> Dict[str, Any]:
"""Analyze cross-lagged effect for a single workout-health metric pair."""
# Prepare data for this pair
pair_df = self.merged_df[['date', workout_var, health_metric]].copy()
pair_df = pair_df.dropna(subset=[workout_var, health_metric])
if len(pair_df) < 50:
raise ValueError(f"Insufficient data: only {len(pair_df)} complete observations")
print(f" Data: {len(pair_df)} complete observations")
print(f" Workout days: {(pair_df[workout_var] > 0).sum()}")
print(f" Health metric range: [{pair_df[health_metric].min():.2f}, {pair_df[health_metric].max():.2f}]")
# Create output directory for this pair
pair_dir = self.output_dir / f"{workout_var}_{health_metric}"
pair_dir.mkdir(parents=True, exist_ok=True)
# Run fixed lag comparison
print(f" Running fixed lag comparison...")
fixed_rHealthCrossLaggedAnalyzer.generate_summary_report method · python · L408-L485 (78 LOC)src/analysis/analyze_health_crosslagged.py
def generate_summary_report(self) -> None:
"""Generate summary report of all analyses."""
print("\n" + "=" * 70)
print("GENERATING SUMMARY REPORT")
print("=" * 70)
if not self.results:
print("No results to summarize")
return
# Create summary dataframe
summary_rows = []
for key, result in self.results.items():
# Extract key metrics
workout_var = result['workout_var']
health_metric = result['health_metric']
# Get effect sizes from different models
fixed_betas = []
if result['fixed_results']:
for lag_result in result['fixed_results'].values():
if 'beta_mean' in lag_result:
fixed_betas.append(lag_result['beta_mean'])
estimated_beta = None
if result['estimated_results'] and 'beta_mean' in result['estimated_results']:
estimatedHealthCrossLaggedAnalyzer._generate_html_report method · python · L487-L593 (107 LOC)src/analysis/analyze_health_crosslagged.py
def _generate_html_report(self, summary_df: pd.DataFrame) -> None:
"""Generate HTML report of findings."""
html_content = f"""
<!DOCTYPE html>
<html>
<head>
<title>Health Cross-Lagged Analysis Report</title>
<style>
body {{ font-family: Arial, sans-serif; margin: 40px; }}
h1 {{ color: #333; border-bottom: 2px solid #333; padding-bottom: 10px; }}
h2 {{ color: #555; margin-top: 30px; }}
table {{ border-collapse: collapse; width: 100%; margin: 20px 0; }}
th, td {{ border: 1px solid #ddd; padding: 12px; text-align: left; }}
th {{ background-color: #f2f2f2; font-weight: bold; }}
tr:nth-child(even) {{ background-color: #f9f9f9; }}
.summary {{ background-color: #e8f4f8; padding: 15px; border-radius: 5px; margin: 20px 0; }}
.positive {{ color: green; font-weight: bold; }}
HealthCrossLaggedAnalyzer._print_top_findings method · python · L595-L635 (41 LOC)src/analysis/analyze_health_crosslagged.py
def _print_top_findings(self, summary_df: pd.DataFrame) -> None:
"""Print top findings from analysis."""
print("\n" + "=" * 70)
print("TOP FINDINGS")
print("=" * 70)
if summary_df.empty:
print("No results to analyze")
return
# Find strongest positive effects
positive_effects = summary_df[summary_df['beta_mean'] > 0.1].copy()
if not positive_effects.empty:
positive_effects = positive_effects.sort_values('beta_mean', ascending=False)
print("\nStrongest Positive Effects (workouts increase health metric):")
for _, row in positive_effects.head(5).iterrows():
print(f" {row['workout_var']} → {row['health_metric']}: β = {row['beta_mean']:.3f} ± {row['beta_std']:.3f}")
# Find strongest negative effects
negative_effects = summary_df[summary_df['beta_mean'] < -0.1].copy()
if not negative_effects.empty:
negative_effemain function · python · L638-L805 (168 LOC)src/analysis/analyze_health_crosslagged.py
def main():
"""Main entry point."""
parser = argparse.ArgumentParser(
description="Analyze cross-lagged effects of workouts on health metrics"
)
# Required arguments
parser.add_argument(
"--workout-vars",
type=str,
required=True,
help="Comma-separated list of workout variables to analyze"
)
# Health metric selection
parser.add_argument(
"--health-metrics",
type=str,
default="",
help="Comma-separated list of specific health metrics to analyze"
)
parser.add_argument(
"--categories",
type=str,
default="",
help="Comma-separated list of health metric categories (sleep,stress,heart,activity,respiration,body_battery)"
)
parser.add_argument(
"--max-metrics",
type=int,
default=10,
help="Maximum number of health metrics to analyze (default: 10)"
)
# Model configuration
parser.add_argument(
"-main function · python · L13-L109 (97 LOC)src/analysis/bivariate_evaluation.py
def main():
output_dir = Path("output/bivariate")
output_dir.mkdir(parents=True, exist_ok=True)
print("Fitting bivariate model (weight + resting heart rate)...")
fit, idata, df, stan_data = fit_bivariate_model(
chains=2,
iter_warmup=100,
iter_sampling=100,
cache=False,
force_refit=True,
use_sparse=True,
n_inducing_points=30,
)
print("\n=== Model Summary ===")
print(f"Observations: {len(df)}")
print(f"Date range: {df['date'].min().date()} to {df['date'].max().date()}")
# Extract posterior samples
posterior = idata.posterior
print("\nPosterior variables:", list(posterior.data_vars.keys()))
# Correlation between latent processes
if 'correlation' in posterior:
corr_samples = posterior['correlation'].values.flatten()
print("\nLatent correlation posterior:")
print(f" Mean: {np.mean(corr_samples):.3f}")
print(f" SD: {np.std(corr_samples):.3f}")
Citation: Repobility (2026). State of AI-Generated Code. https://repobility.com/research/
CrossLaggedModelComparison.__init__ method · python · L76-L149 (74 LOC)src/analysis/compare_crosslagged_models.py
def __init__(
self,
workout_vars: List[str],
data_dir: str = "data",
output_dir: str = "output/crosslagged_comparison",
fixed_lags: List[float] = [0, 1, 2, 3, 7],
estimated_lag_prior_mean: float = 3.0,
estimated_lag_prior_sd: float = 2.0,
cumulative_window: int = 7,
cumulative_step: float = 1.0,
chains: int = 4,
iter_warmup: int = 500,
iter_sampling: int = 500,
use_sparse: bool = True,
n_inducing_points: int = 50,
skip_plots: bool = False,
force_refit: bool = False,
no_cache: bool = False,
):
"""Initialize comparison with configuration.
Args:
workout_vars: List of workout/activity variables to analyze.
data_dir: Path to data directory.
output_dir: Directory for output files.
fixed_lags: List of lag values for fixed lag model (days).
estimated_lag_prior_mean: Prior meanCrossLaggedModelComparison.load_data method · python · L151-L184 (34 LOC)src/analysis/compare_crosslagged_models.py
def load_data(self) -> None:
"""Load weight and workout data for all variables."""
print("\n" + "=" * 70)
print("LOADING DATA")
print("=" * 70)
# Load weight data (common for all variables)
print("\n1. Loading weight data...")
self.df_weight = load_weight_data(self.data_dir)
print(f" Weight measurements: {len(self.df_weight)}")
# Load workout data for each variable
for var_name in self.workout_vars:
print(f"\n2. Loading workout data for '{var_name}'...")
df_workouts_raw = load_workout_data(
data_dir=self.data_dir,
activity_type=var_name,
include_exercise_details=False,
)
print(f" Raw workout records: {len(df_workouts_raw)}")
# Aggregate workouts to daily count
df_workouts_agg = prepare_workout_aggregates(
df_workouts_raw,
aggregation="daily",
CrossLaggedModelComparison.run_fixed_lag_analysis method · python · L186-L241 (56 LOC)src/analysis/compare_crosslagged_models.py
def run_fixed_lag_analysis(self) -> None:
"""Run fixed lag model comparison."""
print("\n" + "=" * 70)
print("FIXED LAG MODEL ANALYSIS")
print("=" * 70)
# Create args object for run_fixed_lag_comparison
class Args:
pass
args = Args()
args.data_dir = str(self.data_dir)
args.no_sparse = not self.use_sparse
args.n_inducing_points = self.n_inducing_points
args.chains = self.chains
args.iter_warmup = self.iter_warmup
args.iter_sampling = self.iter_sampling
args.no_cache = self.no_cache
args.force_refit = self.force_refit
args.include_prediction_grid = False
args.prediction_step_days = 1.0
args.skip_plots = self.skip_plots
# Run fixed lag analysis using demo_bivariate function
self.fixed_results = run_fixed_lag_comparison(
args=args,
workout_vars=self.workout_vars,
lag_values=selfCrossLaggedModelComparison.run_estimated_lag_analysis method · python · L243-L301 (59 LOC)src/analysis/compare_crosslagged_models.py
def run_estimated_lag_analysis(self) -> None:
"""Run estimated lag model analysis."""
print("\n" + "=" * 70)
print("ESTIMATED LAG MODEL ANALYSIS")
print("=" * 70)
# Create args object for run_estimated_lag_analysis
class Args:
pass
args = Args()
args.data_dir = str(self.data_dir)
args.no_sparse = not self.use_sparse
args.n_inducing_points = self.n_inducing_points
args.chains = self.chains
args.iter_warmup = self.iter_warmup
args.iter_sampling = self.iter_sampling
args.no_cache = self.no_cache
args.force_refit = self.force_refit
args.include_prediction_grid = False
args.prediction_step_days = 1.0
args.skip_plots = self.skip_plots
args.lag_prior_mean = self.estimated_lag_prior_mean
args.lag_prior_sd = self.estimated_lag_prior_sd
# Run estimated lag analysis using demo_bivariate function
self.estiCrossLaggedModelComparison.run_cumulative_lag_analysis method · python · L303-L359 (57 LOC)src/analysis/compare_crosslagged_models.py
def run_cumulative_lag_analysis(self) -> None:
"""Run cumulative lag model analysis."""
print("\n" + "=" * 70)
print("CUMULATIVE LAG MODEL ANALYSIS")
print("=" * 70)
# Create args object for run_cumulative_lag_analysis
class Args:
pass
args = Args()
args.data_dir = str(self.data_dir)
args.no_sparse = not self.use_sparse
args.n_inducing_points = self.n_inducing_points
args.chains = self.chains
args.iter_warmup = self.iter_warmup
args.iter_sampling = self.iter_sampling
args.no_cache = self.no_cache
args.force_refit = self.force_refit
args.include_prediction_grid = False
args.prediction_step_days = 1.0
args.skip_plots = self.skip_plots
args.lag_window = self.cumulative_window
args.lag_step = self.cumulative_step
# Run cumulative lag analysis using demo_bivariate function
self.cumulative_results = CrossLaggedModelComparison.create_comparison_tables method · python · L361-L413 (53 LOC)src/analysis/compare_crosslagged_models.py
def create_comparison_tables(self) -> pd.DataFrame:
"""Create comparison tables from all model results."""
print("\n" + "=" * 70)
print("CREATING COMPARISON TABLES")
print("=" * 70)
if not self.comparison_results:
print("No results to compare.")
return pd.DataFrame()
# Create DataFrame
df = pd.DataFrame(self.comparison_results)
# Save raw comparison table
df.to_csv(self.output_dir / "model_comparison_raw.csv", index=False)
print(f"Saved raw comparison table: {self.output_dir / 'model_comparison_raw.csv'}")
# Create summary table (best model per variable by WAIC)
summary_rows = []
for var_name in df['variable'].unique():
df_var = df[df['variable'] == var_name]
# Find best model by WAIC (lowest)
if df_var['waic'].notna().any():
best_idx = df_var['waic'].idxmin()
best_row = df_var.loc[beCrossLaggedModelComparison.create_comparison_plots method · python · L415-L541 (127 LOC)src/analysis/compare_crosslagged_models.py
def create_comparison_plots(self, df: pd.DataFrame) -> None:
"""Create comparison visualizations."""
if self.skip_plots or df.empty:
return
print("\n" + "=" * 70)
print("CREATING COMPARISON VISUALIZATIONS")
print("=" * 70)
# Set style
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette("husl")
# 1. β comparison across models for each variable
for var_name in df['variable'].unique():
df_var = df[df['variable'] == var_name]
plt.figure(figsize=(12, 8))
# Separate fixed lag points
df_fixed = df_var[df_var['model_type'] == 'fixed']
df_other = df_var[df_var['model_type'] != 'fixed']
# Plot fixed lag points with line
if not df_fixed.empty:
# Sort by lag
df_fixed = df_fixed.sort_values('lag_value')
plt.errorbar(
df_fixed['lag_value'],
CrossLaggedModelComparison.generate_report method · python · L543-L658 (116 LOC)src/analysis/compare_crosslagged_models.py
def generate_report(self, df: pd.DataFrame) -> None:
"""Generate comprehensive markdown report."""
report_path = self.output_dir / "crosslagged_comparison_report.md"
with open(report_path, 'w') as f:
f.write("# Cross-Lagged Model Comparison Report\n\n")
f.write(f"**Generated**: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
f.write(f"**Workout variables**: {', '.join(self.workout_vars)}\n")
f.write(f"**Weight observations**: {len(self.df_weight)}\n\n")
f.write("## Model Configuration\n\n")
f.write("### Fixed Lag Model\n")
f.write(f"- Lag values: {self.fixed_lags} days\n")
f.write(f"- MCMC: {self.chains} chains, {self.iter_warmup} warmup, {self.iter_sampling} sampling\n\n")
f.write("### Estimated Lag Model\n")
f.write(f"- Lag prior: N({self.estimated_lag_prior_mean}, {self.estimated_lag_prior_sd}²) days\n")
f.write(f"- MCMGenerated by Repobility's multi-pass static-analysis pipeline (https://repobility.com)
CrossLaggedModelComparison.run_comparison method · python · L660-L707 (48 LOC)src/analysis/compare_crosslagged_models.py
def run_comparison(self) -> None:
"""Run full comparison pipeline."""
print("\n" + "=" * 70)
print("CROSS-LAGGED MODEL COMPARISON")
print("=" * 70)
print(f"Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print(f"Workout variables: {', '.join(self.workout_vars)}")
print(f"Output directory: {self.output_dir}")
print(f"MCMC: {self.chains} chains, {self.iter_warmup} warmup, {self.iter_sampling} sampling")
print(f"Sparse GP: {self.use_sparse} ({self.n_inducing_points} inducing points)")
print(f"Fixed lags: {self.fixed_lags}")
print(f"Estimated lag prior: N({self.estimated_lag_prior_mean}, {self.estimated_lag_prior_sd}²) days")
print(f"Cumulative window: {self.cumulative_window} days (step: {self.cumulative_step} days)")
print("=" * 70)
# Load data
self.load_data()
if not self.workout_data:
print("ERROR: No workout data loaded. Exiting.")
main function · python · L710-L861 (152 LOC)src/analysis/compare_crosslagged_models.py
def main():
parser = argparse.ArgumentParser(
description="Compare different cross-lagged modeling approaches",
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
)
# Required arguments
parser.add_argument(
"--workout-vars",
type=str,
required=True,
help="Comma-separated list of workout/activity variables to analyze. "
"Examples: 'strength_training,walking,cycling'"
)
# Model configurations
parser.add_argument(
"--fixed-lags",
type=str,
default="0,1,2,3,7",
help="Comma-separated list of lag values for fixed lag model (days)"
)
parser.add_argument(
"--estimated-lag-prior-mean",
type=float,
default=3.0,
help="Prior mean for estimated lag parameter (days)"
)
parser.add_argument(
"--estimated-lag-prior-sd",
type=float,
default=2.0,
help="Prior standard deviation for estimated lag parprepare_weight_for_simple_gp function · python · L14-L25 (12 LOC)src/analysis/correlate_independent_gps.py
def prepare_weight_for_simple_gp(df_weight: pd.DataFrame) -> pd.DataFrame:
    """Collapse weight measurements to one row per calendar day.

    Args:
        df_weight: Frame with a datetime ``timestamp`` column and a
            ``weight_lbs`` column. It is not modified.

    Returns:
        Frame with columns ``timestamp`` (the day), ``weight`` (daily mean of
        ``weight_lbs``), ``std`` and ``count``.
    """
    # Work on a copy that carries the calendar day of every measurement.
    stamped = df_weight.assign(
        date=lambda frame: pd.to_datetime(frame["timestamp"].dt.date)
    )
    per_day = stamped.groupby("date")["weight_lbs"].agg(["mean", "std", "count"])
    return per_day.reset_index().rename(columns={"date": "timestamp", "mean": "weight"})
# align_to_common_time function · python · L28-L48 (21 LOC)src/analysis/correlate_independent_gps.py
def align_to_common_time(df1, time_col1, df2, time_col2):
"""Align two time series to a common global time zero.
Returns:
df1_aligned, df2_aligned, global_t_min, global_t_max
Each aligned DataFrame has new column 'days_since_global'.
"""
# Convert to datetime if not already
t1 = pd.to_datetime(df1[time_col1])
t2 = pd.to_datetime(df2[time_col2])
global_t_min = min(t1.min(), t2.min())
global_t_max = max(t1.max(), t2.max())
df1_aligned = df1.copy()
df2_aligned = df2.copy()
df1_aligned['days_since_global'] = (t1 - global_t_min).dt.days
df2_aligned['days_since_global'] = (t2 - global_t_min).dt.days
return df1_aligned, df2_aligned, global_t_min, global_t_maxsample_latent_functions_on_grid function · python · L51-L120 (70 LOC)src/analysis/correlate_independent_gps.py
def sample_latent_functions_on_grid(
idata_weight,
idata_other,
stan_data_weight,
stan_data_other,
n_samples: int = 100,
) -> tuple:
"""Sample latent functions on common time grid using f_pred.
Requires that both GPs were fitted with same prediction grid (t_pred).
Returns:
Tuple of (t_grid_days, f_weight_samples, f_other_samples)
where each samples matrix is n_samples x len(t_grid_days)
t_grid_days is absolute days (since global time zero).
"""
# Check if f_pred exists in posterior_predictive or posterior
f_pred_weight = None
f_pred_other = None
if "f_pred" in idata_weight.posterior_predictive:
f_pred_weight = idata_weight.posterior_predictive["f_pred"]
elif "f_pred" in idata_weight.posterior:
f_pred_weight = idata_weight.posterior["f_pred"]
if "f_pred" in idata_other.posterior_predictive:
f_pred_other = idata_other.posterior_predictive["f_pred"]
elif "f_pred" in idata_otcompute_correlation_from_samples function · python · L123-L152 (30 LOC)src/analysis/correlate_independent_gps.py
def compute_correlation_from_samples(
f_weight_samples: np.ndarray,
f_other_samples: np.ndarray,
) -> dict:
"""Compute correlation statistics from latent function samples."""
n_samples = f_weight_samples.shape[0]
correlations = np.zeros(n_samples)
for i in range(n_samples):
# Pearson correlation between two vectors
if np.std(f_weight_samples[i]) > 1e-10 and np.std(f_other_samples[i]) > 1e-10:
corr = np.corrcoef(f_weight_samples[i], f_other_samples[i])[0, 1]
correlations[i] = corr
else:
correlations[i] = np.nan
# Remove NaN
correlations = correlations[~np.isnan(correlations)]
if len(correlations) == 0:
return None
return {
"mean": np.mean(correlations),
"std": np.std(correlations),
"2.5%": np.percentile(correlations, 2.5),
"50%": np.percentile(correlations, 50),
"97.5%": np.percentile(correlations, 97.5),
"samples": correlations,analyze_weight_vo2max function · python · L155-L306 (152 LOC)src/analysis/correlate_independent_gps.py
def analyze_weight_vo2max():
"""Analyze correlation between weight and VO2 max using independent GPs with common prediction grid."""
output_dir = Path("output/independent_gp_correlation")
output_dir.mkdir(parents=True, exist_ok=True)
print("=== Weight vs VO2 Max Correlation Analysis ===")
# Load data
print("\n1. Loading data...")
df_weight_raw = load_weight_data()
df_vo2max = load_vo2max_data()
print(f" Weight: {len(df_weight_raw)} measurements")
print(f" VO2 max: {len(df_vo2max)} measurements")
# Prepare weight data (aggregate to daily)
df_weight_daily = prepare_weight_for_simple_gp(df_weight_raw)
print(f" Weight (daily): {len(df_weight_daily)} days")
# Align both datasets to common global time
print("\n2. Aligning to common global time...")
df_weight_aligned, df_vo2_aligned, global_t_min, global_t_max = align_to_common_time(
df_weight_daily, "timestamp", df_vo2max, "date"
)
print(f" Global tiplot_correlation_matrix function · python · L15-L45 (31 LOC)src/analysis/explore_correlations.py
def plot_correlation_matrix(df: pd.DataFrame, output_dir: Path = Path("output/correlations")):
"""Plot correlation matrix between weight and other variables."""
# Select numeric columns of interest
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
# Filter to relevant metrics (exclude derived columns)
exclude = ["weight_day_of_week", "weight_day_of_year", "weight_variable"]
metric_cols = [c for c in numeric_cols if c not in exclude and not c.startswith("_")]
# Keep top 20 most relevant (prioritize weight columns and key metrics)
weight_cols = [c for c in metric_cols if "weight" in c]
other_cols = [c for c in metric_cols if "weight" not in c]
# Select up to 15 other columns with most non-missing values
other_cols = sorted(other_cols, key=lambda c: df[c].notnull().sum(), reverse=True)[:15]
selected_cols = weight_cols + other_cols
corr_df = df[selected_cols].corr()
# Create figure
plt.figure(figsize=(14, 12)Source: Repobility analyzer · https://repobility.com
plot_scatter_pairs function · python · L48-L102 (55 LOC)src/analysis/explore_correlations.py
def plot_scatter_pairs(df: pd.DataFrame, output_dir: Path = Path("output/correlations")):
"""Create scatter plots of weight vs key metrics."""
key_metrics = [
"resting_heart_rate",
"total_steps",
"active_kilocalories",
"avg_stress_level",
"moderate_intensity_minutes",
"vigorous_intensity_minutes",
"highly_active_seconds",
"min_heart_rate",
"max_heart_rate",
]
# Filter to available columns
key_metrics = [m for m in key_metrics if m in df.columns]
# Create subplot grid
n_cols = 3
n_rows = (len(key_metrics) + n_cols - 1) // n_cols
fig, axes = plt.subplots(n_rows, n_cols, figsize=(5 * n_cols, 4 * n_rows))
axes = axes.flatten() if n_rows > 1 else [axes]
for idx, metric in enumerate(key_metrics):
ax = axes[idx]
# Drop NA pairs
subset = df[["weight_mean", metric]].dropna()
if len(subset) < 2:
ax.text(0.5, 0.5, f"No data for {metric}"plot_time_series function · python · L105-L135 (31 LOC)src/analysis/explore_correlations.py
def plot_time_series(df: pd.DataFrame, output_dir: Path = Path("output/correlations")):
"""Plot time series of weight and key metrics."""
# Select a few key metrics to plot alongside weight
key_metrics = ["resting_heart_rate", "total_steps", "active_kilocalories", "avg_stress_level"]
key_metrics = [m for m in key_metrics if m in df.columns]
fig, axes = plt.subplots(len(key_metrics) + 1, 1, figsize=(14, 3 * (len(key_metrics) + 1)), sharex=True)
# Plot weight
ax = axes[0]
ax.plot(df["date"], df["weight_mean"], label="Weight (mean)", color="blue", linewidth=1.5)
ax.fill_between(df["date"], df["weight_min"], df["weight_max"], alpha=0.2, color="blue", label="Weight range")
ax.set_ylabel("Weight (lbs)")
ax.legend(loc="upper left")
ax.grid(True, alpha=0.3)
# Plot each metric
for idx, metric in enumerate(key_metrics, start=1):
ax = axes[idx]
ax.plot(df["date"], df[metric], label=metric, color=f"C{idx}", linewidth=1.5)
main function · python · L138-L172 (35 LOC)src/analysis/explore_correlations.py
def main():
"""Run all exploration plots."""
output_dir = Path("output/correlations")
output_dir.mkdir(parents=True, exist_ok=True)
print("Loading merged data...")
df = merge_weight_with_daily_metrics()
print(f"Dataset shape: {df.shape}")
print(f"Date range: {df['date'].min()} to {df['date'].max()}")
# Basic statistics
print("\nWeight statistics:")
print(df[["weight_mean", "weight_std", "weight_count"]].describe())
print("\nKey metrics statistics:")
key = ["resting_heart_rate", "total_steps", "active_kilocalories", "avg_stress_level"]
for metric in key:
if metric in df.columns:
print(f"{metric}: mean={df[metric].mean():.2f}, sd={df[metric].std():.2f}")
# Generate plots
print("\nGenerating correlation matrix...")
plot_correlation_matrix(df, output_dir)
print("\nGenerating scatter plots...")
plot_scatter_pairs(df, output_dir)
print("\nGenerating time series plots...")
plot_time_serieHealthWorkoutCorrelationAnalyzer.__init__ method · python · L47-L83 (37 LOC)src/analysis/explore_health_workout_correlations.py
def __init__(
self,
workout_vars: List[str],
max_lag: int = 7,
data_dir: str = "data",
output_dir: str = "output/health_correlations",
max_metrics: int = 20,
min_observations: int = 50,
):
"""Initialize analyzer.
Args:
workout_vars: List of workout variables to analyze.
max_lag: Maximum lag to compute correlations for (0-7 days).
data_dir: Path to data directory.
output_dir: Directory for output files.
max_metrics: Maximum number of health metrics to analyze.
min_observations: Minimum number of observations required.
"""
self.workout_vars = workout_vars
self.max_lag = max_lag
self.data_dir = Path(data_dir)
self.output_dir = Path(output_dir)
self.max_metrics = max_metrics
self.min_observations = min_observations
# Create output directory
self.output_dir.mkdir(parenHealthWorkoutCorrelationAnalyzer.load_data method · python · L85-L130 (46 LOC)src/analysis/explore_health_workout_correlations.py
def load_data(self) -> None:
"""Load all required data."""
print("\n" + "=" * 70)
print("LOADING DATA")
print("=" * 70)
# Load health metrics
print("\nLoading health metrics...")
self.health_df = load_combined_health_data(self.data_dir)
print(f" Loaded {len(self.health_df)} days of health data")
print(f" Available metrics: {len(self.health_df.columns)}")
# Get available metrics
self.available_metrics = get_available_health_metrics()
# Select health metrics to analyze
self._select_health_metrics()
print(f"\nSelected {len(self.selected_metrics)} health metrics for analysis:")
for metric in self.selected_metrics[:10]: # Show first 10
print(f" - {metric}")
if len(self.selected_metrics) > 10:
print(f" ... and {len(self.selected_metrics) - 10} more")
# Load workout data for each variable
for var_name in self.workoHealthWorkoutCorrelationAnalyzer._select_health_metrics method · python · L132-L153 (22 LOC)src/analysis/explore_health_workout_correlations.py
def _select_health_metrics(self) -> None:
    """Pick the health metrics that will be analyzed.

    Reads the metric names listed under ``self.available_metrics['categories']``,
    keeps each name once (first occurrence wins), and retains those that are
    numeric columns of ``self.health_df`` with at least
    ``self.min_observations`` non-missing values. The first
    ``self.max_metrics`` survivors are stored in ``self.selected_metrics``.
    """
    categories = self.available_metrics.get('categories', {})

    # dict.fromkeys drops repeated names while preserving first-seen order
    candidates = list(
        dict.fromkeys(
            name
            for members in categories.values()
            for name in members
        )
    )

    eligible = []
    for name in candidates:
        # Skip names with no matching column in the loaded health data
        if name not in self.health_df.columns:
            continue
        column = self.health_df[name]
        # Non-numeric columns (dates, labels) cannot be correlated
        if not pd.api.types.is_numeric_dtype(column):
            continue
        # Require enough observed values to be worth analyzing
        if column.notna().sum() < self.min_observations:
            continue
        eligible.append(name)

    # Cap the selection at the configured maximum
    self.selected_metrics = eligible[:self.max_metrics]
# HealthWorkoutCorrelationAnalyzer.compute_correlations method · python · L155-L231 (77 LOC)src/analysis/explore_health_workout_correlations.py
def compute_correlations(self) -> None:
"""Compute lagged correlations between workouts and health metrics."""
print("\n" + "=" * 70)
print("COMPUTING CORRELATIONS")
print("=" * 70)
total_analyses = len(self.workout_vars) * len(self.selected_metrics)
print(f"\nTotal analyses to compute: {total_analyses}")
for workout_var in self.workout_vars:
if workout_var not in self.workout_data:
print(f"\nSkipping {workout_var}: no workout data")
continue
print(f"\n{'='*60}")
print(f"Analyzing: {workout_var}")
print(f"{'='*60}")
# Get workout data
workout_df = self.workout_data[workout_var].copy()
workout_df = workout_df.rename(columns={'workout_count': 'workout'})
# Merge with health data
merged_df = pd.merge(
self.health_df[['date'] + self.selected_metrics],
workoHealthWorkoutCorrelationAnalyzer._compute_correlation method · python · L233-L267 (35 LOC)src/analysis/explore_health_workout_correlations.py
def _compute_correlation(
self,
df: pd.DataFrame,
x_col: str,
y_col: str,
lag: int = 0
) -> Tuple[Optional[float], Optional[float]]:
"""Compute correlation between x and y with optional lag.
For lag > 0: x(t) correlated with y(t+lag)
"""
# Create lagged series
if lag == 0:
x_series = df[x_col]
y_series = df[y_col]
else:
# Shift y forward by lag days
x_series = df[x_col].iloc[:-lag] if lag > 0 else df[x_col]
y_series = df[y_col].iloc[lag:] if lag > 0 else df[y_col]
# Align series
aligned_df = pd.DataFrame({
'x': x_series.reset_index(drop=True),
'y': y_series.reset_index(drop=True)
}).dropna()
if len(aligned_df) < self.min_observations:
return None, None
# Compute Pearson correlation
try:
corr, p_value = stats.pearsonr(aligned_df['x'], Want this analysis on your repo? https://repobility.com/scan/
HealthWorkoutCorrelationAnalyzer.generate_summary_report method · python · L269-L328 (60 LOC)src/analysis/explore_health_workout_correlations.py
def generate_summary_report(self) -> None:
"""Generate summary report of correlation findings."""
print("\n" + "=" * 70)
print("GENERATING SUMMARY REPORT")
print("=" * 70)
if not self.correlation_results:
print("No correlation results to summarize")
return
# Create summary dataframe
summary_rows = []
for workout_var, health_results in self.correlation_results.items():
for health_metric, lag_results in health_results.items():
# Find strongest correlation (absolute value)
if not lag_results:
continue
strongest = max(
lag_results.items(),
key=lambda x: abs(x[1]['correlation'])
)
lag, result = strongest
summary_rows.append({
'workout_var': workout_var,
'health_metric': health_metric,
HealthWorkoutCorrelationAnalyzer._generate_html_report method · python · L330-L479 (150 LOC)src/analysis/explore_health_workout_correlations.py
def _generate_html_report(self, summary_df: pd.DataFrame) -> None:
"""Generate HTML report of findings."""
# Group by significance
significant = summary_df[summary_df['significant']]
not_significant = summary_df[~summary_df['significant']]
html_content = f"""
<!DOCTYPE html>
<html>
<head>
<title>Health-Workout Correlation Analysis</title>
<style>
body {{ font-family: Arial, sans-serif; margin: 40px; }}
h1 {{ color: #333; border-bottom: 2px solid #333; padding-bottom: 10px; }}
h2 {{ color: #555; margin-top: 30px; }}
h3 {{ color: #777; margin-top: 20px; }}
table {{ border-collapse: collapse; width: 100%; margin: 20px 0; }}
th, td {{ border: 1px solid #ddd; padding: 12px; text-align: left; }}
th {{ background-color: #f2f2f2; font-weight: bold; }}
tr:nth-child(even) {{ bacHealthWorkoutCorrelationAnalyzer._print_top_findings method · python · L481-L514 (34 LOC)src/analysis/explore_health_workout_correlations.py
def _print_top_findings(self, summary_df: pd.DataFrame) -> None:
"""Print top findings from analysis."""
print("\n" + "=" * 70)
print("TOP FINDINGS")
print("=" * 70)
# Top positive correlations
positive = summary_df[summary_df['correlation'] > 0].copy()
if not positive.empty:
positive = positive.sort_values('correlation', ascending=False)
print("\nStrongest Positive Correlations:")
for _, row in positive.head(5).iterrows():
sig = "**" if row['significant'] else ""
print(f" {row['workout_var']} → {row['health_metric']} (lag {row['strongest_lag']}d): r = {row['correlation']:.3f}{sig}")
# Top negative correlations
negative = summary_df[summary_df['correlation'] < 0].copy()
if not negative.empty:
negative = negative.sort_values('correlation', ascending=True)
print("\nStrongest Negative Correlations:")