Function bodies 140 total

store method · python · L116-L121 (6 LOC)

backend/scrapers/aqicn_aqi.py

    async def store(self, data: list[dict]) -> None:
        """Insert every validated reading as a row of ``aqi_readings``.

        A no-op when ``data`` is empty or the module-level ``supabase``
        client is ``None``. Rows are written one request at a time; an
        exception from any insert propagates to the caller.
        """
        if not data:
            return
        if supabase is None:
            return

        for aqi_row in data:
            insert_request = supabase.table("aqi_readings").insert(aqi_row)
            insert_request.execute()

        logger.info("aqicn_aqi: stored %d readings", len(data))

BaseScraper class · python · L14-L109 (96 LOC)

backend/scrapers/base.py

class BaseScraper(ABC):
    """Provides retry logic, exponential backoff, and run-logging.

    Subclasses must implement:
        - fetch()   -- retrieve raw data from the source
        - validate() -- clean / validate the raw data
        - store()   -- persist validated data to Supabase
    """

    name: str = "base"
    max_retries: int = 3

    async def run(self) -> bool:
        """Execute the full scrape cycle with retries.

        Returns True on success, False on exhausted retries.
        """
        last_error: Exception | None = None

        for attempt in range(1, self.max_retries + 1):
            try:
                raw = await self.fetch()
                validated = await self.validate(raw)
                await self.store(validated)
                await self.log_success()
                logger.info("%s: completed on attempt %d", self.name, attempt)
                return True
            except Exception as exc:
                last_error = exc

run method · python · L26-L55 (30 LOC)

backend/scrapers/base.py

    async def run(self) -> bool:
        """Execute the full scrape cycle with retries.

        Returns True on success, False on exhausted retries.
        """
        last_error: Exception | None = None

        for attempt in range(1, self.max_retries + 1):
            try:
                raw = await self.fetch()
                validated = await self.validate(raw)
                await self.store(validated)
                await self.log_success()
                logger.info("%s: completed on attempt %d", self.name, attempt)
                return True
            except Exception as exc:
                last_error = exc
                wait = 2 ** attempt  # 2, 4, 8 seconds
                logger.warning(
                    "%s: attempt %d failed (%s), retrying in %ds",
                    self.name,
                    attempt,
                    exc,
                    wait,
                )
                await asyncio.sleep(wait)

        await self.log_failure(str(last

fetch method · python · L62-L64 (3 LOC)

backend/scrapers/base.py

    async def fetch(self) -> Any:
        """Retrieve raw data from the external source.

        Subclass hook. ``run()`` awaits this first on every attempt and
        passes the returned value to ``validate()``. An exception raised
        here is caught by ``run()``, which logs it, sleeps and tries again.

        Returns:
            The raw payload; its shape is defined by the subclass.
        """
        ...

validate method · python · L67-L69 (3 LOC)

backend/scrapers/base.py

    async def validate(self, raw: Any) -> Any:
        """Clean and validate the raw data.

        Subclass hook. ``run()`` awaits this with the value returned by
        ``fetch()`` and passes the result on to ``store()``.

        Args:
            raw: Whatever the subclass's ``fetch()`` returned.

        Returns:
            The cleaned data; its shape is defined by the subclass.
        """
        ...

store method · python · L72-L74 (3 LOC)

backend/scrapers/base.py

    async def store(self, data: Any) -> None:
        """Persist validated data to Supabase.

        Subclass hook. ``run()`` awaits this with the value returned by
        ``validate()`` and ignores the return value. An exception raised
        here is caught by ``run()`` and counts as a failed attempt.

        Args:
            data: Whatever the subclass's ``validate()`` returned.
        """
        ...

log_success method · python · L80-L93 (14 LOC)

backend/scrapers/base.py

    async def log_success(self) -> None:
        """Write a ``success`` row for this scraper to ``scraper_runs``.

        Best-effort bookkeeping: skipped entirely when ``supabase`` is
        ``None``, and any exception raised while inserting is logged as a
        warning rather than propagated.
        """
        if supabase is None:
            return

        try:
            run_record = {
                "scraper_name": self.name,
                "status": "success",
                # Timezone-aware UTC timestamp in ISO-8601 form.
                "run_at": datetime.now(timezone.utc).isoformat(),
            }
            supabase.table("scraper_runs").insert(run_record).execute()
        except Exception as exc:
            logger.warning("Failed to log success for %s: %s", self.name, exc)

Open data scored by Repobility · https://repobility.com

log_failure method · python · L95-L109 (15 LOC)

backend/scrapers/base.py

    async def log_failure(self, error_message: str) -> None:
        """Write a ``failure`` row for this scraper to ``scraper_runs``.

        Best-effort bookkeeping: nothing happens when ``supabase`` is
        ``None``, and an exception from the insert is only logged as a
        warning so that failure-logging can never raise itself.

        Args:
            error_message: Text stored in the row's ``error_message`` field.
        """
        if supabase is not None:
            try:
                failure_row = {
                    "scraper_name": self.name,
                    "status": "failure",
                    "error_message": error_message,
                    # Timezone-aware UTC timestamp in ISO-8601 form.
                    "run_at": datetime.now(timezone.utc).isoformat(),
                }
                supabase.table("scraper_runs").insert(failure_row).execute()
            except Exception as exc:
                logger.warning("Failed to log failure for %s: %s", self.name, exc)

tag_areas function · python · L41-L44 (4 LOC)

backend/scrapers/community.py

def tag_areas(text):
    """Return the IDs of all areas whose keywords occur in ``text``.

    ``text`` is lower-cased and each keyword in ``AREA_KEYWORDS`` is tested
    as a plain substring. Area IDs come back in the mapping's iteration
    order; when nothing matches the result is ``["pune"]``.
    """
    haystack = text.lower()
    matched_ids = []
    for area_id, keywords in AREA_KEYWORDS.items():
        for keyword in keywords:
            if keyword in haystack:
                matched_ids.append(area_id)
                break  # one hit is enough for this area
    if matched_ids:
        return matched_ids
    return ["pune"]

categorize function · python · L47-L53 (7 LOC)

backend/scrapers/community.py

def categorize(text):
    """Return a category label for ``text`` based on keyword substrings.

    The text is lower-cased and checked against each keyword group in
    order; the first group containing a keyword that occurs anywhere in the
    text (plain substring match) decides the label. Falls back to
    ``"Community"`` when no keyword is found.
    """
    lowered = text.lower()
    # Ordered by priority: earlier groups win when several would match.
    keyword_groups = (
        ("Transport", ("traffic", "road", "metro", "bus")),
        ("Essential", ("water", "power", "pmc", "msedcl")),
        ("Food", ("food", "restaurant", "cafe")),
        ("Safety", ("crime", "police", "theft")),
    )
    for category, keywords in keyword_groups:
        for keyword in keywords:
            if keyword in lowered:
                return category
    return "Community"

CommunityScraper class · python · L56-L128 (73 LOC)

backend/scrapers/community.py

class CommunityScraper(BaseScraper):
    name = "community"
    max_retries = 3

    async def fetch(self):
        results = []
        async with aiohttp.ClientSession() as session:
            for feed in COMMUNITY_FEEDS:
                url = feed["url"]
                try:
                    async with session.get(url, headers={"User-Agent": "Sajaag/0.1"}, timeout=aiohttp.ClientTimeout(total=15)) as resp:
                        if resp.status == 200:
                            results.append({"xml": await resp.text(), "config": feed})
                        elif feed["platform"] == "twitter" and "rsshub.app" in url:
                            for alt in RSSHUB_FALLBACKS:
                                try:
                                    alt_url = url.replace("https://rsshub.app", alt)
                                    async with session.get(alt_url, timeout=aiohttp.ClientTimeout(total=10)) as r:
                                        if r.status == 200:

fetch method · python · L60-L82 (23 LOC)

backend/scrapers/community.py

    async def fetch(self):
        results = []
        async with aiohttp.ClientSession() as session:
            for feed in COMMUNITY_FEEDS:
                url = feed["url"]
                try:
                    async with session.get(url, headers={"User-Agent": "Sajaag/0.1"}, timeout=aiohttp.ClientTimeout(total=15)) as resp:
                        if resp.status == 200:
                            results.append({"xml": await resp.text(), "config": feed})
                        elif feed["platform"] == "twitter" and "rsshub.app" in url:
                            for alt in RSSHUB_FALLBACKS:
                                try:
                                    alt_url = url.replace("https://rsshub.app", alt)
                                    async with session.get(alt_url, timeout=aiohttp.ClientTimeout(total=10)) as r:
                                        if r.status == 200:
                                            results.append({"xml": await r.text(), "config":

validate method · python · L84-L118 (35 LOC)

backend/scrapers/community.py

    async def validate(self, raw):
        posts = []
        seen = set()
        now = datetime.now(timezone.utc).isoformat()
        for feed_data in raw:
            config = feed_data["config"]
            parsed = feedparser.parse(feed_data["xml"])
            for entry in parsed.entries[:15]:
                title = entry.get("title", "").strip()
                if not title:
                    continue
                pid = md5(f"{title}:{config['source']}".encode()).hexdigest()[:16]
                if pid in seen:
                    continue
                seen.add(pid)
                body = re.sub(r'<[^>]+>', '', entry.get("summary", "")).strip()[:500]
                pub = now
                if entry.get("published"):
                    try:
                        import email.utils
                        pub = email.utils.parsedate_to_datetime(entry["published"]).isoformat()
                    except Exception:
                        pass
                posts.app

store method · python · L120-L128 (9 LOC)

backend/scrapers/community.py

    async def store(self, data):
        """Upsert validated community posts into ``community_posts``.

        Posts are upserted one at a time, with conflicts resolved on ``id``,
        so that a single bad row does not stop the rest from being written.
        A failed upsert is logged and skipped, never raised. The final log
        line reports how many posts were actually written, and a separate
        warning summarises any failures.

        Args:
            data: Sized collection of post dicts to upsert. Nothing is done
                when it is empty or when ``supabase`` is ``None``.
        """
        if not data or supabase is None:
            return

        stored = 0
        for post in data:
            try:
                supabase.table("community_posts").upsert(post, on_conflict="id").execute()
            except Exception as e:
                # Deliberately best-effort: keep going with the other posts.
                logger.warning("Failed to upsert community post: %s", e)
            else:
                stored += 1

        failed = len(data) - stored
        if failed:
            logger.warning("community: %d of %d posts failed to upsert", failed, len(data))
        # Report the real number written, not the number attempted.
        logger.info("community: stored %d posts", stored)

FuelPriceScraper class · python · L19-L97 (79 LOC)

backend/scrapers/fuel_prices.py

class FuelPriceScraper(BaseScraper):
    name = "fuel_prices"
    max_retries = 3

    async def fetch(self) -> Any:
        async with aiohttp.ClientSession() as session:
            headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"}
            async with session.get(FUEL_URL, headers=headers, timeout=aiohttp.ClientTimeout(total=15)) as resp:
                resp.raise_for_status()
                return await resp.text()

    async def validate(self, raw: Any) -> list[dict]:
        soup = BeautifulSoup(raw, "lxml")
        now = datetime.now(timezone.utc).isoformat()
        today = date.today().isoformat()

        petrol = None
        diesel = None

        # Look for price values in the page
        # GoodReturns typically shows prices in a table or prominent divs
        price_pattern = re.compile(r'₹?\s*(\d+\.\d{2})')

        # Try table-based extraction
        tables = soup.find_all("table")
        for table in tables:
            tex

Provenance: Repobility (https://repobility.com) — every score reproducible from /scan/

fetch method · python · L23-L28 (6 LOC)

backend/scrapers/fuel_prices.py

    async def fetch(self) -> Any:
        """Download the page at ``FUEL_URL`` and return its body as text.

        The request is sent with a browser-like ``User-Agent`` header and a
        15 second total timeout. ``raise_for_status()`` is called on the
        response before the body is read, so its exception propagates to
        the caller.
        """
        request_headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"}
        request_timeout = aiohttp.ClientTimeout(total=15)

        async with aiohttp.ClientSession() as http:
            async with http.get(FUEL_URL, headers=request_headers, timeout=request_timeout) as response:
                response.raise_for_status()
                page_text = await response.text()
                return page_text

validate method · python · L30-L90 (61 LOC)

backend/scrapers/fuel_prices.py

    async def validate(self, raw: Any) -> list[dict]:
        soup = BeautifulSoup(raw, "lxml")
        now = datetime.now(timezone.utc).isoformat()
        today = date.today().isoformat()

        petrol = None
        diesel = None

        # Look for price values in the page
        # GoodReturns typically shows prices in a table or prominent divs
        price_pattern = re.compile(r'₹?\s*(\d+\.\d{2})')

        # Try table-based extraction
        tables = soup.find_all("table")
        for table in tables:
            text = table.get_text().lower()
            if "petrol" in text or "diesel" in text:
                rows = table.find_all("tr")
                for row in rows:
                    cells = [c.get_text(strip=True) for c in row.find_all(["td", "th"])]
                    for i, cell in enumerate(cells):
                        if "petrol" in cell.lower():
                            for c in cells[i+1:]:
                                m = price_pattern.search(c)

store method · python · L92-L97 (6 LOC)

backend/scrapers/fuel_prices.py

    async def store(self, data: list[dict]) -> None:
        """Write validated fuel-price records to the ``fuel_prices`` table.

        Records are inserted one request at a time; an exception from any
        insert propagates to the caller. Nothing is written when ``data`` is
        empty or the module-level ``supabase`` client is ``None``.
        """
        nothing_to_store = not data
        if nothing_to_store or supabase is None:
            return

        for price_row in data:
            insert_request = supabase.table("fuel_prices").insert(price_row)
            insert_request.execute()

        logger.info("fuel_prices: stored %d records", len(data))

match_area function · python · L43-L50 (8 LOC)

backend/scrapers/msedcl_power.py

def match_area(text: str) -> str | None:
    """Return the first area ID whose keyword occurs in ``text``.

    ``text`` is lower-cased and every keyword in ``SUBSTATION_AREA_MAP`` is
    tested as a plain substring, area by area in the mapping's iteration
    order. Returns ``None`` when no keyword is found.
    """
    haystack = text.lower()
    return next(
        (
            area_id
            for area_id, keywords in SUBSTATION_AREA_MAP.items()
            if any(keyword in haystack for keyword in keywords)
        ),
        None,
    )

parse_time function · python · L53-L72 (20 LOC)

backend/scrapers/msedcl_power.py

def parse_time(time_str: str) -> str | None:
    """Try to parse various time formats from MSEDCL pages."""
    time_str = time_str.strip()

    patterns = [
        (r'(\d{1,2}):(\d{2})\s*(am|pm|AM|PM)', '%I:%M %p'),
        (r'(\d{1,2})\s*(am|pm|AM|PM)', '%I %p'),
        (r'(\d{1,2}):(\d{2})', '%H:%M'),
    ]

    for pattern, fmt in patterns:
        match = re.search(pattern, time_str)
        if match:
            try:
                t = datetime.strptime(match.group(0), fmt)
                return t.strftime('%H:%M')
            except ValueError:
                continue

    return None

MsedclPowerScraper class · python · L75-L291 (217 LOC)

backend/scrapers/msedcl_power.py

class MsedclPowerScraper(BaseScraper):
    name = "msedcl_power"
    max_retries = 3

    async def fetch(self) -> Any:
        """Fetch outage pages from MSEDCL."""
        results = {}

        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/120.0.0.0 Safari/537.36",
            "Accept": "text/html,application/xhtml+xml",
        }

        async with aiohttp.ClientSession() as session:
            # Try planned outages page
            for label, url in [("planned", MSEDCL_PLANNED_URL), ("outage", MSEDCL_OUTAGE_URL)]:
                try:
                    async with session.get(url, headers=headers, timeout=aiohttp.ClientTimeout(total=30)) as resp:
                        if resp.status == 200:
                            results[label] = await resp.text()
                        else:
                            logger.warning("MSEDCL %s page returned %d", label, resp.status)
                except Exception as e:

fetch method · python · L79-L123 (45 LOC)

backend/scrapers/msedcl_power.py

    async def fetch(self) -> Any:
        """Fetch outage pages from MSEDCL."""
        results = {}

        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/120.0.0.0 Safari/537.36",
            "Accept": "text/html,application/xhtml+xml",
        }

        async with aiohttp.ClientSession() as session:
            # Try planned outages page
            for label, url in [("planned", MSEDCL_PLANNED_URL), ("outage", MSEDCL_OUTAGE_URL)]:
                try:
                    async with session.get(url, headers=headers, timeout=aiohttp.ClientTimeout(total=30)) as resp:
                        if resp.status == 200:
                            results[label] = await resp.text()
                        else:
                            logger.warning("MSEDCL %s page returned %d", label, resp.status)
                except Exception as e:
                    logger.warning("Failed to fetch MSEDCL %s: %s", label, e)

validate method · python · L125-L281 (157 LOC)

backend/scrapers/msedcl_power.py

    async def validate(self, raw: Any) -> list[dict]:
        """Parse MSEDCL HTML/JSON into power outage records."""
        outages = []
        now = datetime.now(timezone.utc).isoformat()
        today = date.today().isoformat()
        seen = set()

        # Parse JSON API response if available
        if "api" in raw and isinstance(raw["api"], (list, dict)):
            api_data = raw["api"] if isinstance(raw["api"], list) else raw["api"].get("data", [])
            for item in api_data:
                area_text = item.get("area", item.get("location", ""))
                area_id = match_area(area_text)
                if not area_id:
                    continue

                key = f"{area_id}:{item.get('start', '')}:{item.get('end', '')}"
                if key in seen:
                    continue
                seen.add(key)

                outages.append({
                    "area_id": area_id,
                    "date": item.get("date", today),

All rows above produced by Repobility · https://repobility.com

store method · python · L283-L291 (9 LOC)

backend/scrapers/msedcl_power.py

    async def store(self, data: list[dict]) -> None:
        """Persist parsed outage records to the ``power_outages`` table.

        Each record is inserted with its own request; an exception from any
        insert propagates to the caller. Returns immediately when ``data``
        is empty or the module-level ``supabase`` client is ``None``.
        """
        if not data:
            return
        if supabase is None:
            return

        for outage in data:
            insert_request = supabase.table("power_outages").insert(outage)
            insert_request.execute()

        logger.info("msedcl_power: stored %d records", len(data))

detect_severity function · python · L48-L53 (6 LOC)