Function bodies 529 total

_int_env function · python · L7-L15 (9 LOC)

src/hive_mcp/config.py

def _int_env(key: str, default: int) -> int:
    """Read an integer setting from the environment.

    Args:
        key: Name of the environment variable to read.
        default: Value returned when the variable is unset, empty, or is not
            a valid integer literal.

    Returns:
        The parsed integer, or ``default``.
    """
    raw = os.getenv(key, "")
    if raw:
        try:
            return int(raw)
        except ValueError:
            # Malformed value: fall through to the default below.
            pass
    return default

_float_env function · python · L18-L26 (9 LOC)

src/hive_mcp/config.py

def _float_env(key: str, default: float) -> float:
    """Read a floating-point setting from the environment.

    Args:
        key: Name of the environment variable to read.
        default: Value returned when the variable is unset, empty, or cannot
            be parsed by ``float()``.

    Returns:
        The parsed float, or ``default``.
    """
    raw = os.getenv(key, "")
    try:
        parsed = float(raw) if raw else default
    except ValueError:
        # Unparseable text is treated the same as a missing variable.
        parsed = default
    return parsed

is_consented function · python · L30-L38 (9 LOC)

src/hive_mcp/consent.py

def is_consented() -> bool:
    """Report whether the local consent cache records an agreement.

    Returns:
        ``True`` only when ``CONSENT_FILE`` exists, parses as a JSON object
        and its ``"consented"`` entry is truthy. A missing, unreadable or
        malformed file yields ``False``.
    """
    p = Path(CONSENT_FILE)
    if not p.exists():
        return False
    try:
        data = json.loads(p.read_text())
        # Coerce so the declared ``bool`` return type holds even when the
        # file stores null, a number or a string under "consented".
        return bool(data.get("consented", False))
    except Exception:
        # Best effort: any read/parse problem (including a non-object JSON
        # document) counts as "not consented".
        return False

check_consent_from_server function · python · L41-L59 (19 LOC)

src/hive_mcp/consent.py

async def check_consent_from_server() -> bool:
    """Ask the server whether the user already accepted the agreement.

    Sends a GET to ``{DIRECTOR_URL}/api/v1/user/consent`` with ``API_KEY`` as
    a bearer token. When the response is successful and its JSON body has a
    truthy ``"consented"`` entry, the decision is written to the local cache
    via ``save_consent(True)``.

    Returns:
        ``True`` if the server reported consent and it was cached locally;
        ``False`` when no API key is configured, the server did not report
        consent, or any exception occurred (logged at debug level).
    """
    if not API_KEY:
        return False
    try:
        async with httpx.AsyncClient(timeout=5, trust_env=False) as client:
            response = await client.get(
                f"{DIRECTOR_URL}/api/v1/user/consent",
                headers={"Authorization": f"Bearer {API_KEY}"},
            )
            if not response.is_success:
                return False
            payload = response.json()
            if not payload.get("consented"):
                return False
            # Mirror the server-side decision into the local cache.
            save_consent(True)
            logger.info("用户已在网站注册时同意协议，自动激活")
            return True
    except Exception as e:
        logger.debug("查询服务器 consent 失败: %s", e)
    return False

save_consent function · python · L62-L67 (6 LOC)

src/hive_mcp/consent.py

def save_consent(agreed: bool) -> None:
    """Persist the consent decision to ``CONSENT_FILE``.

    The JSON payload (the decision plus a UTC ISO-8601 timestamp) is written
    to a sibling file with a ``.tmp`` suffix first and then moved over the
    target with ``Path.replace``. Missing parent directories are created.

    Args:
        agreed: The decision to store under the ``"consented"`` key.
    """
    target = Path(CONSENT_FILE)
    target.parent.mkdir(parents=True, exist_ok=True)

    record = {
        "consented": agreed,
        "timestamp": datetime.now(timezone.utc).isoformat(),
    }
    staging = target.with_suffix(".tmp")
    staging.write_text(json.dumps(record, indent=2))
    staging.replace(target)

clean_html function · python · L62-L88 (27 LOC)

src/hive_mcp/content_utils.py

def clean_html(raw_html: str) -> str:
    """Strip script/style/ad/noise markup and return the cleaned HTML. (T-02)

    With BeautifulSoup available, every tag named in ``_NOISE_TAGS`` is
    removed, followed by every element whose ``class`` or ``id`` matches
    ``_NOISE_PATTERN``. Without it, a regex fallback removes only
    ``<script>`` and ``<style>`` blocks.

    Args:
        raw_html: HTML source text; an empty/falsy value yields ``""``.

    Returns:
        The cleaned HTML as a string.
    """
    if not raw_html:
        return ""

    if _HAS_BS4:
        soup = BeautifulSoup(raw_html, "html.parser")

        # Remove tags that are noise by name.
        for tag in soup.find_all(_NOISE_TAGS):
            # find_all() returned this tag before the loop started; it may
            # have been destroyed since, together with a noisy ancestor.
            if tag.decomposed:
                continue
            tag.decompose()

        # Remove ad-like elements identified by class/id.
        for tag in soup.find_all(True):
            # decompose() also destroys every descendant, and a destroyed
            # tag no longer has attributes, so calling .get() on it fails.
            # Skip anything that went away with an earlier match.
            if tag.decomposed:
                continue
            cls = " ".join(tag.get("class", []))
            tid = tag.get("id", "")
            if _NOISE_PATTERN.search(cls) or _NOISE_PATTERN.search(tid):
                tag.decompose()

        return str(soup)
    else:
        # Fallback: regex-based cleanup of script and style blocks only.
        content = re.sub(r"<script[^>]*>.*?</script>", "", raw_html,
                         flags=re.DOTALL | re.IGNORECASE)
        content = re.sub(r"<style[^>]*>.*?</style>", "", content,
                         flags=re.DOTALL | re.IGNORECASE)
        return content

html_to_markdown function · python · L95-L130 (36 LOC)

src/hive_mcp/content_utils.py

def html_to_markdown(raw_html: str, base_url: str = "") -> str:
    """将 HTML 转换为 Markdown，优先用 trafilatura 提取正文。(T-01 + T-02)"""
    if not raw_html:
        return ""

    # T-02: 优先使用 trafilatura 提取正文 (Readability 算法)
    if _HAS_TRAFILATURA:
        try:
            extracted = trafilatura.extract(
                raw_html, url=base_url,
                include_links=True, include_formatting=True,
                output_format="txt",
                favor_precision=True,
            )
            if extracted and len(extracted) > 100:
                return extracted.strip()
        except Exception:
            pass  # fallback to html2text

    # Fallback: html2text (全页转换)
    cleaned = clean_html(raw_html)

    if _HAS_HTML2TEXT:
        h = _html2text.HTML2Text()
        h.ignore_links = False
        h.ignore_images = True         # 减少干扰
        h.ignore_emphasis = False
        h.body_width = 0               # 不强制换行
        h.skip_internal_links = True
        h.inline_links =

Provenance: Repobility (https://repobility.com) — every score reproducible from /scan/

_html_to_text_fallback function · python · L139-L145 (7 LOC)

src/hive_mcp/content_utils.py

def _html_to_text_fallback(html_content: str) -> str:
    """Fallback conversion: strip tags with plain regexes.

    Each tag is replaced by a space, HTML entities are decoded, runs of
    spaces/tabs collapse to one space, and three or more consecutive
    newlines collapse to two. The result is stripped of surrounding
    whitespace.
    """
    without_tags = re.sub(r"<[^>]+>", " ", html_content)
    decoded = html.unescape(without_tags)
    # Whitespace normalisation passes, applied in order.
    for pattern, replacement in ((r"[ \t]+", " "), (r"\n{3,}", "\n\n")):
        decoded = re.sub(pattern, replacement, decoded)
    return decoded.strip()

format_as_json function · python · L152-L158 (7 LOC)

src/hive_mcp/content_utils.py

def format_as_json(raw_body: str) -> str:
    """Pretty-print ``raw_body`` when it is valid JSON.

    Returns:
        The document re-serialised with two-space indentation and non-ASCII
        characters kept as-is, or ``raw_body`` unchanged when it cannot be
        parsed.
    """
    try:
        parsed = json.loads(raw_body)
    except (json.JSONDecodeError, TypeError):
        # Not JSON (or not a string at all): hand the input back untouched.
        return raw_body
    return json.dumps(parsed, ensure_ascii=False, indent=2)

build_search_url function · python · L165-L187 (23 LOC)

src/hive_mcp/content_utils.py

def build_search_url(query: str, engine: str = "duckduckgo",
                     country_code: Optional[str] = None) -> str:
    """
    DEPRECATED: Use hive_mcp.tools.search.web_search() instead.

    Build the result-page URL for a search query.

    Args:
        query: Search terms; URL-encoded with ``quote_plus``.
        engine: ``"bing"`` or ``"google"``; any other value selects the
            DuckDuckGo HTML endpoint (the default, no API key needed).
        country_code: Optional region code. Surrounding whitespace is
            ignored and the value is URL-encoded before being appended, so
            it cannot add extra query parameters.

    Returns:
        The search URL as a string.
    """
    q = quote_plus(query)
    region = (country_code or "").strip()

    if engine == "bing":
        url = f"https://www.bing.com/search?q={q}&count=20&setlang=zh-CN"
        if region:
            url += f"&cc={quote_plus(region.upper())}"
        return url

    if engine == "google":
        url = f"https://www.google.com/search?q={q}&num=20&hl=zh-CN"
        if region:
            url += f"&gl={quote_plus(region.lower())}"
        return url

    # DuckDuckGo (default).
    url = f"https://html.duckduckgo.com/html/?q={q}"
    if region:
        # NOTE(review): DuckDuckGo's `kl` values normally look like
        # `<country>-<language>` (e.g. `us-en`); repeating the country code
        # is kept from the original implementation - confirm.
        code = quote_plus(region.lower())
        url += f"&kl={code}-{code}"
    return url

parse_search_results function · python · L190-L210 (21 LOC)

src/hive_mcp/content_utils.py

def parse_search_results(raw_html: str, engine: str = "duckduckgo",
                         max_results: int = 10) -> list[dict]:
    """
    DEPRECATED: Use hive_mcp.tools.search.web_search() instead.

    Parse a search result page into a structured list.

    Args:
        raw_html: HTML of the result page.
        engine: ``"duckduckgo"``, ``"bing"`` or ``"google"``; any other value
            produces an empty list.
        max_results: Limit forwarded to the engine-specific parser.

    Returns:
        Whatever the engine-specific parser returns, or ``[]`` when
        BeautifulSoup is unavailable or the engine is not recognised.
    """
    if not _HAS_BS4:
        return []

    document = BeautifulSoup(raw_html, "html.parser")

    # Engine name -> parser, checked in the same order as before.
    dispatch = (
        ("duckduckgo", _parse_ddg),
        ("bing", _parse_bing),
        ("google", _parse_google),
    )
    for engine_name, parser in dispatch:
        if engine == engine_name:
            return parser(document, max_results)
    return []

_parse_ddg function · python · L213-L245 (33 LOC)

src/hive_mcp/content_utils.py

def _parse_ddg(soup: "BeautifulSoup", max_results: int) -> list[dict]:
    """DEPRECATED: Use hive_mcp.tools.search.web_search() instead."""
    results = []
    # DDG HTML endpoint 的结果结构
    for item in soup.select(".result, .web-result")[:max_results * 2]:
        title_el = item.select_one(".result__title a, .result__a, h2 a")
        snippet_el = item.select_one(".result__snippet, .result__body")

        if not title_el:
            continue

        title = title_el.get_text(strip=True)
        href = title_el.get("href", "")

        # DDG 通过 uddg 参数传递真实 URL
        if "//duckduckgo.com/l/" in href or href.startswith("/l/"):
            from urllib.parse import urlparse, parse_qs
            try:
                params = parse_qs(urlparse(href).query)
                href = params.get("uddg", [href])[0]
            except Exception:
                pass

        if href.startswith("//"):
            href = "https:" + href

        snippet = snippet_el.get_text(strip=True) if sni

_parse_bing function · python · L248-L261 (14 LOC)

src/hive_mcp/content_utils.py

def _parse_bing(soup: "BeautifulSoup", max_results: int) -> list[dict]:
    """DEPRECATED: Use hive_mcp.tools.search.web_search() instead.

    Collect results from the first ``max_results`` ``li.b_algo`` elements of
    a parsed Bing page.

    Returns:
        A list of ``{"title", "url", "snippet"}`` dicts. Entries without a
        heading link, or with an empty title or URL, are left out; a missing
        snippet becomes ``""``.
    """
    hits: list[dict] = []
    for entry in soup.select("li.b_algo")[:max_results]:
        anchor = entry.select_one("h2 a")
        if not anchor:
            continue

        heading = anchor.get_text(strip=True)
        link = anchor.get("href", "")
        if not (heading and link):
            continue

        blurb_el = entry.select_one(".b_caption p, .b_snippet, p")
        blurb = blurb_el.get_text(strip=True) if blurb_el else ""
        hits.append({"title": heading, "url": link, "snippet": blurb})
    return hits

_parse_google function · python · L264-L287 (24 LOC)

src/hive_mcp/content_utils.py

def _parse_google(soup: "BeautifulSoup", max_results: int) -> list[dict]:
    """DEPRECATED: Use hive_mcp.tools.search.web_search() instead."""
    results = []
    for item in soup.select("div.g, div[data-sokoban-container]")[:max_results * 2]:
        title_el = item.select_one("h3")
        link_el = item.select_one("a[href]")
        snippet_el = item.select_one("div.VwiC3b, span.aCOpRe, div[data-sncf]")
        if not (title_el and link_el):
            continue
        title = title_el.get_text(strip=True)
        url = link_el.get("href", "")
        if url.startswith("/url?"):
            from urllib.parse import urlparse, parse_qs
            try:
                params = parse_qs(urlparse(url).query)
                url = params.get("q", [url])[0]
            except Exception:
                pass
        snippet = snippet_el.get_text(strip=True) if snippet_el else ""
        if title and url and url.startswith("http"):
            results.append({"title": title, "url": url,

format_search_results_md function · python · L290-L310 (21 LOC)

src/hive_mcp/content_utils.py

def format_search_results_md(results: list[dict], query: str, engine: str) -> str:
    """Render search results as a Markdown string.

    Args:
        results: Result dicts; each needs ``"title"`` and ``"url"`` and may
            carry a ``"snippet"``.
        query: The search terms, echoed in the output.
        engine: The engine name, echoed in the header.

    Returns:
        A "no results" notice when ``results`` is empty; otherwise a header
        followed by one numbered section per result, with the snippet as a
        block quote when present.
    """
    if not results:
        return (
            f"❌ 未找到 `{query}` 的搜索结果。\n"
            "建议：检查关键词拼写，或换用其他搜索引擎（bing/google/duckduckgo）。"
        )

    header = (
        "## 🔍 搜索结果\n"
        f"**关键词:** `{query}`  |  **引擎:** `{engine}`  |  **共 {len(results)} 条**\n"
        "\n"
        "---"
    )
    sections = [header]
    for position, item in enumerate(results, start=1):
        section = f"\n### {position}. {item['title']}\n🔗 <{item['url']}>"
        snippet = item.get("snippet")
        if snippet:
            section += f"\n\n> {snippet}"
        sections.append(section)

    return "\n".join(sections)

Repobility — same analyzer, your code, free for public repos · /scan/

truncate function · python · L317-L337 (21 LOC)

src/hive_mcp/content_utils.py

def truncate(content: str, max_chars: int, start_index: int = 0) -> tuple[str, bool, int]:
    """
    Return one page of ``content`` for paginated reading of long documents.

    Args:
        content: The full text.
        max_chars: Maximum number of characters to return. Expected to be
            positive; with ``0`` nothing is returned and the next index does
            not advance.
        start_index: Offset of the first character to return. Negative
            values are treated as ``0``.

    Returns:
        ``(text, truncated, next_start_index)``. ``next_start_index`` is an
        offset into the original ``content`` and is ``0`` when nothing was
        cut off.
    """
    if not content:
        return content, False, 0

    # A negative offset was never applied to the slice but used to leak into
    # the returned continuation index; clamp it so both agree.
    start_index = max(start_index, 0)
    remaining = content[start_index:] if start_index else content

    if len(remaining) <= max_chars:
        return remaining, False, 0

    # Prefer cutting at a paragraph break, but only if that keeps at least
    # 80% of the allowed length; otherwise cut hard at max_chars.
    cutoff = remaining.rfind("\n\n", 0, max_chars)
    if cutoff < max_chars * 0.8:
        cutoff = max_chars
    return remaining[:cutoff], True, start_index + cutoff

dependency_status function · python · L340-L348 (9 LOC)

src/hive_mcp/content_utils.py

def dependency_status() -> dict:
    """Report which optional dependencies are installed (for debugging).

    Returns:
        A dict with one flag per optional package plus two derived entries:
        ``"markdown_conversion"`` (html2text or trafilatura present) and
        ``"search_parsing"`` (BeautifulSoup present).
    """
    status = {}
    status["html2text"] = _HAS_HTML2TEXT
    status["beautifulsoup4"] = _HAS_BS4
    status["trafilatura"] = _HAS_TRAFILATURA
    # Derived capabilities.
    status["markdown_conversion"] = _HAS_HTML2TEXT or _HAS_TRAFILATURA
    status["search_parsing"] = _HAS_BS4
    return status

_fetch_html function · python · L24-L30 (7 LOC)

src/hive_mcp/platforms/ecommerce.py

async def _fetch_html(url: str, proxy_url: Optional[str] = None, headers: Optional[dict] = None) -> str:
    """Fetch a page's HTML, optionally through a proxy.

    Args:
        url: Address to request.
        proxy_url: Proxy passed to the HTTP client; ``None`` for no proxy.
        headers: Extra request headers; they override entries of the
            module-level ``_HEADERS`` with the same key.

    Returns:
        The response body as text.

    Raises:
        Whatever ``raise_for_status()`` raises for an error response, plus
        any error from the request itself.
    """
    request_headers = dict(_HEADERS)
    if headers:
        request_headers.update(headers)

    async with httpx.AsyncClient(proxy=proxy_url, timeout=30.0, follow_redirects=True) as session:
        response = await session.get(url, headers=request_headers)
        response.raise_for_status()
        return response.text

amazon_search function · python · L33-L72 (40 LOC)

src/hive_mcp/platforms/ecommerce.py

async def amazon_search(
    query: str,
    country: str = "us",  # us, uk, de, jp, etc.
    count: int = 10,
    proxy_url: Optional[str] = None,
) -> list[dict]:
    """搜索 Amazon 商品。"""
    try:
        from bs4 import BeautifulSoup
    except ImportError:
        return [{"error": "beautifulsoup4 未安装"}]

    domains = {"us": "www.amazon.com", "uk": "www.amazon.co.uk", "de": "www.amazon.de", "jp": "www.amazon.co.jp", "cn": "www.amazon.cn"}
    domain = domains.get(country, "www.amazon.com")
    url = f"https://{domain}/s?k={quote_plus(query)}"

    html = await _fetch_html(url, proxy_url)
    soup = BeautifulSoup(html, "html.parser")

    products = []
    for item in soup.select('[data-component-type="s-search-result"]')[:count]:
        title_el = item.select_one("h2 a span") or item.select_one("h2 span")
        price_el = item.select_one(".a-price .a-offscreen")
        rating_el = item.select_one(".a-icon-alt")
        reviews_el = item.select_one('[aria-label*="stars"] + span'

amazon_product_detail function · python · L75-L103 (29 LOC)

src/hive_mcp/platforms/ecommerce.py

async def amazon_product_detail(
    url: str,
    proxy_url: Optional[str] = None,
) -> dict:
    """获取 Amazon 商品详情。"""
    try:
        from bs4 import BeautifulSoup
    except ImportError:
        return {"error": "beautifulsoup4 未安装"}

    html = await _fetch_html(url, proxy_url)
    soup = BeautifulSoup(html, "html.parser")

    title = soup.select_one("#productTitle")
    price = soup.select_one(".a-price .a-offscreen") or soup.select_one("#priceblock_ourprice")
    rating = soup.select_one("#acrPopover .a-icon-alt")
    reviews = soup.select_one("#acrCustomerReviewText")
    description = soup.select_one("#productDescription")
    features = soup.select("#feature-bullets li span.a-list-item")

    return {
        "title": title.get_text(strip=True) if title else None,
        "price": price.get_text(strip=True) if price else None,
        "rating": rating.get_text(strip=True) if rating else None,
        "reviews_count": reviews.get_text(strip=True) if reviews else None,

amazon_reviews function · python · L106-L141 (36 LOC)

src/hive_mcp/platforms/ecommerce.py

async def amazon_reviews(
    asin: str,
    country: str = "us",
    count: int = 10,
    proxy_url: Optional[str] = None,
) -> list[dict]:
    """获取 Amazon 商品评论。"""
    try:
        from bs4 import BeautifulSoup
    except ImportError:
        return [{"error": "beautifulsoup4 未安装"}]

    domains = {"us": "www.amazon.com", "uk": "www.amazon.co.uk", "de": "www.amazon.de", "jp": "www.amazon.co.jp"}
    domain = domains.get(country, "www.amazon.com")
    url = f"https://{domain}/product-reviews/{asin}?sortBy=recent"

    html = await _fetch_html(url, proxy_url)
    soup = BeautifulSoup(html, "html.parser")

    reviews = []
    for review in soup.select('[data-hook="review"]')[:count]:
        title_el = review.select_one('[data-hook="review-title"] span:last-child')
        body_el = review.select_one('[data-hook="review-body"] span')
        rating_el = review.select_one('[data-hook="review-star-rating"] .a-icon-alt')
        author_el = review.select_one('.a-profile-name')
        da

_get_loader function · python · L34-L57 (24 LOC)