Files
Aether/src/services/health/monitor.py
fawney19 09e0f594ff refactor: 重构限流系统和健康监控,支持按 API 格式区分
- 将 adaptive_concurrency 重命名为 adaptive_rpm,从并发控制改为 RPM 控制
- 健康监控器支持按 API 格式独立管理健康度和熔断器状态
- 新增 model_permissions 模块,支持按格式配置允许的模型
- 重构前端提供商相关表单组件,新增 Collapsible UI 组件
- 新增数据库迁移脚本支持新的数据结构
2026-01-10 18:48:35 +08:00

945 lines
38 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
健康监控器 - Endpoint 和 Key 的健康度追踪(按 API 格式区分)
功能:
1. 基于滑动窗口的错误率计算(按 API 格式独立)
2. 三态熔断器:关闭 -> 打开 -> 半开 -> 关闭(按 API 格式独立)
3. 半开状态允许少量请求验证服务恢复
4. 提供健康度查询和管理 API
数据结构:
- health_by_format: {"CLAUDE": {"health_score": 1.0, "consecutive_failures": 0, ...}, ...}
- circuit_breaker_by_format: {"CLAUDE": {"open": false, "open_at": null, ...}, ...}
"""
import os
from datetime import datetime, timedelta, timezone
from typing import Any, Dict, List, Optional, Tuple
from sqlalchemy import case, func
from sqlalchemy.orm import Session
from src.config.constants import CircuitBreakerDefaults
from src.core.batch_committer import get_batch_committer
from src.core.logger import logger
from src.core.metrics import health_open_circuits
from src.models.database import ProviderAPIKey, ProviderEndpoint
class CircuitState:
"""熔断器状态"""
CLOSED = "closed" # 关闭(正常)
OPEN = "open" # 打开(熔断)
HALF_OPEN = "half_open" # 半开(验证恢复)
# 默认健康度数据结构
def _default_health_data() -> Dict[str, Any]:
return {
"health_score": 1.0,
"consecutive_failures": 0,
"last_failure_at": None,
"request_results_window": [],
}
# 默认熔断器数据结构
def _default_circuit_data() -> Dict[str, Any]:
return {
"open": False,
"open_at": None,
"next_probe_at": None,
"half_open_until": None,
"half_open_successes": 0,
"half_open_failures": 0,
}
class HealthMonitor:
"""健康监控器(滑动窗口 + 半开状态模式,按 API 格式区分)"""
# === 滑动窗口配置 ===
WINDOW_SIZE = int(os.getenv("HEALTH_WINDOW_SIZE", str(CircuitBreakerDefaults.WINDOW_SIZE)))
WINDOW_SECONDS = int(
os.getenv("HEALTH_WINDOW_SECONDS", str(CircuitBreakerDefaults.WINDOW_SECONDS))
)
MIN_REQUESTS = int(
os.getenv("HEALTH_MIN_REQUESTS", str(CircuitBreakerDefaults.MIN_REQUESTS_FOR_DECISION))
)
ERROR_RATE_THRESHOLD = float(
os.getenv("HEALTH_ERROR_RATE_THRESHOLD", str(CircuitBreakerDefaults.ERROR_RATE_THRESHOLD))
)
# === 半开状态配置 ===
HALF_OPEN_DURATION = int(
os.getenv(
"HEALTH_HALF_OPEN_DURATION", str(CircuitBreakerDefaults.HALF_OPEN_DURATION_SECONDS)
)
)
HALF_OPEN_SUCCESS_THRESHOLD = int(
os.getenv(
"HEALTH_HALF_OPEN_SUCCESS", str(CircuitBreakerDefaults.HALF_OPEN_SUCCESS_THRESHOLD)
)
)
HALF_OPEN_FAILURE_THRESHOLD = int(
os.getenv(
"HEALTH_HALF_OPEN_FAILURE", str(CircuitBreakerDefaults.HALF_OPEN_FAILURE_THRESHOLD)
)
)
# === 恢复配置 ===
INITIAL_RECOVERY_SECONDS = int(
os.getenv(
"HEALTH_INITIAL_RECOVERY_SECONDS", str(CircuitBreakerDefaults.INITIAL_RECOVERY_SECONDS)
)
)
RECOVERY_BACKOFF = int(
os.getenv(
"HEALTH_RECOVERY_BACKOFF", str(CircuitBreakerDefaults.RECOVERY_BACKOFF_MULTIPLIER)
)
)
MAX_RECOVERY_SECONDS = int(
os.getenv("HEALTH_MAX_RECOVERY_SECONDS", str(CircuitBreakerDefaults.MAX_RECOVERY_SECONDS))
)
# === 兼容旧参数(用于健康度展示)===
SUCCESS_INCREMENT = float(
os.getenv("HEALTH_SUCCESS_INCREMENT", str(CircuitBreakerDefaults.SUCCESS_INCREMENT))
)
FAILURE_DECREMENT = float(
os.getenv("HEALTH_FAILURE_DECREMENT", str(CircuitBreakerDefaults.FAILURE_DECREMENT))
)
PROBE_RECOVERY_SCORE = float(
os.getenv("HEALTH_PROBE_RECOVERY_SCORE", str(CircuitBreakerDefaults.PROBE_RECOVERY_SCORE))
)
# === 其他配置 ===
ALLOW_AUTO_RECOVER = os.getenv("HEALTH_AUTO_RECOVER_ENABLED", "true").lower() == "true"
CIRCUIT_HISTORY_LIMIT = int(os.getenv("HEALTH_CIRCUIT_HISTORY_LIMIT", "200"))
# 进程级别状态缓存
_circuit_history: List[Dict[str, Any]] = []
_open_circuit_keys: int = 0
# ==================== 数据访问辅助方法 ====================
@classmethod
def _get_health_data(cls, key: ProviderAPIKey, api_format: str) -> Dict[str, Any]:
"""获取指定格式的健康度数据,不存在则返回默认值"""
health_by_format = key.health_by_format or {}
if api_format not in health_by_format:
return _default_health_data()
return health_by_format[api_format]
@classmethod
def _set_health_data(cls, key: ProviderAPIKey, api_format: str, data: Dict[str, Any]) -> None:
"""设置指定格式的健康度数据"""
health_by_format = dict(key.health_by_format or {})
health_by_format[api_format] = data
key.health_by_format = health_by_format # type: ignore[assignment]
@classmethod
def _get_circuit_data(cls, key: ProviderAPIKey, api_format: str) -> Dict[str, Any]:
"""获取指定格式的熔断器数据,不存在则返回默认值"""
circuit_by_format = key.circuit_breaker_by_format or {}
if api_format not in circuit_by_format:
return _default_circuit_data()
return circuit_by_format[api_format]
@classmethod
def _set_circuit_data(cls, key: ProviderAPIKey, api_format: str, data: Dict[str, Any]) -> None:
"""设置指定格式的熔断器数据"""
circuit_by_format = dict(key.circuit_breaker_by_format or {})
circuit_by_format[api_format] = data
key.circuit_breaker_by_format = circuit_by_format # type: ignore[assignment]
# ==================== 核心方法 ====================
@classmethod
def record_success(
cls,
db: Session,
key_id: Optional[str] = None,
api_format: Optional[str] = None,
response_time_ms: Optional[int] = None,
) -> None:
"""记录成功请求(按 API 格式)
Args:
db: 数据库会话
key_id: Key ID必需
api_format: API 格式(必需,用于区分不同格式的健康度)
response_time_ms: 响应时间(可选)
Note:
api_format 在逻辑上是必需的,但为了向后兼容保持 Optional 签名。
如果未提供,会尝试从 Key 的 api_formats 中获取第一个格式作为 fallback。
"""
try:
if not key_id:
return
key = db.query(ProviderAPIKey).filter(ProviderAPIKey.id == key_id).first()
if not key:
return
# api_format 兼容处理:如果未提供,尝试使用 Key 的第一个格式
effective_api_format = api_format
if not effective_api_format:
if key.api_formats and len(key.api_formats) > 0:
effective_api_format = key.api_formats[0]
logger.debug(
f"record_success: api_format 未提供,使用默认格式 {effective_api_format}"
)
else:
logger.warning(
f"record_success: api_format 未提供且 Key 无可用格式: key_id={key_id[:8]}..."
)
return
now = datetime.now(timezone.utc)
now_ts = now.timestamp()
# 获取当前格式的健康度数据
health_data = cls._get_health_data(key, effective_api_format)
circuit_data = cls._get_circuit_data(key, effective_api_format)
# 1. 更新滑动窗口
window = health_data.get("request_results_window") or []
window.append({"ts": now_ts, "ok": True})
cutoff_ts = now_ts - cls.WINDOW_SECONDS
window = [r for r in window if r["ts"] > cutoff_ts]
if len(window) > cls.WINDOW_SIZE:
window = window[-cls.WINDOW_SIZE :]
health_data["request_results_window"] = window
# 2. 更新健康度(用于展示)
current_score = float(health_data.get("health_score") or 0)
new_score = min(current_score + cls.SUCCESS_INCREMENT, 1.0)
health_data["health_score"] = new_score
# 3. 更新统计
health_data["consecutive_failures"] = 0
health_data["last_failure_at"] = None
# 4. 处理熔断器状态
state = cls._get_circuit_state_from_data(circuit_data, now)
if state == CircuitState.HALF_OPEN:
# 半开状态:记录成功
circuit_data["half_open_successes"] = int(
circuit_data.get("half_open_successes") or 0
) + 1
if circuit_data["half_open_successes"] >= cls.HALF_OPEN_SUCCESS_THRESHOLD:
# 达到成功阈值,关闭熔断器
cls._close_circuit_data(circuit_data, health_data, reason="半开状态验证成功")
cls._push_circuit_event(
{
"event": "closed",
"key_id": key.id,
"api_format": effective_api_format,
"reason": "半开状态验证成功",
"timestamp": now.isoformat(),
}
)
logger.info(
f"[CLOSED] Key 熔断器关闭: {key.id[:8]}.../{effective_api_format} | 原因: 半开状态验证成功"
)
elif state == CircuitState.OPEN:
# 打开状态下的成功(探测成功),进入半开状态
cls._enter_half_open_data(circuit_data, now)
cls._push_circuit_event(
{
"event": "half_open",
"key_id": key.id,
"api_format": effective_api_format,
"timestamp": now.isoformat(),
}
)
logger.info(
f"[HALF-OPEN] Key 进入半开状态: {key.id[:8]}.../{effective_api_format} | "
f"需要 {cls.HALF_OPEN_SUCCESS_THRESHOLD} 次成功关闭熔断器"
)
# 保存数据
cls._set_health_data(key, effective_api_format, health_data)
cls._set_circuit_data(key, effective_api_format, circuit_data)
# 更新全局统计
key.success_count = int(key.success_count or 0) + 1 # type: ignore[assignment]
key.request_count = int(key.request_count or 0) + 1 # type: ignore[assignment]
if response_time_ms:
key.total_response_time_ms = int(key.total_response_time_ms or 0) + response_time_ms # type: ignore[assignment]
db.flush()
get_batch_committer().mark_dirty(db)
except Exception as e:
logger.error(f"记录成功请求失败: {e}")
db.rollback()
@classmethod
def record_failure(
cls,
db: Session,
key_id: Optional[str] = None,
api_format: Optional[str] = None,
error_type: Optional[str] = None,
) -> None:
"""记录失败请求(按 API 格式)
Args:
db: 数据库会话
key_id: Key ID必需
api_format: API 格式(必需,用于区分不同格式的健康度)
error_type: 错误类型(可选)
Note:
api_format 在逻辑上是必需的,但为了向后兼容保持 Optional 签名。
如果未提供,会尝试从 Key 的 api_formats 中获取第一个格式作为 fallback。
"""
try:
if not key_id:
return
key = db.query(ProviderAPIKey).filter(ProviderAPIKey.id == key_id).first()
if not key:
return
# api_format 兼容处理:如果未提供,尝试使用 Key 的第一个格式
effective_api_format = api_format
if not effective_api_format:
if key.api_formats and len(key.api_formats) > 0:
effective_api_format = key.api_formats[0]
logger.debug(
f"record_failure: api_format 未提供,使用默认格式 {effective_api_format}"
)
else:
logger.warning(
f"record_failure: api_format 未提供且 Key 无可用格式: key_id={key_id[:8]}..."
)
return
now = datetime.now(timezone.utc)
now_ts = now.timestamp()
# 获取当前格式的健康度数据
health_data = cls._get_health_data(key, effective_api_format)
circuit_data = cls._get_circuit_data(key, effective_api_format)
# 1. 更新滑动窗口
window = health_data.get("request_results_window") or []
window.append({"ts": now_ts, "ok": False})
cutoff_ts = now_ts - cls.WINDOW_SECONDS
window = [r for r in window if r["ts"] > cutoff_ts]
if len(window) > cls.WINDOW_SIZE:
window = window[-cls.WINDOW_SIZE :]
health_data["request_results_window"] = window
# 2. 更新健康度(用于展示)
current_score = float(health_data.get("health_score") or 1)
new_score = max(current_score - cls.FAILURE_DECREMENT, 0.0)
health_data["health_score"] = new_score
# 3. 更新统计
health_data["consecutive_failures"] = (
int(health_data.get("consecutive_failures") or 0) + 1
)
health_data["last_failure_at"] = now.isoformat()
# 4. 处理熔断器状态
state = cls._get_circuit_state_from_data(circuit_data, now)
if state == CircuitState.HALF_OPEN:
# 半开状态:记录失败
circuit_data["half_open_failures"] = int(
circuit_data.get("half_open_failures") or 0
) + 1
if circuit_data["half_open_failures"] >= cls.HALF_OPEN_FAILURE_THRESHOLD:
# 达到失败阈值,重新打开熔断器
# 注意:半开状态本身就是打开状态的子状态,不需要增加计数
consecutive = int(health_data.get("consecutive_failures") or 0)
recovery_seconds = cls._calculate_recovery_seconds(consecutive)
cls._open_circuit_data(
circuit_data, now, recovery_seconds, reason="半开状态验证失败"
)
cls._push_circuit_event(
{
"event": "opened",
"key_id": key.id,
"api_format": effective_api_format,
"reason": "半开状态验证失败",
"recovery_seconds": recovery_seconds,
"timestamp": now.isoformat(),
}
)
logger.warning(
f"[OPEN] Key 熔断器打开: {key.id[:8]}.../{effective_api_format} | 原因: 半开状态验证失败 | "
f"{recovery_seconds}秒后进入半开状态"
)
elif state == CircuitState.CLOSED:
# 关闭状态:检查是否需要打开熔断器
error_rate = cls._calculate_error_rate_from_window(window, now_ts)
if len(window) >= cls.MIN_REQUESTS and error_rate >= cls.ERROR_RATE_THRESHOLD:
consecutive = int(health_data.get("consecutive_failures") or 0)
recovery_seconds = cls._calculate_recovery_seconds(consecutive)
reason = f"错误率 {error_rate:.0%} 超过阈值 {cls.ERROR_RATE_THRESHOLD:.0%}"
cls._open_circuit_data(circuit_data, now, recovery_seconds, reason=reason)
cls._open_circuit_keys += 1
health_open_circuits.set(cls._open_circuit_keys)
cls._push_circuit_event(
{
"event": "opened",
"key_id": key.id,
"api_format": effective_api_format,
"reason": reason,
"recovery_seconds": recovery_seconds,
"timestamp": now.isoformat(),
}
)
logger.warning(
f"[OPEN] Key 熔断器打开: {key.id[:8]}.../{effective_api_format} | 原因: {reason} | "
f"{recovery_seconds}秒后进入半开状态"
)
# 保存数据
cls._set_health_data(key, effective_api_format, health_data)
cls._set_circuit_data(key, effective_api_format, circuit_data)
# 更新全局统计
key.error_count = int(key.error_count or 0) + 1 # type: ignore[assignment]
key.request_count = int(key.request_count or 0) + 1 # type: ignore[assignment]
key.last_error_at = now # type: ignore[assignment]
logger.debug(
f"[WARN] Key 健康度下降: {key_id[:8]}.../{effective_api_format} -> {new_score:.2f} "
f"(连续失败 {health_data['consecutive_failures']} 次, error_type={error_type})"
)
db.flush()
get_batch_committer().mark_dirty(db)
except Exception as e:
logger.error(f"记录失败请求失败: {e}")
db.rollback()
# ==================== 滑动窗口方法 ====================
@classmethod
def _calculate_error_rate_from_window(
cls, window: List[Dict[str, Any]], now_ts: float
) -> float:
"""从窗口数据计算错误率"""
if not window:
return 0.0
cutoff_ts = now_ts - cls.WINDOW_SECONDS
valid_records = [r for r in window if r["ts"] > cutoff_ts]
if not valid_records:
return 0.0
failures = sum(1 for r in valid_records if not r["ok"])
return failures / len(valid_records)
# ==================== 熔断器状态方法(操作数据字典)====================
@classmethod
def _get_circuit_state_from_data(cls, circuit_data: Dict[str, Any], now: datetime) -> str:
"""从数据字典获取当前熔断器状态"""
if not circuit_data.get("open"):
return CircuitState.CLOSED
# 检查是否在半开状态
half_open_until_str = circuit_data.get("half_open_until")
if half_open_until_str:
half_open_until = datetime.fromisoformat(half_open_until_str)
if now < half_open_until:
return CircuitState.HALF_OPEN
# 检查是否到了探测时间(进入半开)
next_probe_str = circuit_data.get("next_probe_at")
if next_probe_str:
next_probe_at = datetime.fromisoformat(next_probe_str)
if now >= next_probe_at:
return CircuitState.HALF_OPEN
return CircuitState.OPEN
@classmethod
def _open_circuit_data(
cls,
circuit_data: Dict[str, Any],
now: datetime,
recovery_seconds: int,
reason: str,
) -> None:
"""打开熔断器(操作数据字典)"""
circuit_data["open"] = True
circuit_data["open_at"] = now.isoformat()
circuit_data["half_open_until"] = None
circuit_data["half_open_successes"] = 0
circuit_data["half_open_failures"] = 0
circuit_data["next_probe_at"] = (now + timedelta(seconds=recovery_seconds)).isoformat()
@classmethod
def _enter_half_open_data(cls, circuit_data: Dict[str, Any], now: datetime) -> None:
"""进入半开状态(操作数据字典)"""
circuit_data["half_open_until"] = (
now + timedelta(seconds=cls.HALF_OPEN_DURATION)
).isoformat()
circuit_data["half_open_successes"] = 0
circuit_data["half_open_failures"] = 0
@classmethod
def _close_circuit_data(
cls, circuit_data: Dict[str, Any], health_data: Dict[str, Any], reason: str
) -> None:
"""关闭熔断器(操作数据字典)"""
circuit_data["open"] = False
circuit_data["open_at"] = None
circuit_data["next_probe_at"] = None
circuit_data["half_open_until"] = None
circuit_data["half_open_successes"] = 0
circuit_data["half_open_failures"] = 0
# 快速恢复健康度
current_score = float(health_data.get("health_score") or 0)
health_data["health_score"] = max(current_score, cls.PROBE_RECOVERY_SCORE)
cls._open_circuit_keys = max(0, cls._open_circuit_keys - 1)
health_open_circuits.set(cls._open_circuit_keys)
@classmethod
def _calculate_recovery_seconds(cls, consecutive_failures: int) -> int:
"""计算恢复等待时间(指数退避)"""
exponent = min(consecutive_failures // 5, 4)
seconds = cls.INITIAL_RECOVERY_SECONDS * (cls.RECOVERY_BACKOFF**exponent)
return min(int(seconds), cls.MAX_RECOVERY_SECONDS)
# ==================== 状态查询方法 ====================
@classmethod
def is_circuit_breaker_closed(
cls, resource: ProviderAPIKey, api_format: Optional[str] = None
) -> bool:
"""检查熔断器是否允许请求通过(按 API 格式)"""
if not api_format:
# 兼容旧调用:检查是否有任何格式的熔断器开启
circuit_by_format = resource.circuit_breaker_by_format or {}
for fmt, circuit_data in circuit_by_format.items():
if circuit_data.get("open"):
return False
return True
circuit_data = cls._get_circuit_data(resource, api_format)
if not circuit_data.get("open"):
return True
now = datetime.now(timezone.utc)
state = cls._get_circuit_state_from_data(circuit_data, now)
# 半开状态允许请求通过
if state == CircuitState.HALF_OPEN:
return True
# 检查是否到了探测时间
next_probe_str = circuit_data.get("next_probe_at")
if next_probe_str:
next_probe_at = datetime.fromisoformat(next_probe_str)
if now >= next_probe_at:
# 自动进入半开状态
cls._enter_half_open_data(circuit_data, now)
cls._set_circuit_data(resource, api_format, circuit_data)
return True
return False
@classmethod
def get_circuit_breaker_status(
cls, resource: ProviderAPIKey, api_format: Optional[str] = None
) -> Tuple[bool, Optional[str]]:
"""获取熔断器详细状态(按 API 格式)"""
if not api_format:
# 兼容旧调用:返回第一个开启的熔断器状态
circuit_by_format = resource.circuit_breaker_by_format or {}
for fmt, circuit_data in circuit_by_format.items():
if circuit_data.get("open"):
return cls._get_status_from_circuit_data(circuit_data)
return True, None
circuit_data = cls._get_circuit_data(resource, api_format)
return cls._get_status_from_circuit_data(circuit_data)
@classmethod
def _get_status_from_circuit_data(
cls, circuit_data: Dict[str, Any]
) -> Tuple[bool, Optional[str]]:
"""从熔断器数据获取状态描述"""
if not circuit_data.get("open"):
return True, None
now = datetime.now(timezone.utc)
state = cls._get_circuit_state_from_data(circuit_data, now)
if state == CircuitState.HALF_OPEN:
successes = int(circuit_data.get("half_open_successes") or 0)
return True, f"半开状态({successes}/{cls.HALF_OPEN_SUCCESS_THRESHOLD}成功)"
next_probe_str = circuit_data.get("next_probe_at")
if next_probe_str:
next_probe_at = datetime.fromisoformat(next_probe_str)
if now >= next_probe_at:
return True, None
remaining = next_probe_at - now
remaining_seconds = int(remaining.total_seconds())
if remaining_seconds >= 60:
time_str = f"{remaining_seconds // 60}min{remaining_seconds % 60}s"
else:
time_str = f"{remaining_seconds}s"
return False, f"熔断中({time_str}后半开)"
return False, "熔断中"
@classmethod
def get_key_health(
cls, db: Session, key_id: str, api_format: Optional[str] = None
) -> Optional[Dict[str, Any]]:
"""获取 Key 健康状态(支持按格式查询)"""
try:
key = db.query(ProviderAPIKey).filter(ProviderAPIKey.id == key_id).first()
if not key:
return None
now = datetime.now(timezone.utc)
now_ts = now.timestamp()
avg_response_time_ms = (
int(key.total_response_time_ms or 0) / int(key.success_count or 1)
if key.success_count
else 0
)
# 全局统计
result = {
"key_id": key.id,
"is_active": key.is_active,
"statistics": {
"request_count": int(key.request_count or 0),
"success_count": int(key.success_count or 0),
"error_count": int(key.error_count or 0),
"success_rate": (
int(key.success_count or 0) / int(key.request_count or 1)
if key.request_count
else 0.0
),
"avg_response_time_ms": round(avg_response_time_ms, 2),
},
}
# 按格式的健康度数据
health_by_format = key.health_by_format or {}
circuit_by_format = key.circuit_breaker_by_format or {}
if api_format:
# 查询单个格式
health_data = cls._get_health_data(key, api_format)
circuit_data = cls._get_circuit_data(key, api_format)
window = health_data.get("request_results_window") or []
valid_window = [r for r in window if r["ts"] > now_ts - cls.WINDOW_SECONDS]
result["api_format"] = api_format
result["health_score"] = float(health_data.get("health_score") or 1.0)
result["error_rate"] = cls._calculate_error_rate_from_window(window, now_ts)
result["window_size"] = len(valid_window)
result["consecutive_failures"] = int(
health_data.get("consecutive_failures") or 0
)
result["last_failure_at"] = health_data.get("last_failure_at")
result["circuit_breaker"] = {
"state": cls._get_circuit_state_from_data(circuit_data, now),
"open": circuit_data.get("open", False),
"open_at": circuit_data.get("open_at"),
"next_probe_at": circuit_data.get("next_probe_at"),
"half_open_until": circuit_data.get("half_open_until"),
"half_open_successes": int(circuit_data.get("half_open_successes") or 0),
"half_open_failures": int(circuit_data.get("half_open_failures") or 0),
}
else:
# 返回所有格式的健康度数据
formats_health = {}
for fmt in (key.api_formats or []):
health_data = health_by_format.get(fmt, _default_health_data())
circuit_data = circuit_by_format.get(fmt, _default_circuit_data())
window = health_data.get("request_results_window") or []
valid_window = [r for r in window if r["ts"] > now_ts - cls.WINDOW_SECONDS]
formats_health[fmt] = {
"health_score": float(health_data.get("health_score") or 1.0),
"error_rate": cls._calculate_error_rate_from_window(window, now_ts),
"window_size": len(valid_window),
"consecutive_failures": int(
health_data.get("consecutive_failures") or 0
),
"last_failure_at": health_data.get("last_failure_at"),
"circuit_breaker": {
"state": cls._get_circuit_state_from_data(circuit_data, now),
"open": circuit_data.get("open", False),
"open_at": circuit_data.get("open_at"),
"next_probe_at": circuit_data.get("next_probe_at"),
"half_open_until": circuit_data.get("half_open_until"),
"half_open_successes": int(
circuit_data.get("half_open_successes") or 0
),
"half_open_failures": int(
circuit_data.get("half_open_failures") or 0
),
},
}
result["health_by_format"] = formats_health
# 计算整体健康度(取最低值)
if formats_health:
result["health_score"] = min(
h["health_score"] for h in formats_health.values()
)
result["any_circuit_open"] = any(
h["circuit_breaker"]["open"] for h in formats_health.values()
)
else:
result["health_score"] = 1.0
result["any_circuit_open"] = False
return result
except Exception as e:
logger.error(f"获取 Key 健康状态失败: {e}")
return None
@classmethod
def get_endpoint_health(cls, db: Session, endpoint_id: str) -> Optional[Dict[str, Any]]:
"""获取 Endpoint 健康状态"""
try:
endpoint = (
db.query(ProviderEndpoint).filter(ProviderEndpoint.id == endpoint_id).first()
)
if not endpoint:
return None
return {
"endpoint_id": endpoint.id,
"health_score": float(endpoint.health_score or 1.0),
"consecutive_failures": int(endpoint.consecutive_failures or 0),
"last_failure_at": (
endpoint.last_failure_at.isoformat() if endpoint.last_failure_at else None
),
"is_active": endpoint.is_active,
}
except Exception as e:
logger.error(f"获取 Endpoint 健康状态失败: {e}")
return None
# ==================== 管理方法 ====================
@classmethod
def reset_health(
cls, db: Session, key_id: Optional[str] = None, api_format: Optional[str] = None
) -> bool:
"""重置健康度(支持按格式重置)"""
try:
if key_id:
key = db.query(ProviderAPIKey).filter(ProviderAPIKey.id == key_id).first()
if key:
if api_format:
# 重置单个格式
cls._set_health_data(key, api_format, _default_health_data())
cls._set_circuit_data(key, api_format, _default_circuit_data())
logger.info(f"[RESET] 重置 Key 健康度: {key_id}/{api_format}")
else:
# 重置所有格式
key.health_by_format = {} # type: ignore[assignment]
key.circuit_breaker_by_format = {} # type: ignore[assignment]
logger.info(f"[RESET] 重置 Key 所有格式健康度: {key_id}")
db.flush()
get_batch_committer().mark_dirty(db)
return True
except Exception as e:
logger.error(f"重置健康度失败: {e}")
db.rollback()
return False
@classmethod
def manually_enable(cls, db: Session, key_id: Optional[str] = None) -> bool:
"""手动启用 Key"""
try:
if key_id:
key = db.query(ProviderAPIKey).filter(ProviderAPIKey.id == key_id).first()
if key and not key.is_active:
key.is_active = True # type: ignore[assignment]
# 重置所有格式的健康度
key.health_by_format = {} # type: ignore[assignment]
key.circuit_breaker_by_format = {} # type: ignore[assignment]
logger.info(f"[OK] 手动启用 Key: {key_id}")
db.flush()
get_batch_committer().mark_dirty(db)
return True
except Exception as e:
logger.error(f"手动启用失败: {e}")
db.rollback()
return False
@classmethod
def get_all_health_status(cls, db: Session) -> Dict[str, Any]:
"""获取所有健康状态摘要"""
try:
endpoint_stats = db.query(
func.count(ProviderEndpoint.id).label("total"),
func.sum(case((ProviderEndpoint.is_active == True, 1), else_=0)).label("active"),
func.sum(case((ProviderEndpoint.health_score < 0.5, 1), else_=0)).label(
"unhealthy"
),
).first()
# 统计 Key需要遍历 JSON 字段计算熔断状态)
keys = db.query(ProviderAPIKey).all()
total_keys = len(keys)
active_keys = sum(1 for k in keys if k.is_active)
unhealthy_keys = 0
circuit_open_keys = 0
for key in keys:
health_by_format = key.health_by_format or {}
circuit_by_format = key.circuit_breaker_by_format or {}
# 检查是否有任何格式健康度低于 0.5
for fmt, health_data in health_by_format.items():
if float(health_data.get("health_score") or 1.0) < 0.5:
unhealthy_keys += 1
break
# 检查是否有任何格式熔断器开启
for fmt, circuit_data in circuit_by_format.items():
if circuit_data.get("open"):
circuit_open_keys += 1
break
return {
"endpoints": {
"total": endpoint_stats.total or 0 if endpoint_stats else 0,
"active": int(endpoint_stats.active or 0) if endpoint_stats else 0,
"unhealthy": int(endpoint_stats.unhealthy or 0) if endpoint_stats else 0,
},
"keys": {
"total": total_keys,
"active": active_keys,
"unhealthy": unhealthy_keys,
"circuit_open": circuit_open_keys,
},
}
except Exception as e:
logger.error(f"获取健康状态摘要失败: {e}")
return {
"endpoints": {"total": 0, "active": 0, "unhealthy": 0},
"keys": {"total": 0, "active": 0, "unhealthy": 0, "circuit_open": 0},
}
# ==================== 历史记录方法 ====================
@classmethod
def _push_circuit_event(cls, event: Dict[str, Any]) -> None:
cls._circuit_history.append(event)
if len(cls._circuit_history) > cls.CIRCUIT_HISTORY_LIMIT:
cls._circuit_history.pop(0)
@classmethod
def get_circuit_history(cls, limit: int = 50) -> List[Dict[str, Any]]:
if limit <= 0:
return []
return cls._circuit_history[-limit:]
# ==================== 兼容旧方法 ====================
@classmethod
def is_eligible_for_probe(
cls,
db: Session,
endpoint_id: Optional[str] = None,
key_id: Optional[str] = None,
api_format: Optional[str] = None,
) -> bool:
"""检查是否有资格进行探测(按 API 格式)"""
if not cls.ALLOW_AUTO_RECOVER:
return False
if endpoint_id:
return False # Endpoint 不支持探测
if key_id:
key = db.query(ProviderAPIKey).filter(ProviderAPIKey.id == key_id).first()
if key:
if api_format:
circuit_data = cls._get_circuit_data(key, api_format)
if circuit_data.get("open"):
now = datetime.now(timezone.utc)
state = cls._get_circuit_state_from_data(circuit_data, now)
return state == CircuitState.HALF_OPEN
else:
# 兼容旧调用:检查是否有任何格式处于半开状态
circuit_by_format = key.circuit_breaker_by_format or {}
now = datetime.now(timezone.utc)
for fmt, circuit_data in circuit_by_format.items():
if circuit_data.get("open"):
state = cls._get_circuit_state_from_data(circuit_data, now)
if state == CircuitState.HALF_OPEN:
return True
return False
# ==================== 便捷方法 ====================
@classmethod
def get_health_score(
cls, key: ProviderAPIKey, api_format: Optional[str] = None
) -> float:
"""获取指定格式的健康度分数"""
if not api_format:
# 返回所有格式中的最低健康度
health_by_format = key.health_by_format or {}
if not health_by_format:
return 1.0
return min(
float(h.get("health_score") or 1.0) for h in health_by_format.values()
)
health_data = cls._get_health_data(key, api_format)
return float(health_data.get("health_score") or 1.0)
@classmethod
def is_any_circuit_open(cls, key: ProviderAPIKey) -> bool:
"""检查是否有任何格式的熔断器开启"""
circuit_by_format = key.circuit_breaker_by_format or {}
for circuit_data in circuit_by_format.values():
if circuit_data.get("open"):
return True
return False
# 全局健康监控器实例
health_monitor = HealthMonitor()
health_open_circuits.set(0)