Files
Aether/src/core/provider_health.py

206 lines
7.4 KiB
Python
Raw Normal View History

2025-12-10 20:52:44 +08:00
"""
提供商健康度管理
基于简单的失败计数和优先级调整
"""
import time
from collections import defaultdict
from datetime import datetime, timedelta
from typing import Dict, Optional
class ProviderHealthTracker:
    """Track per-provider request outcomes and derive a priority adjustment.

    Successes and failures are stored as timestamps inside a sliding window
    of ``failure_window`` seconds.  Once the number of in-window failures
    reaches ``failure_threshold``, every further failure lowers the
    provider's priority adjustment by one.  The adjustment is restored either
    gradually by successful requests or all at once when ``recovery_time``
    seconds have passed since the provider's most recent failure.

    Recovery is evaluated lazily whenever the adjustment is read, so a
    provider that stopped receiving traffic because it was penalised can
    still become eligible again.
    """

    # Minimum number of in-window successes before each further success
    # restores one step of a lowered priority.
    RECOVERY_SUCCESS_COUNT = 5
    # Providers whose adjustment is at or below this value are skipped.
    DISABLE_THRESHOLD = -3

    def __init__(
        self,
        failure_window: int = 300,  # 5-minute sliding window
        failure_threshold: int = 3,  # in-window failures before penalising
        recovery_time: int = 600,  # seconds without failure before a reset
    ):
        """Create a tracker.

        Args:
            failure_window: Length in seconds of the sliding window in which
                successes and failures are counted.
            failure_threshold: Number of in-window failures at which the
                provider's priority starts being lowered.
            recovery_time: Seconds that must elapse since the last failure
                before a negative adjustment is reset to zero.
        """
        self.failure_window = failure_window
        self.failure_threshold = failure_threshold
        self.recovery_time = recovery_time
        # Failure timestamps per provider (pruned to the sliding window).
        self.failures: Dict[str, list] = defaultdict(list)
        # Success timestamps per provider (pruned to the sliding window).
        self.successes: Dict[str, list] = defaultdict(list)
        # Current priority adjustment per provider (negative = penalised).
        self.priority_adjustments: Dict[str, int] = {}
        # Timestamp of the most recent failure per provider.  Kept separately
        # because ``failures`` is pruned after ``failure_window`` seconds,
        # which may be shorter than ``recovery_time``.
        self.last_failure_time: Dict[str, float] = {}

    def record_success(self, provider_name: str):
        """Record a successful request for ``provider_name``.

        When the provider has at least ``RECOVERY_SUCCESS_COUNT`` successes
        inside the window and its adjustment is negative, the adjustment is
        raised by one.
        """
        current_time = time.time()
        self.successes[provider_name].append(current_time)
        self._cleanup_old_records(provider_name, current_time)
        if len(self.successes[provider_name]) >= self.RECOVERY_SUCCESS_COUNT:
            if self.priority_adjustments.get(provider_name, 0) < 0:
                self.priority_adjustments[provider_name] += 1

    def record_failure(self, provider_name: str):
        """Record a failed request for ``provider_name``.

        When the number of in-window failures has reached
        ``failure_threshold``, the adjustment is lowered by one.
        """
        current_time = time.time()
        self.failures[provider_name].append(current_time)
        self.last_failure_time[provider_name] = current_time
        self._cleanup_old_records(provider_name, current_time)
        if len(self.failures[provider_name]) >= self.failure_threshold:
            current_adjustment = self.priority_adjustments.get(provider_name, 0)
            self.priority_adjustments[provider_name] = current_adjustment - 1

    def get_priority_adjustment(self, provider_name: str) -> int:
        """Return the priority adjustment for ``provider_name``.

        A negative value lowers the provider's priority; zero means no
        adjustment.  The time-based recovery check runs first, so a penalty
        older than ``recovery_time`` is cleared even if no request has been
        recorded for the provider in the meantime.
        """
        self._reset_if_recovered(provider_name, time.time())
        return self.priority_adjustments.get(provider_name, 0)

    def get_health_status(self, provider_name: str) -> Dict:
        """Return a snapshot of the provider's health.

        Returns:
            A dict with the keys ``provider``, ``recent_failures``,
            ``recent_successes``, ``failure_rate``, ``priority_adjustment``
            and ``status``.
        """
        current_time = time.time()
        self._cleanup_old_records(provider_name, current_time)
        # ``.get`` keeps a status query for an unknown provider from creating
        # empty defaultdict entries.
        recent_failures = len(self.failures.get(provider_name, ()))
        recent_successes = len(self.successes.get(provider_name, ()))
        total_requests = recent_failures + recent_successes
        failure_rate = recent_failures / total_requests if total_requests > 0 else 0
        return {
            "provider": provider_name,
            "recent_failures": recent_failures,
            "recent_successes": recent_successes,
            "failure_rate": failure_rate,
            "priority_adjustment": self.get_priority_adjustment(provider_name),
            "status": self._get_status_label(failure_rate, recent_failures),
        }

    def _cleanup_old_records(self, provider_name: str, current_time: float):
        """Drop timestamps outside the window and apply time-based recovery."""
        for records in (self.failures, self.successes):
            timestamps = records.get(provider_name)
            if timestamps:
                records[provider_name] = [
                    t for t in timestamps if current_time - t < self.failure_window
                ]
        self._reset_if_recovered(provider_name, current_time)

    def _reset_if_recovered(self, provider_name: str, current_time: float):
        """Clear a negative adjustment once ``recovery_time`` has elapsed.

        The elapsed time is measured from the provider's most recent failure.
        If no failure time is known at all, the penalty is cleared as well.
        """
        if self.priority_adjustments.get(provider_name, 0) >= 0:
            return
        last_failure = self.last_failure_time.get(provider_name)
        if last_failure is None:
            # Fall back to the newest in-window failure, if there is one.
            recent = self.failures.get(provider_name)
            last_failure = max(recent) if recent else None
        if last_failure is None or current_time - last_failure >= self.recovery_time:
            self.priority_adjustments[provider_name] = 0

    def _get_status_label(self, failure_rate: float, recent_failures: int) -> str:
        """Map the failure count and failure rate to a status label."""
        if recent_failures >= self.failure_threshold:
            return "degraded"
        elif failure_rate > 0.5:
            return "unstable"
        elif failure_rate > 0.1:
            return "warning"
        else:
            return "healthy"

    def should_use_provider(self, provider_name: str) -> bool:
        """Return whether the provider is currently eligible for selection.

        A provider is skipped while its adjustment is at or below
        ``DISABLE_THRESHOLD``.
        """
        return self.get_priority_adjustment(provider_name) > self.DISABLE_THRESHOLD

    def reset_provider_health(self, provider_name: str):
        """Reset all health state for the provider (manual admin action)."""
        self.failures[provider_name] = []
        self.successes[provider_name] = []
        self.priority_adjustments[provider_name] = 0
        self.last_failure_time.pop(provider_name, None)
class SimpleProviderSelector:
    """Choose a provider from base priority combined with tracked health."""

    def __init__(self, health_tracker: ProviderHealthTracker):
        self.health_tracker = health_tracker

    def select_provider(self, providers: list, specified_provider: Optional[str] = None):
        """Pick one provider from ``providers``.

        Args:
            providers: Candidate providers; each exposes ``name``,
                ``priority`` and ``id``.
            specified_provider: Name explicitly requested by the caller.

        Returns:
            The requested provider when a name is given (``None`` if no
            candidate has that name; health is ignored in this case).
            Otherwise the highest-ranked provider the tracker allows, falling
            back to the highest-ranked provider overall, or ``None`` when
            ``providers`` is empty.
        """
        # An explicit request bypasses the health checks entirely.
        if specified_provider:
            for candidate in providers:
                if candidate.name == specified_provider:
                    return candidate
            return None

        tracker = self.health_tracker

        def rank_key(candidate):
            # Effective priority first; the negated id breaks ties.
            effective = candidate.priority + tracker.get_priority_adjustment(candidate.name)
            return (effective, -candidate.id)

        ranked = list(providers)
        ranked.sort(key=rank_key, reverse=True)  # highest priority first
        if not ranked:
            return None

        usable = next(
            (candidate for candidate in ranked if tracker.should_use_provider(candidate.name)),
            None,
        )
        # Degraded mode: nothing is usable, so fall back to the top-ranked one.
        return ranked[0] if usable is None else usable

    def get_provider_rankings(self, providers: list) -> list:
        """Return ranking details per provider for debugging and monitoring.

        The result is a list of dicts ordered by effective priority, highest
        first.
        """

        def describe(candidate):
            health = self.health_tracker.get_health_status(candidate.name)
            delta = health["priority_adjustment"]
            return {
                "name": candidate.name,
                "base_priority": candidate.priority,
                "adjustment": delta,
                "effective_priority": candidate.priority + delta,
                "status": health["status"],
                "failure_rate": health["failure_rate"],
            }

        rows = [describe(candidate) for candidate in providers]
        return sorted(rows, key=lambda row: row["effective_priority"], reverse=True)