2 Commits

Author SHA1 Message Date
fawney19
af476ff21e feat: enhance error logging and upstream response tracking for provider failures 2025-12-19 15:29:48 +08:00
fawney19
3bbc1c6b66 feat: add provider compatibility error detection for intelligent failover
- Introduce ProviderCompatibilityException for unsupported parameter/feature errors
- Add COMPATIBILITY_ERROR_PATTERNS to detect provider-specific limitations
- Implement _is_compatibility_error() method in ErrorClassifier
- Prioritize compatibility error checking before client error validation
- Remove 'max_tokens' from CLIENT_ERROR_PATTERNS as it can indicate compatibility issues
- Enable automatic failover when provider doesn't support requested features
- Improve error classification accuracy with pattern matching for common compatibility issues
2025-12-19 13:28:26 +08:00
6 changed files with 121 additions and 36 deletions

View File

@@ -639,6 +639,8 @@ class ChatHandlerBase(BaseMessageHandler, ABC):
logger.info(f" [{self.request_id}] 发送非流式请求: Provider={provider.name}, "
f"模型={model} -> {mapped_model or '无映射'}")
logger.debug(f" [{self.request_id}] 请求URL: {url}")
logger.debug(f" [{self.request_id}] 请求体stream字段: {provider_payload.get('stream', 'N/A')}")
# 创建 HTTP 客户端(支持代理配置)
from src.clients.http_client import HTTPClientPool
@@ -662,10 +664,32 @@ class ChatHandlerBase(BaseMessageHandler, ABC):
response_headers=response_headers,
)
elif resp.status_code >= 500:
raise ProviderNotAvailableException(f"提供商服务不可用: {provider.name}")
elif resp.status_code != 200:
# 记录响应体以便调试
error_body = ""
try:
error_body = resp.text[:1000]
logger.error(f" [{self.request_id}] 上游返回5xx错误: status={resp.status_code}, body={error_body[:500]}")
except Exception:
pass
raise ProviderNotAvailableException(
f"提供商返回错误: {provider.name}, 状态: {resp.status_code}"
f"提供商服务不可用: {provider.name}",
provider_name=str(provider.name),
upstream_status=resp.status_code,
upstream_response=error_body,
)
elif resp.status_code != 200:
# 记录非200响应以便调试
error_body = ""
try:
error_body = resp.text[:1000]
logger.warning(f" [{self.request_id}] 上游返回非200: status={resp.status_code}, body={error_body[:500]}")
except Exception:
pass
raise ProviderNotAvailableException(
f"提供商返回错误: {provider.name}, 状态: {resp.status_code}",
provider_name=str(provider.name),
upstream_status=resp.status_code,
upstream_response=error_body,
)
response_json = resp.json()

View File

@@ -188,12 +188,16 @@ class ProviderNotAvailableException(ProviderException):
message: str,
provider_name: Optional[str] = None,
request_metadata: Optional[Any] = None,
upstream_status: Optional[int] = None,
upstream_response: Optional[str] = None,
):
super().__init__(
message=message,
provider_name=provider_name,
request_metadata=request_metadata,
)
self.upstream_status = upstream_status
self.upstream_response = upstream_response
class ProviderTimeoutException(ProviderException):
@@ -442,6 +446,36 @@ class EmbeddedErrorException(ProviderException):
self.error_status = error_status
class ProviderCompatibilityException(ProviderException):
"""Provider 兼容性错误异常 - 应该触发故障转移
用于处理因 Provider 不支持某些参数或功能导致的错误。
这类错误不是用户请求本身的问题,换一个 Provider 可能就能成功,应该触发故障转移。
常见场景:
- Unsupported parameter不支持的参数
- Unsupported model不支持的模型
- Unsupported feature不支持的功能
"""
def __init__(
self,
message: str,
provider_name: Optional[str] = None,
status_code: int = 400,
upstream_error: Optional[str] = None,
request_metadata: Optional[Any] = None,
):
self.upstream_error = upstream_error
super().__init__(
message=message,
provider_name=provider_name,
request_metadata=request_metadata,
)
# 覆盖状态码为 400保持与上游一致
self.status_code = status_code
class UpstreamClientException(ProxyException):
"""上游返回的客户端错误异常 - HTTP 4xx 错误,不应该重试

View File

@@ -4,13 +4,10 @@
"""
from contextlib import asynccontextmanager
from pathlib import Path
import uvicorn
from fastapi import FastAPI, HTTPException, Request
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import FileResponse
from fastapi.staticfiles import StaticFiles
from src.api.admin import router as admin_router
from src.api.announcements import router as announcement_router
@@ -299,33 +296,6 @@ app.include_router(dashboard_router) # 仪表盘端点
app.include_router(public_router) # 公开API端点用户可查看提供商和模型
app.include_router(monitoring_router) # 监控端点
# 静态文件服务(前端构建产物)
# 检查前端构建目录是否存在
frontend_dist = Path(__file__).parent.parent / "frontend" / "dist"
if frontend_dist.exists():
# 挂载静态资源目录
app.mount("/assets", StaticFiles(directory=str(frontend_dist / "assets")), name="assets")
# SPA catch-all路由 - 必须放在最后
@app.get("/{full_path:path}")
async def serve_spa(request: Request, full_path: str):
"""
处理所有未匹配的GET请求返回index.html供前端路由处理
仅对非API路径生效
"""
# 如果是API路径不处理
if full_path in {"api", "v1"} or full_path.startswith(("api/", "v1/")):
raise HTTPException(status_code=404, detail="Not Found")
# 返回index.html让前端路由处理
index_file = frontend_dist / "index.html"
if index_file.exists():
return FileResponse(str(index_file))
else:
raise HTTPException(status_code=404, detail="Frontend not built")
else:
logger.warning("前端构建目录不存在,前端路由将无法使用")
def main():

View File

@@ -15,6 +15,7 @@ from src.core.enums import APIFormat
from src.core.exceptions import (
ConcurrencyLimitError,
ProviderAuthException,
ProviderCompatibilityException,
ProviderException,
ProviderNotAvailableException,
ProviderRateLimitException,
@@ -81,7 +82,9 @@ class ErrorClassifier:
"context_length_exceeded", # 上下文长度超限
"content_length_limit", # 请求内容长度超限 (Claude API)
"content_length_exceeds", # 内容长度超限变体 (AWS CodeWhisperer)
"max_tokens", # token 数超限
# 注意:移除了 "max_tokens",因为 max_tokens 相关错误可能是 Provider 兼容性问题
# 如 "Unsupported parameter: 'max_tokens' is not supported with this model"
# 这类错误应由 COMPATIBILITY_ERROR_PATTERNS 处理
"invalid_prompt", # 无效的提示词
"content too long", # 内容过长
"input is too long", # 输入过长 (AWS)
@@ -136,6 +139,19 @@ class ErrorClassifier:
"CONTENT_POLICY_VIOLATION",
)
# Provider 兼容性错误模式 - 这类错误应该触发故障转移
# 因为换一个 Provider 可能就能成功
COMPATIBILITY_ERROR_PATTERNS: Tuple[str, ...] = (
"unsupported parameter", # 不支持的参数
"unsupported model", # 不支持的模型
"unsupported feature", # 不支持的功能
"not supported with this model", # 此模型不支持
"model does not support", # 模型不支持
"parameter is not supported", # 参数不支持
"feature is not supported", # 功能不支持
"not available for this model", # 此模型不可用
)
def _parse_error_response(self, error_text: Optional[str]) -> Dict[str, Any]:
"""
解析错误响应为结构化数据
@@ -261,6 +277,25 @@ class ErrorClassifier:
search_text = f"{parsed['message']} {parsed['raw']}".lower()
return any(pattern.lower() in search_text for pattern in self.CLIENT_ERROR_PATTERNS)
def _is_compatibility_error(self, error_text: Optional[str]) -> bool:
"""
检测错误响应是否为 Provider 兼容性错误(应触发故障转移)
这类错误是因为 Provider 不支持某些参数或功能导致的,
换一个 Provider 可能就能成功。
Args:
error_text: 错误响应文本
Returns:
是否为兼容性错误
"""
if not error_text:
return False
search_text = error_text.lower()
return any(pattern.lower() in search_text for pattern in self.COMPATIBILITY_ERROR_PATTERNS)
def _extract_error_message(self, error_text: Optional[str]) -> Optional[str]:
"""
从错误响应中提取错误消息
@@ -425,6 +460,16 @@ class ErrorClassifier:
),
)
# 400 错误:先检查是否为 Provider 兼容性错误(应触发故障转移)
if status == 400 and self._is_compatibility_error(error_response_text):
logger.info(f"检测到 Provider 兼容性错误,将触发故障转移: {extracted_message}")
return ProviderCompatibilityException(
message=extracted_message or "Provider 不支持此请求",
provider_name=provider_name,
status_code=400,
upstream_error=error_response_text,
)
# 400 错误:检查是否为客户端请求错误(不应重试)
if status == 400 and self._is_client_error(error_response_text):
logger.info(f"检测到客户端请求错误,不进行重试: {extracted_message}")

View File

@@ -427,6 +427,9 @@ class FallbackOrchestrator:
)
# str(cause) 可能为空(如 httpx 超时异常),使用 repr() 作为备用
error_msg = str(cause) or repr(cause)
# 如果是 ProviderNotAvailableException附加上游响应
if hasattr(cause, "upstream_response") and cause.upstream_response:
error_msg = f"{error_msg} | 上游响应: {cause.upstream_response[:500]}"
RequestCandidateService.mark_candidate_failed(
db=self.db,
candidate_id=candidate_record_id,
@@ -439,6 +442,9 @@ class FallbackOrchestrator:
# 未知错误:记录失败并抛出
error_msg = str(cause) or repr(cause)
# 如果是 ProviderNotAvailableException附加上游响应
if hasattr(cause, "upstream_response") and cause.upstream_response:
error_msg = f"{error_msg} | 上游响应: {cause.upstream_response[:500]}"
RequestCandidateService.mark_candidate_failed(
db=self.db,
candidate_id=candidate_record_id,

View File

@@ -289,11 +289,17 @@ class RequestResult:
status_code = 500
error_type = "internal_error"
# 构建错误消息,包含上游响应信息
error_message = str(exception)
if isinstance(exception, ProviderNotAvailableException):
if exception.upstream_response:
error_message = f"{error_message} | 上游响应: {exception.upstream_response[:500]}"
return cls(
status=RequestStatus.FAILED,
metadata=metadata,
status_code=status_code,
error_message=str(exception),
error_message=error_message,
error_type=error_type,
response_time_ms=response_time_ms,
is_stream=is_stream,