14 Commits

Author SHA1 Message Date
fawney19
03ad16ea8a fix: fix migration script errors on fresh installs and improve stats backfill logic
Migration script fixes:
- Remove AUTOCOMMIT mode and create the indexes within the same transaction
- Check each index individually and only create the ones that are missing
- Fix fresh installs where the AUTOCOMMIT connection could not see uncommitted tables (#46)

Stats backfill improvements:
- Check missing dates separately for StatsDaily and StatsDailyModel
- Backfill only the dates that are actually missing instead of a contiguous range
- Add failure counters and rollback error logging
2025-12-24 21:50:05 +08:00
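A minimal sketch of the missing-date backfill idea from this commit, assuming plain Python date sets; the helper name find_missing_dates is illustrative, and the real logic lives in the CleanupScheduler diff further down this page:
# Illustrative sketch: backfill only the business dates that have no aggregated
# row yet, rather than a contiguous range; each table gets its own missing set.
from datetime import date, timedelta

def find_missing_dates(existing, start, end):
    """Return the dates in [start, end] that are not in the existing set."""
    wanted = {start + timedelta(days=i) for i in range((end - start).days + 1)}
    return sorted(wanted - existing)

missing = find_missing_dates({date(2025, 12, 22)}, date(2025, 12, 21), date(2025, 12, 23))
assert missing == [date(2025, 12, 21), date(2025, 12, 23)]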
fawney19
2fa64b98e3 fix: deploy.sh: include Dockerfile.app.local in code change detection 2025-12-24 18:10:42 +08:00
fawney19
75d7e89cbb perf: add gunicorn --preload flag to reduce memory usage
Worker processes share read-only memory (code, constants), cutting memory usage by roughly 30-40%

Closes #44
2025-12-24 18:10:42 +08:00
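For reference, the same effect expressed as a gunicorn config file; this is only an illustrative sketch, since the project passes --preload on the command line (see the Dockerfile diffs below):
# gunicorn.conf.py (illustrative; the repo uses the --preload CLI flag instead).
# preload_app loads the application once in the master process before forking,
# so workers share the read-only pages (code, constants) via copy-on-write.
import os

preload_app = True
workers = int(os.getenv("GUNICORN_WORKERS", "2"))
worker_class = "uvicorn.workers.UvicornWorker"
bind = f"0.0.0.0:{os.getenv('PORT', '8000')}"
timeout = 120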
fawney19
d73a443484 fix: fix missing usage table on first run of migrate.sh (#43)
- Create the usage table composite indexes directly in the baseline migration
- Add table-existence checks to later migrations so the AUTOCOMMIT connection does not miss tables created inside an open transaction
2025-12-24 18:10:42 +08:00
Hwwwww-dev
15a9b88fc8 feat: enhance extract_cache_creation_tokens function to support three formats [#41] (#42)
- Updated the function to prioritize nested format, followed by flat new format, and finally old format for cache creation tokens.
- Added fallback logic for cases where the preferred formats return zero.
- Expanded unit tests to cover new format scenarios and ensure proper functionality across all formats.

Co-authored-by: heweimin <heweimin@retaileye.ai>
2025-12-24 01:31:45 +08:00
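A usage sketch of the function this commit extends; the expected totals mirror the unit tests in the diff at the end of this page, and the import path matches the test file's import:
# Illustrative usage of extract_cache_creation_tokens with the three supported formats.
from src.api.handlers.base.utils import extract_cache_creation_tokens

nested = {"cache_creation": {"ephemeral_5m_input_tokens": 456, "ephemeral_1h_input_tokens": 100}}
flat = {"claude_cache_creation_5_m_tokens": 100, "claude_cache_creation_1_h_tokens": 200}
old = {"cache_creation_input_tokens": 549}

assert extract_cache_creation_tokens(nested) == 556  # nested format has top priority
assert extract_cache_creation_tokens(flat) == 300    # flat new format comes second
assert extract_cache_creation_tokens(old) == 549     # legacy field is the final fallback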
fawney19
03eb7203ec fix(api): sync chat_handler_base to use aiter_bytes for automatic decompression 2025-12-24 01:13:35 +08:00
hank9999
e38cd6819b fix(api): optimize the byte stream iterator to support automatic gzip decompression (#39) 2025-12-24 01:11:35 +08:00
fawney19
d44cfaddf6 fix: rename variable to avoid shadowing in model mapping cache stats
The loop-local variable provider_model_mappings shared its name with the outer list, causing the outer list to be overwritten with None and raising an AttributeError
2025-12-23 00:38:37 +08:00
fawney19
65225710a8 refactor: use ConcurrencyDefaults for CACHE_RESERVATION_RATIO constant 2025-12-23 00:34:18 +08:00
fawney19
d7f5b16359 fix: rebuild app image when migration files change
deploy.sh was only running alembic upgrade on the old container when
migration files changed, but the migration files are baked into the
Docker image. Now it rebuilds the app image when migrations change.
2025-12-23 00:23:22 +08:00
fawney19
7185818724 fix: remove index_exists check to avoid transaction conflict in migration
- Remove index_exists function that used op.get_bind() within transaction
- Use IF NOT EXISTS / IF EXISTS SQL syntax instead
- Fixes CREATE INDEX CONCURRENTLY error in Docker migration
2025-12-23 00:21:03 +08:00
fawney19
868f3349e5 fix: use AUTOCOMMIT mode for CREATE INDEX CONCURRENTLY in migration
PostgreSQL does not allow CREATE INDEX CONCURRENTLY inside a transaction block;
work around this by creating a separate connection with the AUTOCOMMIT isolation level.
2025-12-23 00:18:11 +08:00
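A minimal Alembic sketch of the approach these two commits describe (IF NOT EXISTS plus a dedicated AUTOCOMMIT connection); the later commits above replace this pattern because the AUTOCOMMIT connection cannot see tables created in the migration's still-open transaction:
# Illustrative sketch only: CREATE INDEX CONCURRENTLY cannot run inside a
# transaction block, so a separate AUTOCOMMIT connection is used for it.
from alembic import op
from sqlalchemy import text

def upgrade() -> None:
    bind = op.get_bind()
    with bind.engine.connect() as conn:
        autocommit_conn = conn.execution_options(isolation_level="AUTOCOMMIT")
        autocommit_conn.execute(text(
            "CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_usage_user_created "
            "ON usage (user_id, created_at)"
        ))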
fawney19
d7384e69d9 fix: improve code quality and add type safety for Key updates
- Replace f-string logging with lazy formatting in keys.py (lines 256, 265)
- Add EndpointAPIKeyUpdate type interface for frontend type safety
- Use typed EndpointAPIKeyUpdate instead of any in KeyFormDialog.vue
2025-12-23 00:11:10 +08:00
fawney19
1d5c378343 feat: add TTFB timeout detection and improve stream handling
- Add stream first byte timeout (TTFB) detection to trigger failover
  when a provider responds too slowly (configurable via STREAM_FIRST_BYTE_TIMEOUT)
- Add rate limit fail-open/fail-close strategy configuration
- Improve exception handling in stream prefetch with proper error classification
- Refactor UsageService with shared _prepare_usage_record method
- Add batch deletion for old usage records to avoid long transaction locks
- Update CLI adapters to use proper User-Agent headers for each CLI client
- Add composite indexes migration for usage table query optimization
- Fix streaming status display in frontend to show TTFB during streaming
- Remove sensitive JWT secret logging in auth service
2025-12-22 23:44:42 +08:00
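A rough usage sketch of the TTFB failover path added by this commit; the handler function below is illustrative, while read_first_chunk_with_ttfb_timeout, config.stream_first_byte_timeout and ProviderTimeoutException all come from the diffs later on this page:
# Illustrative sketch: apply the TTFB timeout to the first chunk of a stream and
# let ProviderTimeoutException propagate so failover can pick another provider.
from src.config.settings import config
from src.core.exceptions import ProviderTimeoutException
from src.utils.timeout import read_first_chunk_with_ttfb_timeout

async def read_stream(byte_iterator, request_id: str, provider_name: str):
    try:
        first_chunk, rest = await read_first_chunk_with_ttfb_timeout(
            byte_iterator,
            timeout=config.stream_first_byte_timeout,
            request_id=request_id,
            provider_name=provider_name,
        )
    except ProviderTimeoutException:
        raise  # caller retries with the next provider (failover)
    yield first_chunk
    async for chunk in rest:
        yield chunk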
29 changed files with 917 additions and 336 deletions

View File

@@ -105,7 +105,7 @@ RUN printf '%s\n' \
'stderr_logfile=/var/log/nginx/error.log' \
'' \
'[program:app]' \
'command=gunicorn src.main:app -w %(ENV_GUNICORN_WORKERS)s -k uvicorn.workers.UvicornWorker --bind 0.0.0.0:%(ENV_PORT)s --timeout 120 --access-logfile - --error-logfile - --log-level info' \
'command=gunicorn src.main:app --preload -w %(ENV_GUNICORN_WORKERS)s -k uvicorn.workers.UvicornWorker --bind 0.0.0.0:%(ENV_PORT)s --timeout 120 --access-logfile - --error-logfile - --log-level info' \
'directory=/app' \
'autostart=true' \
'autorestart=true' \

View File

@@ -106,7 +106,7 @@ RUN printf '%s\n' \
'stderr_logfile=/var/log/nginx/error.log' \
'' \
'[program:app]' \
'command=gunicorn src.main:app -w %(ENV_GUNICORN_WORKERS)s -k uvicorn.workers.UvicornWorker --bind 0.0.0.0:%(ENV_PORT)s --timeout 120 --access-logfile - --error-logfile - --log-level info' \
'command=gunicorn src.main:app --preload -w %(ENV_GUNICORN_WORKERS)s -k uvicorn.workers.UvicornWorker --bind 0.0.0.0:%(ENV_PORT)s --timeout 120 --access-logfile - --error-logfile - --log-level info' \
'directory=/app' \
'autostart=true' \
'autorestart=true' \

View File

@@ -394,6 +394,10 @@ def upgrade() -> None:
index=True,
),
)
# usage 表复合索引(优化常见查询)
op.create_index("idx_usage_user_created", "usage", ["user_id", "created_at"])
op.create_index("idx_usage_apikey_created", "usage", ["api_key_id", "created_at"])
op.create_index("idx_usage_provider_model_created", "usage", ["provider", "model", "created_at"])
# ==================== user_quotas ====================
op.create_table(

View File

@@ -0,0 +1,65 @@
"""add usage table composite indexes for query optimization
Revision ID: b2c3d4e5f6g7
Revises: a1b2c3d4e5f6
Create Date: 2025-12-20 15:00:00.000000+00:00
"""
from alembic import op
from sqlalchemy import text
# revision identifiers, used by Alembic.
revision = 'b2c3d4e5f6g7'
down_revision = 'a1b2c3d4e5f6'
branch_labels = None
depends_on = None
def upgrade() -> None:
"""为 usage 表添加复合索引以优化常见查询
注意:这些索引已经在 baseline 迁移中创建。
此迁移仅用于从旧版本升级的场景,新安装会跳过。
"""
conn = op.get_bind()
# 检查 usage 表是否存在
result = conn.execute(text(
"SELECT EXISTS (SELECT FROM information_schema.tables WHERE table_name = 'usage')"
))
if not result.scalar():
# 表不存在,跳过
return
# 定义需要创建的索引
indexes = [
("idx_usage_user_created", "ON usage (user_id, created_at)"),
("idx_usage_apikey_created", "ON usage (api_key_id, created_at)"),
("idx_usage_provider_model_created", "ON usage (provider, model, created_at)"),
]
# 分别检查并创建每个索引
for index_name, index_def in indexes:
result = conn.execute(text(
f"SELECT EXISTS (SELECT 1 FROM pg_indexes WHERE indexname = '{index_name}')"
))
if result.scalar():
continue # 索引已存在,跳过
conn.execute(text(f"CREATE INDEX {index_name} {index_def}"))
def downgrade() -> None:
"""删除复合索引"""
conn = op.get_bind()
# 使用 IF EXISTS 避免索引不存在时报错
conn.execute(text(
"DROP INDEX IF EXISTS idx_usage_provider_model_created"
))
conn.execute(text(
"DROP INDEX IF EXISTS idx_usage_apikey_created"
))
conn.execute(text(
"DROP INDEX IF EXISTS idx_usage_user_created"
))

View File

@@ -26,10 +26,13 @@ calc_deps_hash() {
cat pyproject.toml frontend/package.json frontend/package-lock.json Dockerfile.base.local 2>/dev/null | md5sum | cut -d' ' -f1
}
# 计算代码文件的哈希值
# 计算代码文件的哈希值(包含 Dockerfile.app.local
calc_code_hash() {
find src -type f -name "*.py" 2>/dev/null | sort | xargs cat 2>/dev/null | md5sum | cut -d' ' -f1
find frontend/src -type f \( -name "*.vue" -o -name "*.ts" -o -name "*.tsx" -o -name "*.js" \) 2>/dev/null | sort | xargs cat 2>/dev/null | md5sum | cut -d' ' -f1
{
cat Dockerfile.app.local 2>/dev/null
find src -type f -name "*.py" 2>/dev/null | sort | xargs cat 2>/dev/null
find frontend/src -type f \( -name "*.vue" -o -name "*.ts" -o -name "*.tsx" -o -name "*.js" \) 2>/dev/null | sort | xargs cat 2>/dev/null
} | md5sum | cut -d' ' -f1
}
# 计算迁移文件的哈希值
@@ -179,7 +182,13 @@ else
echo ">>> Dependencies unchanged."
fi
# 检查代码是否变化,或者 base 重建了app 依赖 base
# 检查代码或迁移是否变化,或者 base 重建了app 依赖 base
# 注意:迁移文件打包在镜像中,所以迁移变化也需要重建 app 镜像
MIGRATION_CHANGED=false
if check_migration_changed; then
MIGRATION_CHANGED=true
fi
if ! docker image inspect aether-app:latest >/dev/null 2>&1; then
echo ">>> App image not found, building..."
build_app
@@ -192,6 +201,10 @@ elif check_code_changed; then
echo ">>> Code changed, rebuilding app image..."
build_app
NEED_RESTART=true
elif [ "$MIGRATION_CHANGED" = true ]; then
echo ">>> Migration files changed, rebuilding app image..."
build_app
NEED_RESTART=true
else
echo ">>> Code unchanged."
fi
@@ -204,9 +217,9 @@ else
echo ">>> No changes detected, skipping restart."
fi
# 检查迁移变化
if check_migration_changed; then
echo ">>> Migration files changed, running database migration..."
# 检查迁移变化(如果前面已经检测到变化并重建了镜像,这里直接运行迁移)
if [ "$MIGRATION_CHANGED" = true ]; then
echo ">>> Running database migration..."
sleep 3
run_migration
else

View File

@@ -110,6 +110,24 @@ export interface EndpointAPIKey {
request_results_window?: Array<{ ts: number; ok: boolean }> // 请求结果滑动窗口
}
export interface EndpointAPIKeyUpdate {
name?: string
api_key?: string // 仅在需要更新时提供
rate_multiplier?: number
internal_priority?: number
global_priority?: number | null
max_concurrent?: number | null // null 表示切换为自适应模式
rate_limit?: number
daily_limit?: number
monthly_limit?: number
allowed_models?: string[] | null
capabilities?: Record<string, boolean> | null
cache_ttl_minutes?: number
max_probe_interval_minutes?: number
note?: string
is_active?: boolean
}
export interface EndpointHealthDetail {
api_format: string
health_score: number

View File

@@ -260,6 +260,7 @@ import {
updateEndpointKey,
getAllCapabilities,
type EndpointAPIKey,
type EndpointAPIKeyUpdate,
type ProviderEndpoint,
type CapabilityDefinition
} from '@/api/endpoints'
@@ -386,10 +387,11 @@ function loadKeyData() {
api_key: '',
rate_multiplier: props.editingKey.rate_multiplier || 1.0,
internal_priority: props.editingKey.internal_priority ?? 50,
max_concurrent: props.editingKey.max_concurrent || undefined,
rate_limit: props.editingKey.rate_limit || undefined,
daily_limit: props.editingKey.daily_limit || undefined,
monthly_limit: props.editingKey.monthly_limit || undefined,
// 保留原始的 null/undefined 状态null 表示自适应模式
max_concurrent: props.editingKey.max_concurrent ?? undefined,
rate_limit: props.editingKey.rate_limit ?? undefined,
daily_limit: props.editingKey.daily_limit ?? undefined,
monthly_limit: props.editingKey.monthly_limit ?? undefined,
cache_ttl_minutes: props.editingKey.cache_ttl_minutes ?? 5,
max_probe_interval_minutes: props.editingKey.max_probe_interval_minutes ?? 32,
note: props.editingKey.note || '',
@@ -439,12 +441,17 @@ async function handleSave() {
saving.value = true
try {
if (props.editingKey) {
// 更新
const updateData: any = {
// 更新模式
// 注意max_concurrent 需要显式发送 null 来切换到自适应模式
// undefined 会在 JSON 中被忽略,所以用 null 表示"清空/自适应"
const updateData: EndpointAPIKeyUpdate = {
name: form.value.name,
rate_multiplier: form.value.rate_multiplier,
internal_priority: form.value.internal_priority,
max_concurrent: form.value.max_concurrent,
// 显式使用 null 表示自适应模式,这样后端能区分"未提供"和"设置为 null"
// 注意:只有 max_concurrent 需要这种处理,因为它有"自适应模式"的概念
// 其他限制字段rate_limit 等)不支持"清空"操作undefined 会被 JSON 忽略即不更新
max_concurrent: form.value.max_concurrent === undefined ? null : form.value.max_concurrent,
rate_limit: form.value.rate_limit,
daily_limit: form.value.daily_limit,
monthly_limit: form.value.monthly_limit,

View File

@@ -483,9 +483,9 @@
<span
v-if="key.max_concurrent || key.is_adaptive"
class="text-muted-foreground"
:title="key.is_adaptive ? `自适应并发限制(学习值: ${key.learned_max_concurrent ?? '未学习'}` : '固定并发限制'"
:title="key.is_adaptive ? `自适应并发限制(学习值: ${key.learned_max_concurrent ?? '未学习'}` : `固定并发限制: ${key.max_concurrent}`"
>
{{ key.is_adaptive ? '自适应' : '固定' }}并发: {{ key.learned_max_concurrent || key.max_concurrent || 3 }}
{{ key.is_adaptive ? '自适应' : '固定' }}并发: {{ key.is_adaptive ? (key.learned_max_concurrent ?? '学习中') : key.max_concurrent }}
</span>
</div>
</div>

View File

@@ -366,14 +366,34 @@
</div>
</TableCell>
<TableCell class="text-right py-4 w-[70px]">
<!-- pending 状态只显示增长的总时间 -->
<div
v-if="record.status === 'pending' || record.status === 'streaming'"
v-if="record.status === 'pending'"
class="flex flex-col items-end text-xs gap-0.5"
>
<span class="text-muted-foreground">-</span>
<span class="text-primary tabular-nums">
{{ getElapsedTime(record) }}
</span>
</div>
<!-- streaming 状态首字固定 + 总时间增长 -->
<div
v-else-if="record.status === 'streaming'"
class="flex flex-col items-end text-xs gap-0.5"
>
<span
v-if="record.first_byte_time_ms != null"
class="tabular-nums"
>{{ (record.first_byte_time_ms / 1000).toFixed(2) }}s</span>
<span
v-else
class="text-muted-foreground"
>-</span>
<span class="text-primary tabular-nums">
{{ getElapsedTime(record) }}
</span>
</div>
<!-- 已完成状态首字 + 总耗时 -->
<div
v-else-if="record.response_time_ms != null"
class="flex flex-col items-end text-xs gap-0.5"

View File

@@ -246,6 +246,15 @@ class AdminUpdateEndpointKeyAdapter(AdminApiAdapter):
if "api_key" in update_data:
update_data["api_key"] = crypto_service.encrypt(update_data["api_key"])
# 特殊处理 max_concurrent需要区分"未提供"和"显式设置为 null"
# 当 max_concurrent 被显式设置时(在 model_fields_set 中),即使值为 None 也应该更新
if "max_concurrent" in self.key_data.model_fields_set:
update_data["max_concurrent"] = self.key_data.max_concurrent
# 切换到自适应模式时,清空学习到的并发限制,让系统重新学习
if self.key_data.max_concurrent is None:
update_data["learned_max_concurrent"] = None
logger.info("Key %s 切换为自适应并发模式", self.key_id)
for field, value in update_data.items():
setattr(key, field, value)
key.updated_at = datetime.now(timezone.utc)
@@ -253,7 +262,7 @@ class AdminUpdateEndpointKeyAdapter(AdminApiAdapter):
db.commit()
db.refresh(key)
logger.info(f"[OK] 更新 Key: ID={self.key_id}, Updates={list(update_data.keys())}")
logger.info("[OK] 更新 Key: ID=%s, Updates=%s", self.key_id, list(update_data.keys()))
try:
decrypted_key = crypto_service.decrypt(key.api_key)

View File

@@ -947,7 +947,7 @@ class AdminClearProviderCacheAdapter(AdminApiAdapter):
class AdminCacheConfigAdapter(AdminApiAdapter):
async def handle(self, context: ApiRequestContext) -> Dict[str, Any]: # type: ignore[override]
from src.services.cache.affinity_manager import CacheAffinityManager
from src.services.cache.aware_scheduler import CacheAwareScheduler
from src.config.constants import ConcurrencyDefaults
from src.services.rate_limit.adaptive_reservation import get_adaptive_reservation_manager
# 获取动态预留管理器的配置
@@ -958,7 +958,7 @@ class AdminCacheConfigAdapter(AdminApiAdapter):
"status": "ok",
"data": {
"cache_ttl_seconds": CacheAffinityManager.DEFAULT_CACHE_TTL,
"cache_reservation_ratio": CacheAwareScheduler.CACHE_RESERVATION_RATIO,
"cache_reservation_ratio": ConcurrencyDefaults.CACHE_RESERVATION_RATIO,
"dynamic_reservation": {
"enabled": True,
"config": reservation_stats["config"],
@@ -981,7 +981,7 @@ class AdminCacheConfigAdapter(AdminApiAdapter):
context.add_audit_metadata(
action="cache_config",
cache_ttl_seconds=CacheAffinityManager.DEFAULT_CACHE_TTL,
cache_reservation_ratio=CacheAwareScheduler.CACHE_RESERVATION_RATIO,
cache_reservation_ratio=ConcurrencyDefaults.CACHE_RESERVATION_RATIO,
dynamic_reservation_enabled=True,
)
return response
@@ -1236,7 +1236,7 @@ class AdminModelMappingCacheStatsAdapter(AdminApiAdapter):
try:
cached_data = json.loads(cached_str)
provider_model_name = cached_data.get("provider_model_name")
provider_model_mappings = cached_data.get("provider_model_mappings", [])
cached_model_mappings = cached_data.get("provider_model_mappings", [])
# 获取 Provider 和 GlobalModel 信息
provider = provider_map.get(provider_id)
@@ -1245,8 +1245,8 @@ class AdminModelMappingCacheStatsAdapter(AdminApiAdapter):
if provider and global_model:
# 提取映射名称
mapping_names = []
if provider_model_mappings:
for mapping_entry in provider_model_mappings:
if cached_model_mappings:
for mapping_entry in cached_model_mappings:
if isinstance(mapping_entry, dict) and mapping_entry.get("name"):
mapping_names.append(mapping_entry["name"])

View File

@@ -376,6 +376,9 @@ class BaseMessageHandler:
使用 asyncio 后台任务执行数据库更新,避免阻塞流式传输
注意TTFB首字节时间由 StreamContext.record_first_byte_time() 记录,
并在最终 record_success 时传递到数据库,避免重复记录导致数据不一致。
Args:
request_id: 请求 ID如果不传则使用 self.request_id
"""
@@ -407,6 +410,9 @@ class BaseMessageHandler:
使用 asyncio 后台任务执行数据库更新,避免阻塞流式传输
注意TTFB首字节时间由 StreamContext.record_first_byte_time() 记录,
并在最终 record_success 时传递到数据库,避免重复记录导致数据不一致。
Args:
ctx: 流式上下文,包含 provider_name 和 mapped_model
"""

View File

@@ -484,9 +484,8 @@ class ChatHandlerBase(BaseMessageHandler, ABC):
stream_response.raise_for_status()
# 使用字节流迭代器(避免 aiter_lines 的性能问题)
# aiter_raw() 返回原始数据块,无缓冲,实现真正的流式传输
byte_iterator = stream_response.aiter_raw()
# 使用字节流迭代器(避免 aiter_lines 的性能问题, aiter_bytes 会自动解压 gzip/deflate
byte_iterator = stream_response.aiter_bytes()
# 预读检测嵌套错误
prefetched_chunks = await stream_processor.prefetch_and_check_error(

View File

@@ -57,8 +57,10 @@ from src.models.database import (
ProviderEndpoint,
User,
)
from src.config.settings import config
from src.services.provider.transport import build_provider_url
from src.utils.sse_parser import SSEEventParser
from src.utils.timeout import read_first_chunk_with_ttfb_timeout
class CliMessageHandlerBase(BaseMessageHandler):
@@ -474,8 +476,8 @@ class CliMessageHandlerBase(BaseMessageHandler):
stream_response.raise_for_status()
# 使用字节流迭代器(避免 aiter_lines 的性能问题)
byte_iterator = stream_response.aiter_raw()
# 使用字节流迭代器(避免 aiter_lines 的性能问题, aiter_bytes 会自动解压 gzip/deflate
byte_iterator = stream_response.aiter_bytes()
# 预读第一个数据块检测嵌套错误HTTP 200 但响应体包含错误)
prefetched_chunks = await self._prefetch_and_check_embedded_error(
@@ -529,7 +531,7 @@ class CliMessageHandlerBase(BaseMessageHandler):
# 检查是否需要格式转换
needs_conversion = self._needs_format_conversion(ctx)
async for chunk in stream_response.aiter_raw():
async for chunk in stream_response.aiter_bytes():
# 在第一次输出数据前更新状态为 streaming
if not streaming_status_updated:
self._update_usage_to_streaming_with_ctx(ctx)
@@ -672,6 +674,8 @@ class CliMessageHandlerBase(BaseMessageHandler):
同时检测 HTML 响应(通常是 base_url 配置错误导致返回网页)。
首次读取时会应用 TTFB首字节超时检测超时则触发故障转移。
Args:
byte_iterator: 字节流迭代器
provider: Provider 对象
@@ -684,6 +688,7 @@ class CliMessageHandlerBase(BaseMessageHandler):
Raises:
EmbeddedErrorException: 如果检测到嵌套错误
ProviderNotAvailableException: 如果检测到 HTML 响应(配置错误)
ProviderTimeoutException: 如果首字节超时TTFB timeout
"""
prefetched_chunks: list = []
max_prefetch_lines = 5 # 最多预读5行来检测错误
@@ -704,7 +709,19 @@ class CliMessageHandlerBase(BaseMessageHandler):
else:
provider_parser = self.parser
async for chunk in byte_iterator:
# 使用共享的 TTFB 超时函数读取首字节
ttfb_timeout = config.stream_first_byte_timeout
first_chunk, aiter = await read_first_chunk_with_ttfb_timeout(
byte_iterator,
timeout=ttfb_timeout,
request_id=self.request_id,
provider_name=str(provider.name),
)
prefetched_chunks.append(first_chunk)
buffer += first_chunk
# 继续读取剩余的预读数据
async for chunk in aiter:
prefetched_chunks.append(chunk)
buffer += chunk
@@ -785,12 +802,21 @@ class CliMessageHandlerBase(BaseMessageHandler):
if should_stop or line_count >= max_prefetch_lines:
break
except EmbeddedErrorException:
# 重新抛出嵌套错误
except (EmbeddedErrorException, ProviderTimeoutException, ProviderNotAvailableException):
# 重新抛出可重试的 Provider 异常,触发故障转移
raise
except (OSError, IOError) as e:
# 网络 I/O 异常:记录警告,可能需要重试
logger.warning(
f" [{self.request_id}] 预读流时发生网络异常: {type(e).__name__}: {e}"
)
except Exception as e:
# 其他异常(如网络错误)在预读阶段发生,记录日志但不中断
logger.debug(f" [{self.request_id}] 预读流时发生异常: {e}")
# 未预期的严重异常:记录错误并重新抛出,避免掩盖问题
logger.error(
f" [{self.request_id}] 预读流时发生严重异常: {type(e).__name__}: {e}",
exc_info=True
)
raise
return prefetched_chunks

View File

@@ -25,10 +25,12 @@ from src.api.handlers.base.content_extractors import (
from src.api.handlers.base.parsers import get_parser_for_format
from src.api.handlers.base.response_parser import ResponseParser
from src.api.handlers.base.stream_context import StreamContext
from src.core.exceptions import EmbeddedErrorException
from src.config.settings import config
from src.core.exceptions import EmbeddedErrorException, ProviderTimeoutException
from src.core.logger import logger
from src.models.database import Provider, ProviderEndpoint
from src.utils.sse_parser import SSEEventParser
from src.utils.timeout import read_first_chunk_with_ttfb_timeout
@dataclass
@@ -170,6 +172,8 @@ class StreamProcessor:
某些 Provider如 Gemini可能返回 HTTP 200但在响应体中包含错误信息。
这种情况需要在流开始输出之前检测,以便触发重试逻辑。
首次读取时会应用 TTFB首字节超时检测超时则触发故障转移。
Args:
byte_iterator: 字节流迭代器
provider: Provider 对象
@@ -182,6 +186,7 @@ class StreamProcessor:
Raises:
EmbeddedErrorException: 如果检测到嵌套错误
ProviderTimeoutException: 如果首字节超时TTFB timeout
"""
prefetched_chunks: list = []
parser = self.get_parser_for_provider(ctx)
@@ -192,7 +197,19 @@ class StreamProcessor:
decoder = codecs.getincrementaldecoder("utf-8")(errors="replace")
try:
async for chunk in byte_iterator:
# 使用共享的 TTFB 超时函数读取首字节
ttfb_timeout = config.stream_first_byte_timeout
first_chunk, aiter = await read_first_chunk_with_ttfb_timeout(
byte_iterator,
timeout=ttfb_timeout,
request_id=self.request_id,
provider_name=str(provider.name),
)
prefetched_chunks.append(first_chunk)
buffer += first_chunk
# 继续读取剩余的预读数据
async for chunk in aiter:
prefetched_chunks.append(chunk)
buffer += chunk
@@ -262,10 +279,21 @@ class StreamProcessor:
if should_stop or line_count >= max_prefetch_lines:
break
except EmbeddedErrorException:
except (EmbeddedErrorException, ProviderTimeoutException):
# 重新抛出可重试的 Provider 异常,触发故障转移
raise
except (OSError, IOError) as e:
# 网络 I/O 异常:记录警告,可能需要重试
logger.warning(
f" [{self.request_id}] 预读流时发生网络异常: {type(e).__name__}: {e}"
)
except Exception as e:
logger.debug(f" [{self.request_id}] 预读流时发生异常: {e}")
# 未预期的严重异常:记录错误并重新抛出,避免掩盖问题
logger.error(
f" [{self.request_id}] 预读流时发生严重异常: {type(e).__name__}: {e}",
exc_info=True
)
raise
return prefetched_chunks

View File

@@ -4,17 +4,28 @@ Handler 基础工具函数
from typing import Any, Dict, Optional
from src.core.logger import logger
def extract_cache_creation_tokens(usage: Dict[str, Any]) -> int:
"""
提取缓存创建 tokens兼容新旧格式)
提取缓存创建 tokens兼容三种格式)
Claude API 在不同版本中使用了不同的字段名来表示缓存创建 tokens
- 新格式2024年后使用 claude_cache_creation_5_m_tokens 和
claude_cache_creation_1_h_tokens 分别表示 5 分钟和 1 小时缓存
- 旧格式:使用 cache_creation_input_tokens 表示总的缓存创建 tokens
根据 Anthropic API 文档,支持三种格式(按优先级)
此函数自动检测并适配两种格式优先使用新格式。
1. **嵌套格式优先级最高)**
usage.cache_creation.ephemeral_5m_input_tokens
usage.cache_creation.ephemeral_1h_input_tokens
2. **扁平新格式(优先级第二)**
usage.claude_cache_creation_5_m_tokens
usage.claude_cache_creation_1_h_tokens
3. **旧格式(优先级第三)**
usage.cache_creation_input_tokens
优先使用嵌套格式,如果嵌套格式字段存在但值为 0则智能 fallback 到旧格式。
扁平格式和嵌套格式互斥,按顺序检查。
Args:
usage: API 响应中的 usage 字典
@@ -22,20 +33,63 @@ def extract_cache_creation_tokens(usage: Dict[str, Any]) -> int:
Returns:
缓存创建 tokens 总数
"""
# 检查新格式字段是否存在(而非值是否为 0
# 如果字段存在,即使值为 0 也是合法的,不应 fallback 到旧格式
has_new_format = (
# 1. 检查嵌套格式(最新格式
cache_creation = usage.get("cache_creation")
if isinstance(cache_creation, dict):
cache_5m = int(cache_creation.get("ephemeral_5m_input_tokens", 0))
cache_1h = int(cache_creation.get("ephemeral_1h_input_tokens", 0))
total = cache_5m + cache_1h
if total > 0:
logger.debug(
f"Using nested cache_creation: 5m={cache_5m}, 1h={cache_1h}, total={total}"
)
return total
# 嵌套格式存在但为 0fallback 到旧格式
old_format = int(usage.get("cache_creation_input_tokens", 0))
if old_format > 0:
logger.debug(
f"Nested cache_creation is 0, using old format: {old_format}"
)
return old_format
# 都是 0返回 0
return 0
# 2. 检查扁平新格式
has_flat_format = (
"claude_cache_creation_5_m_tokens" in usage
or "claude_cache_creation_1_h_tokens" in usage
)
if has_new_format:
cache_5m = usage.get("claude_cache_creation_5_m_tokens", 0)
cache_1h = usage.get("claude_cache_creation_1_h_tokens", 0)
return int(cache_5m) + int(cache_1h)
if has_flat_format:
cache_5m = int(usage.get("claude_cache_creation_5_m_tokens", 0))
cache_1h = int(usage.get("claude_cache_creation_1_h_tokens", 0))
total = cache_5m + cache_1h
# 回退到旧格式
return int(usage.get("cache_creation_input_tokens", 0))
if total > 0:
logger.debug(
f"Using flat new format: 5m={cache_5m}, 1h={cache_1h}, total={total}"
)
return total
# 扁平格式存在但为 0fallback 到旧格式
old_format = int(usage.get("cache_creation_input_tokens", 0))
if old_format > 0:
logger.debug(
f"Flat cache_creation is 0, using old format: {old_format}"
)
return old_format
# 都是 0返回 0
return 0
# 3. 回退到旧格式
old_format = int(usage.get("cache_creation_input_tokens", 0))
if old_format > 0:
logger.debug(f"Using old format: cache_creation_input_tokens={old_format}")
return old_format
def build_sse_headers(extra_headers: Optional[Dict[str, str]] = None) -> Dict[str, str]:

View File

@@ -115,7 +115,7 @@ class ClaudeCliAdapter(CliAdapterBase):
) -> Tuple[list, Optional[str]]:
"""查询 Claude API 支持的模型列表(带 CLI User-Agent"""
# 复用 ClaudeChatAdapter 的实现,添加 CLI User-Agent
cli_headers = {"User-Agent": config.internal_user_agent_claude}
cli_headers = {"User-Agent": config.internal_user_agent_claude_cli}
if extra_headers:
cli_headers.update(extra_headers)
models, error = await ClaudeChatAdapter.fetch_models(

View File

@@ -112,7 +112,7 @@ class GeminiCliAdapter(CliAdapterBase):
) -> Tuple[list, Optional[str]]:
"""查询 Gemini API 支持的模型列表(带 CLI User-Agent"""
# 复用 GeminiChatAdapter 的实现,添加 CLI User-Agent
cli_headers = {"User-Agent": config.internal_user_agent_gemini}
cli_headers = {"User-Agent": config.internal_user_agent_gemini_cli}
if extra_headers:
cli_headers.update(extra_headers)
models, error = await GeminiChatAdapter.fetch_models(

View File

@@ -57,7 +57,7 @@ class OpenAICliAdapter(CliAdapterBase):
) -> Tuple[list, Optional[str]]:
"""查询 OpenAI 兼容 API 支持的模型列表(带 CLI User-Agent"""
# 复用 OpenAIChatAdapter 的实现,添加 CLI User-Agent
cli_headers = {"User-Agent": config.internal_user_agent_openai}
cli_headers = {"User-Agent": config.internal_user_agent_openai_cli}
if extra_headers:
cli_headers.update(extra_headers)
models, error = await OpenAIChatAdapter.fetch_models(

View File

@@ -77,7 +77,10 @@ class ConcurrencyDefaults:
MAX_CONCURRENT_LIMIT = 200
# 最小并发限制下限
MIN_CONCURRENT_LIMIT = 1
# 设置为 3 而不是 1因为预留机制10%预留给缓存用户)会导致
# 当 learned_max_concurrent=1 时新用户实际可用槽位为 0永远无法命中
# 注意:当 limit < 10 时,预留机制实际不生效(预留槽位 = 0这是可接受的
MIN_CONCURRENT_LIMIT = 3
# === 探测性扩容参数 ===
# 探测性扩容间隔(分钟)- 长时间无 429 且有流量时尝试扩容

View File

@@ -56,10 +56,11 @@ class Config:
# Redis 依赖策略(生产默认必需,开发默认可选,可通过 REDIS_REQUIRED 覆盖)
redis_required_env = os.getenv("REDIS_REQUIRED")
if redis_required_env is None:
self.require_redis = self.environment not in {"development", "test", "testing"}
else:
if redis_required_env is not None:
self.require_redis = redis_required_env.lower() == "true"
else:
# 保持向后兼容:开发环境可选,生产环境必需
self.require_redis = self.environment not in {"development", "test", "testing"}
# CORS配置 - 使用环境变量配置允许的源
# 格式: 逗号分隔的域名列表,如 "http://localhost:3000,https://example.com"
@@ -133,6 +134,18 @@ class Config:
self.concurrency_slot_ttl = int(os.getenv("CONCURRENCY_SLOT_TTL", "600"))
self.cache_reservation_ratio = float(os.getenv("CACHE_RESERVATION_RATIO", "0.1"))
# 限流降级策略配置
# RATE_LIMIT_FAIL_OPEN: 当限流服务Redis异常时的行为
#
# True (默认): fail-open - 放行请求(优先可用性)
# 风险Redis 故障期间无法限流,可能被滥用
# 适用API 网关作为关键基础设施,必须保持高可用
#
# False: fail-close - 拒绝所有请求(优先安全性)
# 风险Redis 故障会导致 API 网关不可用
# 适用:有严格速率限制要求的安全敏感场景
self.rate_limit_fail_open = os.getenv("RATE_LIMIT_FAIL_OPEN", "true").lower() == "true"
# HTTP 请求超时配置(秒)
self.http_connect_timeout = float(os.getenv("HTTP_CONNECT_TIMEOUT", "10.0"))
self.http_write_timeout = float(os.getenv("HTTP_WRITE_TIMEOUT", "60.0"))
@@ -141,19 +154,22 @@ class Config:
# 流式处理配置
# STREAM_PREFETCH_LINES: 预读行数,用于检测嵌套错误
# STREAM_STATS_DELAY: 统计记录延迟(秒),等待流完全关闭
# STREAM_FIRST_BYTE_TIMEOUT: 首字节超时(秒),等待首字节超过此时间触发故障转移
# 范围: 10-120 秒,默认 30 秒(必须小于 http_write_timeout 避免竞态)
self.stream_prefetch_lines = int(os.getenv("STREAM_PREFETCH_LINES", "5"))
self.stream_stats_delay = float(os.getenv("STREAM_STATS_DELAY", "0.1"))
self.stream_first_byte_timeout = self._parse_ttfb_timeout()
# 内部请求 User-Agent 配置(用于查询上游模型列表等)
# 可通过环境变量覆盖默认值
self.internal_user_agent_claude = os.getenv(
"CLAUDE_USER_AGENT", "claude-cli/1.0"
# 可通过环境变量覆盖默认值,模拟对应 CLI 客户端
self.internal_user_agent_claude_cli = os.getenv(
"CLAUDE_CLI_USER_AGENT", "claude-code/1.0.1"
)
self.internal_user_agent_openai = os.getenv(
"OPENAI_USER_AGENT", "openai-cli/1.0"
self.internal_user_agent_openai_cli = os.getenv(
"OPENAI_CLI_USER_AGENT", "openai-codex/1.0"
)
self.internal_user_agent_gemini = os.getenv(
"GEMINI_USER_AGENT", "gemini-cli/1.0"
self.internal_user_agent_gemini_cli = os.getenv(
"GEMINI_CLI_USER_AGENT", "gemini-cli/0.1.0"
)
# 验证连接池配置
@@ -177,6 +193,39 @@ class Config:
"""智能计算最大溢出连接数 - 与 pool_size 相同"""
return self.db_pool_size
def _parse_ttfb_timeout(self) -> float:
"""
解析 TTFB 超时配置,带错误处理和范围限制
TTFB (Time To First Byte) 用于检测慢响应的 Provider超时触发故障转移。
此值必须小于 http_write_timeout避免竞态条件。
Returns:
超时时间(秒),范围 10-120默认 30
"""
default_timeout = 30.0
min_timeout = 10.0
max_timeout = 120.0 # 必须小于 http_write_timeout (默认 60s) 的 2 倍
raw_value = os.getenv("STREAM_FIRST_BYTE_TIMEOUT", str(default_timeout))
try:
timeout = float(raw_value)
except ValueError:
# 延迟导入避免循环依赖Config 初始化时 logger 可能未就绪)
self._ttfb_config_warning = (
f"无效的 STREAM_FIRST_BYTE_TIMEOUT 配置 '{raw_value}',使用默认值 {default_timeout}"
)
return default_timeout
# 范围限制
clamped = max(min_timeout, min(max_timeout, timeout))
if clamped != timeout:
self._ttfb_config_warning = (
f"STREAM_FIRST_BYTE_TIMEOUT={timeout}秒超出范围 [{min_timeout}-{max_timeout}]"
f"已调整为 {clamped}"
)
return clamped
def _validate_pool_config(self) -> None:
"""验证连接池配置是否安全"""
total_per_worker = self.db_pool_size + self.db_max_overflow
@@ -224,6 +273,10 @@ class Config:
if hasattr(self, "_pool_config_warning") and self._pool_config_warning:
logger.warning(self._pool_config_warning)
# TTFB 超时配置警告
if hasattr(self, "_ttfb_config_warning") and self._ttfb_config_warning:
logger.warning(self._ttfb_config_warning)
# 管理员密码检查(必须在环境变量中设置)
if hasattr(self, "_missing_admin_password") and self._missing_admin_password:
logger.error("必须设置 ADMIN_PASSWORD 环境变量!")

View File

@@ -336,10 +336,44 @@ class PluginMiddleware:
)
return result
return None
except ConnectionError as e:
# Redis 连接错误:根据配置决定
logger.warning(f"Rate limit connection error: {e}")
if config.rate_limit_fail_open:
return None
else:
return RateLimitResult(
allowed=False,
remaining=0,
retry_after=30,
message="Rate limit service unavailable"
)
except TimeoutError as e:
# 超时错误:可能是负载过高,根据配置决定
logger.warning(f"Rate limit timeout: {e}")
if config.rate_limit_fail_open:
return None
else:
return RateLimitResult(
allowed=False,
remaining=0,
retry_after=30,
message="Rate limit service timeout"
)
except Exception as e:
logger.error(f"Rate limit error: {e}")
# 发生错误时允许请求通过
return None
logger.error(f"Rate limit error: {type(e).__name__}: {e}")
# 其他异常:根据配置决定
if config.rate_limit_fail_open:
# fail-open: 异常时放行请求(优先可用性)
return None
else:
# fail-close: 异常时拒绝请求(优先安全性)
return RateLimitResult(
allowed=False,
remaining=0,
retry_after=60,
message="Rate limit service error"
)
async def _call_pre_request_plugins(self, request: Request) -> None:
"""调用请求前的插件(当前保留扩展点)"""

View File

@@ -226,8 +226,11 @@ class EndpointAPIKeyUpdate(BaseModel):
global_priority: Optional[int] = Field(
default=None, description="全局 Key 优先级(全局 Key 优先模式,数字越小越优先)"
)
# 注意:max_concurrent=None 表示不更新,要切换为自适应模式请使用专用 API
max_concurrent: Optional[int] = Field(default=None, ge=1, description="最大并发数")
# max_concurrent: 使用特殊标记区分"未提供"和"设置为 null自适应模式"
# - 不提供字段:不更新
# - 提供 null切换为自适应模式
# - 提供数字:设置固定并发限制
max_concurrent: Optional[int] = Field(default=None, ge=1, description="最大并发数null=自适应模式)")
rate_limit: Optional[int] = Field(default=None, ge=1, description="速率限制")
daily_limit: Optional[int] = Field(default=None, ge=1, description="每日限制")
monthly_limit: Optional[int] = Field(default=None, ge=1, description="每月限制")
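For illustration, the three request payload shapes the comment above describes; field names come from EndpointAPIKeyUpdate and the values are examples only:
# Illustrative update payloads for the three max_concurrent cases.
payload_no_change = {"name": "key-a"}                         # field omitted -> not updated
payload_adaptive = {"name": "key-a", "max_concurrent": None}  # explicit null -> adaptive mode
payload_fixed = {"name": "key-a", "max_concurrent": 8}        # number -> fixed concurrency limit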

View File

@@ -27,7 +27,7 @@ if not config.jwt_secret_key:
if config.environment == "production":
raise ValueError("JWT_SECRET_KEY must be set in production environment!")
config.jwt_secret_key = secrets.token_urlsafe(32)
logger.warning(f"JWT_SECRET_KEY未在环境变量中找到已生成随机密钥用于开发: {config.jwt_secret_key[:10]}...")
logger.warning("JWT_SECRET_KEY未在环境变量中找到已生成随机密钥用于开发")
logger.warning("生产环境请设置JWT_SECRET_KEY环境变量!")
JWT_SECRET_KEY = config.jwt_secret_key

View File

@@ -208,86 +208,120 @@ class CleanupScheduler:
return
# 非首次运行,检查最近是否有缺失的日期需要回填
latest_stat = db.query(StatsDaily).order_by(StatsDaily.date.desc()).first()
from src.models.database import StatsDailyModel
if latest_stat:
latest_date_utc = latest_stat.date
if latest_date_utc.tzinfo is None:
latest_date_utc = latest_date_utc.replace(tzinfo=timezone.utc)
else:
latest_date_utc = latest_date_utc.astimezone(timezone.utc)
yesterday_business_date = today_local.date() - timedelta(days=1)
max_backfill_days: int = SystemConfigService.get_config(
db, "max_stats_backfill_days", 30
) or 30
# 使用业务日期计算缺失区间(避免用 UTC 年月日导致日期偏移,且对 DST 更安全)
latest_business_date = latest_date_utc.astimezone(app_tz).date()
yesterday_business_date = today_local.date() - timedelta(days=1)
missing_start_date = latest_business_date + timedelta(days=1)
# 计算回填检查的起始日期
check_start_date = yesterday_business_date - timedelta(
days=max_backfill_days - 1
)
if missing_start_date <= yesterday_business_date:
missing_days = (
yesterday_business_date - missing_start_date
).days + 1
# 获取 StatsDaily 和 StatsDailyModel 中已有数据的日期集合
existing_daily_dates = set()
existing_model_dates = set()
# 限制最大回填天数,防止停机很久后一次性回填太多
max_backfill_days: int = SystemConfigService.get_config(
db, "max_stats_backfill_days", 30
) or 30
if missing_days > max_backfill_days:
logger.warning(
f"缺失 {missing_days} 天数据超过最大回填限制 "
f"{max_backfill_days} 天,只回填最近 {max_backfill_days}"
daily_stats = (
db.query(StatsDaily.date)
.filter(StatsDaily.date >= check_start_date.isoformat())
.all()
)
for (stat_date,) in daily_stats:
if stat_date.tzinfo is None:
stat_date = stat_date.replace(tzinfo=timezone.utc)
existing_daily_dates.add(stat_date.astimezone(app_tz).date())
model_stats = (
db.query(StatsDailyModel.date)
.filter(StatsDailyModel.date >= check_start_date.isoformat())
.distinct()
.all()
)
for (stat_date,) in model_stats:
if stat_date.tzinfo is None:
stat_date = stat_date.replace(tzinfo=timezone.utc)
existing_model_dates.add(stat_date.astimezone(app_tz).date())
# 找出需要回填的日期
all_dates = set()
current = check_start_date
while current <= yesterday_business_date:
all_dates.add(current)
current += timedelta(days=1)
# 需要回填 StatsDaily 的日期
missing_daily_dates = all_dates - existing_daily_dates
# 需要回填 StatsDailyModel 的日期
missing_model_dates = all_dates - existing_model_dates
# 合并所有需要处理的日期
dates_to_process = missing_daily_dates | missing_model_dates
if dates_to_process:
sorted_dates = sorted(dates_to_process)
logger.info(
f"检测到 {len(dates_to_process)} 天的统计数据需要回填 "
f"(StatsDaily 缺失 {len(missing_daily_dates)} 天, "
f"StatsDailyModel 缺失 {len(missing_model_dates)} 天)"
)
users = (
db.query(DBUser.id).filter(DBUser.is_active.is_(True)).all()
)
failed_dates = 0
failed_users = 0
for current_date in sorted_dates:
try:
current_date_local = datetime.combine(
current_date, datetime.min.time(), tzinfo=app_tz
)
missing_start_date = yesterday_business_date - timedelta(
days=max_backfill_days - 1
)
missing_days = max_backfill_days
logger.info(
f"检测到缺失 {missing_days} 天的统计数据 "
f"({missing_start_date} ~ {yesterday_business_date}),开始回填..."
)
current_date = missing_start_date
users = (
db.query(DBUser.id).filter(DBUser.is_active.is_(True)).all()
)
while current_date <= yesterday_business_date:
try:
current_date_local = datetime.combine(
current_date, datetime.min.time(), tzinfo=app_tz
)
# 只在缺失时才聚合对应的表
if current_date in missing_daily_dates:
StatsAggregatorService.aggregate_daily_stats(
db, current_date_local
)
if current_date in missing_model_dates:
StatsAggregatorService.aggregate_daily_model_stats(
db, current_date_local
)
for (user_id,) in users:
try:
StatsAggregatorService.aggregate_user_daily_stats(
db, user_id, current_date_local
)
except Exception as e:
logger.warning(
f"回填用户 {user_id} 日期 {current_date} 失败: {e}"
)
try:
db.rollback()
except Exception:
pass
except Exception as e:
logger.warning(f"回填日期 {current_date} 失败: {e}")
# 用户统计在任一缺失时都回填
for (user_id,) in users:
try:
db.rollback()
except Exception:
pass
StatsAggregatorService.aggregate_user_daily_stats(
db, user_id, current_date_local
)
except Exception as e:
failed_users += 1
logger.warning(
f"回填用户 {user_id} 日期 {current_date} 失败: {e}"
)
try:
db.rollback()
except Exception as rollback_err:
logger.error(f"回滚失败: {rollback_err}")
except Exception as e:
failed_dates += 1
logger.warning(f"回填日期 {current_date} 失败: {e}")
try:
db.rollback()
except Exception as rollback_err:
logger.error(f"回滚失败: {rollback_err}")
current_date += timedelta(days=1)
StatsAggregatorService.update_summary(db)
StatsAggregatorService.update_summary(db)
logger.info(f"缺失数据回填完成,共 {missing_days}")
if failed_dates > 0 or failed_users > 0:
logger.warning(
f"回填完成,共处理 {len(dates_to_process)} 天,"
f"失败: {failed_dates} 天, {failed_users} 个用户记录"
)
else:
logger.info("统计数据已是最新,无需回填")
logger.info(f"缺失数据回填完成,共处理 {len(dates_to_process)}")
else:
logger.info("统计数据已是最新,无需回填")
return
# 定时任务:聚合昨天的数据

View File

@@ -3,6 +3,7 @@
"""
import uuid
from dataclasses import dataclass
from datetime import datetime, timedelta, timezone
from typing import Any, Dict, List, Optional, Tuple
@@ -16,6 +17,71 @@ from src.services.model.cost import ModelCostService
from src.services.system.config import SystemConfigService
@dataclass
class UsageRecordParams:
"""用量记录参数数据类,用于在内部方法间传递数据"""
db: Session
user: Optional[User]
api_key: Optional[ApiKey]
provider: str
model: str
input_tokens: int
output_tokens: int
cache_creation_input_tokens: int
cache_read_input_tokens: int
request_type: str
api_format: Optional[str]
is_stream: bool
response_time_ms: Optional[int]
first_byte_time_ms: Optional[int]
status_code: int
error_message: Optional[str]
metadata: Optional[Dict[str, Any]]
request_headers: Optional[Dict[str, Any]]
request_body: Optional[Any]
provider_request_headers: Optional[Dict[str, Any]]
response_headers: Optional[Dict[str, Any]]
response_body: Optional[Any]
request_id: str
provider_id: Optional[str]
provider_endpoint_id: Optional[str]
provider_api_key_id: Optional[str]
status: str
cache_ttl_minutes: Optional[int]
use_tiered_pricing: bool
target_model: Optional[str]
def __post_init__(self) -> None:
"""验证关键字段,确保数据完整性"""
# Token 数量不能为负数
if self.input_tokens < 0:
raise ValueError(f"input_tokens 不能为负数: {self.input_tokens}")
if self.output_tokens < 0:
raise ValueError(f"output_tokens 不能为负数: {self.output_tokens}")
if self.cache_creation_input_tokens < 0:
raise ValueError(
f"cache_creation_input_tokens 不能为负数: {self.cache_creation_input_tokens}"
)
if self.cache_read_input_tokens < 0:
raise ValueError(
f"cache_read_input_tokens 不能为负数: {self.cache_read_input_tokens}"
)
# 响应时间不能为负数
if self.response_time_ms is not None and self.response_time_ms < 0:
raise ValueError(f"response_time_ms 不能为负数: {self.response_time_ms}")
if self.first_byte_time_ms is not None and self.first_byte_time_ms < 0:
raise ValueError(f"first_byte_time_ms 不能为负数: {self.first_byte_time_ms}")
# HTTP 状态码范围校验
if not (100 <= self.status_code <= 599):
raise ValueError(f"无效的 HTTP 状态码: {self.status_code}")
# 状态值校验
valid_statuses = {"pending", "streaming", "completed", "failed"}
if self.status not in valid_statuses:
raise ValueError(f"无效的状态值: {self.status},有效值: {valid_statuses}")
class UsageService:
"""用量统计服务"""
@@ -471,6 +537,97 @@ class UsageService:
cache_ttl_minutes=cache_ttl_minutes,
)
@classmethod
async def _prepare_usage_record(
cls,
params: UsageRecordParams,
) -> Tuple[Dict[str, Any], float]:
"""准备用量记录的共享逻辑
此方法提取了 record_usage 和 record_usage_async 的公共处理逻辑:
- 获取费率倍数
- 计算成本
- 构建 Usage 参数
Args:
params: 用量记录参数数据类
Returns:
(usage_params 字典, total_cost 总成本)
"""
# 获取费率倍数和是否免费套餐
actual_rate_multiplier, is_free_tier = await cls._get_rate_multiplier_and_free_tier(
params.db, params.provider_api_key_id, params.provider_id
)
# 计算成本
is_failed_request = params.status_code >= 400 or params.error_message is not None
(
input_price, output_price, cache_creation_price, cache_read_price, request_price,
input_cost, output_cost, cache_creation_cost, cache_read_cost, cache_cost,
request_cost, total_cost, _tier_index
) = await cls._calculate_costs(
db=params.db,
provider=params.provider,
model=params.model,
input_tokens=params.input_tokens,
output_tokens=params.output_tokens,
cache_creation_input_tokens=params.cache_creation_input_tokens,
cache_read_input_tokens=params.cache_read_input_tokens,
api_format=params.api_format,
cache_ttl_minutes=params.cache_ttl_minutes,
use_tiered_pricing=params.use_tiered_pricing,
is_failed_request=is_failed_request,
)
# 构建 Usage 参数
usage_params = cls._build_usage_params(
db=params.db,
user=params.user,
api_key=params.api_key,
provider=params.provider,
model=params.model,
input_tokens=params.input_tokens,
output_tokens=params.output_tokens,
cache_creation_input_tokens=params.cache_creation_input_tokens,
cache_read_input_tokens=params.cache_read_input_tokens,
request_type=params.request_type,
api_format=params.api_format,
is_stream=params.is_stream,
response_time_ms=params.response_time_ms,
first_byte_time_ms=params.first_byte_time_ms,
status_code=params.status_code,
error_message=params.error_message,
metadata=params.metadata,
request_headers=params.request_headers,
request_body=params.request_body,
provider_request_headers=params.provider_request_headers,
response_headers=params.response_headers,
response_body=params.response_body,
request_id=params.request_id,
provider_id=params.provider_id,
provider_endpoint_id=params.provider_endpoint_id,
provider_api_key_id=params.provider_api_key_id,
status=params.status,
target_model=params.target_model,
input_cost=input_cost,
output_cost=output_cost,
cache_creation_cost=cache_creation_cost,
cache_read_cost=cache_read_cost,
cache_cost=cache_cost,
request_cost=request_cost,
total_cost=total_cost,
input_price=input_price,
output_price=output_price,
cache_creation_price=cache_creation_price,
cache_read_price=cache_read_price,
request_price=request_price,
actual_rate_multiplier=actual_rate_multiplier,
is_free_tier=is_free_tier,
)
return usage_params, total_cost
@classmethod
async def record_usage_async(
cls,
@@ -516,76 +673,25 @@ class UsageService:
if request_id is None:
request_id = str(uuid.uuid4())[:8]
# 获取费率倍数和是否免费套餐
actual_rate_multiplier, is_free_tier = await cls._get_rate_multiplier_and_free_tier(
db, provider_api_key_id, provider_id
)
# 计算成本
is_failed_request = status_code >= 400 or error_message is not None
(
input_price, output_price, cache_creation_price, cache_read_price, request_price,
input_cost, output_cost, cache_creation_cost, cache_read_cost, cache_cost,
request_cost, total_cost, tier_index
) = await cls._calculate_costs(
db=db,
provider=provider,
model=model,
input_tokens=input_tokens,
output_tokens=output_tokens,
# 使用共享逻辑准备记录参数
params = UsageRecordParams(
db=db, user=user, api_key=api_key, provider=provider, model=model,
input_tokens=input_tokens, output_tokens=output_tokens,
cache_creation_input_tokens=cache_creation_input_tokens,
cache_read_input_tokens=cache_read_input_tokens,
api_format=api_format,
cache_ttl_minutes=cache_ttl_minutes,
use_tiered_pricing=use_tiered_pricing,
is_failed_request=is_failed_request,
)
# 构建 Usage 参数
usage_params = cls._build_usage_params(
db=db,
user=user,
api_key=api_key,
provider=provider,
model=model,
input_tokens=input_tokens,
output_tokens=output_tokens,
cache_creation_input_tokens=cache_creation_input_tokens,
cache_read_input_tokens=cache_read_input_tokens,
request_type=request_type,
api_format=api_format,
is_stream=is_stream,
response_time_ms=response_time_ms,
first_byte_time_ms=first_byte_time_ms,
status_code=status_code,
error_message=error_message,
metadata=metadata,
request_headers=request_headers,
request_body=request_body,
request_type=request_type, api_format=api_format, is_stream=is_stream,
response_time_ms=response_time_ms, first_byte_time_ms=first_byte_time_ms,
status_code=status_code, error_message=error_message, metadata=metadata,
request_headers=request_headers, request_body=request_body,
provider_request_headers=provider_request_headers,
response_headers=response_headers,
response_body=response_body,
request_id=request_id,
provider_id=provider_id,
response_headers=response_headers, response_body=response_body,
request_id=request_id, provider_id=provider_id,
provider_endpoint_id=provider_endpoint_id,
provider_api_key_id=provider_api_key_id,
status=status,
provider_api_key_id=provider_api_key_id, status=status,
cache_ttl_minutes=cache_ttl_minutes, use_tiered_pricing=use_tiered_pricing,
target_model=target_model,
input_cost=input_cost,
output_cost=output_cost,
cache_creation_cost=cache_creation_cost,
cache_read_cost=cache_read_cost,
cache_cost=cache_cost,
request_cost=request_cost,
total_cost=total_cost,
input_price=input_price,
output_price=output_price,
cache_creation_price=cache_creation_price,
cache_read_price=cache_read_price,
request_price=request_price,
actual_rate_multiplier=actual_rate_multiplier,
is_free_tier=is_free_tier,
)
usage_params, _ = await cls._prepare_usage_record(params)
# 创建 Usage 记录
usage = Usage(**usage_params)
@@ -660,76 +766,25 @@ class UsageService:
if request_id is None:
request_id = str(uuid.uuid4())[:8]
# 获取费率倍数和是否免费套餐
actual_rate_multiplier, is_free_tier = await cls._get_rate_multiplier_and_free_tier(
db, provider_api_key_id, provider_id
)
# 计算成本
is_failed_request = status_code >= 400 or error_message is not None
(
input_price, output_price, cache_creation_price, cache_read_price, request_price,
input_cost, output_cost, cache_creation_cost, cache_read_cost, cache_cost,
request_cost, total_cost, _tier_index
) = await cls._calculate_costs(
db=db,
provider=provider,
model=model,
input_tokens=input_tokens,
output_tokens=output_tokens,
# 使用共享逻辑准备记录参数
params = UsageRecordParams(
db=db, user=user, api_key=api_key, provider=provider, model=model,
input_tokens=input_tokens, output_tokens=output_tokens,
cache_creation_input_tokens=cache_creation_input_tokens,
cache_read_input_tokens=cache_read_input_tokens,
api_format=api_format,
cache_ttl_minutes=cache_ttl_minutes,
use_tiered_pricing=use_tiered_pricing,
is_failed_request=is_failed_request,
)
# 构建 Usage 参数
usage_params = cls._build_usage_params(
db=db,
user=user,
api_key=api_key,
provider=provider,
model=model,
input_tokens=input_tokens,
output_tokens=output_tokens,
cache_creation_input_tokens=cache_creation_input_tokens,
cache_read_input_tokens=cache_read_input_tokens,
request_type=request_type,
api_format=api_format,
is_stream=is_stream,
response_time_ms=response_time_ms,
first_byte_time_ms=first_byte_time_ms,
status_code=status_code,
error_message=error_message,
metadata=metadata,
request_headers=request_headers,
request_body=request_body,
request_type=request_type, api_format=api_format, is_stream=is_stream,
response_time_ms=response_time_ms, first_byte_time_ms=first_byte_time_ms,
status_code=status_code, error_message=error_message, metadata=metadata,
request_headers=request_headers, request_body=request_body,
provider_request_headers=provider_request_headers,
response_headers=response_headers,
response_body=response_body,
request_id=request_id,
provider_id=provider_id,
response_headers=response_headers, response_body=response_body,
request_id=request_id, provider_id=provider_id,
provider_endpoint_id=provider_endpoint_id,
provider_api_key_id=provider_api_key_id,
status=status,
provider_api_key_id=provider_api_key_id, status=status,
cache_ttl_minutes=cache_ttl_minutes, use_tiered_pricing=use_tiered_pricing,
target_model=target_model,
input_cost=input_cost,
output_cost=output_cost,
cache_creation_cost=cache_creation_cost,
cache_read_cost=cache_read_cost,
cache_cost=cache_cost,
request_cost=request_cost,
total_cost=total_cost,
input_price=input_price,
output_price=output_price,
cache_creation_price=cache_creation_price,
cache_read_price=cache_read_price,
request_price=request_price,
actual_rate_multiplier=actual_rate_multiplier,
is_free_tier=is_free_tier,
)
usage_params, total_cost = await cls._prepare_usage_record(params)
# 检查是否已存在相同 request_id 的记录
existing_usage = db.query(Usage).filter(Usage.request_id == request_id).first()
@@ -751,7 +806,7 @@ class UsageService:
api_key = db.merge(api_key)
# 使用原子更新避免并发竞态条件
from sqlalchemy import func, update
from sqlalchemy import func as sql_func, update
from src.models.database import ApiKey as ApiKeyModel, User as UserModel, GlobalModel
# 更新用户使用量(独立 Key 不计入创建者的使用记录)
@@ -762,7 +817,7 @@ class UsageService:
.values(
used_usd=UserModel.used_usd + total_cost,
total_usd=UserModel.total_usd + total_cost,
updated_at=func.now(),
updated_at=sql_func.now(),
)
)
@@ -776,8 +831,8 @@ class UsageService:
total_requests=ApiKeyModel.total_requests + 1,
total_cost_usd=ApiKeyModel.total_cost_usd + total_cost,
balance_used_usd=ApiKeyModel.balance_used_usd + total_cost,
last_used_at=func.now(),
updated_at=func.now(),
last_used_at=sql_func.now(),
updated_at=sql_func.now(),
)
)
else:
@@ -787,8 +842,8 @@ class UsageService:
.values(
total_requests=ApiKeyModel.total_requests + 1,
total_cost_usd=ApiKeyModel.total_cost_usd + total_cost,
last_used_at=func.now(),
updated_at=func.now(),
last_used_at=sql_func.now(),
updated_at=sql_func.now(),
)
)
@@ -1121,19 +1176,48 @@ class UsageService:
]
@staticmethod
def cleanup_old_usage_records(db: Session, days_to_keep: int = 90) -> int:
"""清理旧的使用记录"""
def cleanup_old_usage_records(
db: Session, days_to_keep: int = 90, batch_size: int = 1000
) -> int:
"""清理旧的使用记录(分批删除避免长事务锁定)
Args:
db: 数据库会话
days_to_keep: 保留天数,默认 90 天
batch_size: 每批删除数量,默认 1000 条
Returns:
删除的总记录数
"""
cutoff_date = datetime.now(timezone.utc) - timedelta(days=days_to_keep)
total_deleted = 0
# 删除旧记录
deleted = db.query(Usage).filter(Usage.created_at < cutoff_date).delete()
while True:
# 查询待删除的 ID使用新索引 idx_usage_user_created
batch_ids = (
db.query(Usage.id)
.filter(Usage.created_at < cutoff_date)
.limit(batch_size)
.all()
)
db.commit()
if not batch_ids:
break
logger.info(f"清理使用记录: 删除 {deleted} 条超过 {days_to_keep} 天的记录")
# 批量删除
deleted_count = (
db.query(Usage)
.filter(Usage.id.in_([row.id for row in batch_ids]))
.delete(synchronize_session=False)
)
db.commit()
total_deleted += deleted_count
return deleted
logger.debug(f"清理使用记录: 本批删除 {deleted_count}")
logger.info(f"清理使用记录: 共删除 {total_deleted} 条超过 {days_to_keep} 天的记录")
return total_deleted
# ========== 请求状态追踪方法 ==========
@@ -1219,6 +1303,7 @@ class UsageService:
error_message: Optional[str] = None,
provider: Optional[str] = None,
target_model: Optional[str] = None,
first_byte_time_ms: Optional[int] = None,
) -> Optional[Usage]:
"""
快速更新使用记录状态
@@ -1230,6 +1315,7 @@ class UsageService:
error_message: 错误消息(仅在 failed 状态时使用)
provider: 提供商名称可选streaming 状态时更新)
target_model: 映射后的目标模型名(可选)
first_byte_time_ms: 首字时间/TTFB可选streaming 状态时更新)
Returns:
更新后的 Usage 记录,如果未找到则返回 None
@@ -1247,6 +1333,8 @@ class UsageService:
usage.provider = provider
if target_model:
usage.target_model = target_model
if first_byte_time_ms is not None:
usage.first_byte_time_ms = first_byte_time_ms
db.commit()

View File

@@ -5,6 +5,7 @@
import json
import re
import time
from typing import Any, AsyncIterator, Dict, Optional, Tuple
from sqlalchemy.orm import Session
@@ -457,26 +458,32 @@ class StreamUsageTracker:
logger.debug(f"ID:{self.request_id} | 开始跟踪流式响应 | 估算输入tokens:{self.input_tokens}")
# 更新状态为 streaming同时更新 provider
if self.request_id:
try:
from src.services.usage.service import UsageService
UsageService.update_usage_status(
db=self.db,
request_id=self.request_id,
status="streaming",
provider=self.provider,
)
except Exception as e:
logger.warning(f"更新使用记录状态为 streaming 失败: {e}")
chunk_count = 0
first_chunk_received = False
try:
async for chunk in stream:
chunk_count += 1
# 保存原始字节流(用于错误诊断)
self.raw_chunks.append(chunk)
# 第一个 chunk 收到时,更新状态为 streaming 并记录 TTFB
if not first_chunk_received:
first_chunk_received = True
if self.request_id:
try:
# 计算 TTFB使用请求原始开始时间或 track_stream 开始时间)
base_time = self.request_start_time or self.start_time
first_byte_time_ms = int((time.time() - base_time) * 1000) if base_time else None
UsageService.update_usage_status(
db=self.db,
request_id=self.request_id,
status="streaming",
provider=self.provider,
first_byte_time_ms=first_byte_time_ms,
)
except Exception as e:
logger.warning(f"更新使用记录状态为 streaming 失败: {e}")
# 返回原始块给客户端
yield chunk

View File

@@ -139,3 +139,83 @@ async def with_timeout_context(timeout: float, operation_name: str = "operation"
# Python 3.10 及以下版本的兼容实现
# 注意:这个简单实现不支持嵌套取消
pass
async def read_first_chunk_with_ttfb_timeout(
byte_iterator: Any,
timeout: float,
request_id: str,
provider_name: str,
) -> tuple[bytes, Any]:
"""
读取流的首字节并应用 TTFB 超时检测
首字节超时Time To First Byte用于检测慢响应的 Provider
超时时触发故障转移到其他可用的 Provider。
Args:
byte_iterator: 异步字节流迭代器
timeout: TTFB 超时时间(秒)
request_id: 请求 ID用于日志
provider_name: Provider 名称(用于日志和异常)
Returns:
(first_chunk, aiter): 首个字节块和异步迭代器
Raises:
ProviderTimeoutException: 如果首字节超时
"""
from src.core.exceptions import ProviderTimeoutException
aiter = byte_iterator.__aiter__()
try:
first_chunk = await asyncio.wait_for(aiter.__anext__(), timeout=timeout)
return first_chunk, aiter
except asyncio.TimeoutError:
# 完整的资源清理:先关闭迭代器,再关闭底层响应
await _cleanup_iterator_resources(aiter, request_id)
logger.warning(
f" [{request_id}] 流首字节超时 (TTFB): "
f"Provider={provider_name}, timeout={timeout}s"
)
raise ProviderTimeoutException(
provider_name=provider_name,
timeout=int(timeout),
)
async def _cleanup_iterator_resources(aiter: Any, request_id: str) -> None:
"""
清理异步迭代器及其底层资源
确保在 TTFB 超时后正确释放 HTTP 连接,避免连接泄漏。
Args:
aiter: 异步迭代器
request_id: 请求 ID用于日志
"""
# 1. 关闭迭代器本身
if hasattr(aiter, "aclose"):
try:
await aiter.aclose()
except Exception as e:
logger.debug(f" [{request_id}] 关闭迭代器失败: {e}")
# 2. 关闭底层响应对象httpx.Response
# 迭代器可能持有 _response 属性指向底层响应
response = getattr(aiter, "_response", None)
if response is not None and hasattr(response, "aclose"):
try:
await response.aclose()
except Exception as e:
logger.debug(f" [{request_id}] 关闭底层响应失败: {e}")
# 3. 尝试关闭 httpx 流(如果迭代器是 httpx 的 aiter_bytes
# httpx 的 Response.aiter_bytes() 返回的生成器可能有 _stream 属性
stream = getattr(aiter, "_stream", None)
if stream is not None and hasattr(stream, "aclose"):
try:
await stream.aclose()
except Exception as e:
logger.debug(f" [{request_id}] 关闭流对象失败: {e}")

View File

@@ -8,86 +8,116 @@ from src.api.handlers.base.utils import build_sse_headers, extract_cache_creatio
class TestExtractCacheCreationTokens:
"""测试 extract_cache_creation_tokens 函数"""
def test_new_format_only(self) -> None:
"""测试只有新格式字段"""
# === 嵌套格式测试(优先级最高)===
def test_nested_cache_creation_format(self) -> None:
"""测试嵌套格式正常情况"""
usage = {
"cache_creation": {
"ephemeral_5m_input_tokens": 456,
"ephemeral_1h_input_tokens": 100,
}
}
assert extract_cache_creation_tokens(usage) == 556
def test_nested_cache_creation_with_old_format_fallback(self) -> None:
"""测试嵌套格式为 0 时回退到旧格式"""
usage = {
"cache_creation": {
"ephemeral_5m_input_tokens": 0,
"ephemeral_1h_input_tokens": 0,
},
"cache_creation_input_tokens": 549,
}
assert extract_cache_creation_tokens(usage) == 549
def test_nested_has_priority_over_flat(self) -> None:
"""测试嵌套格式优先于扁平格式"""
usage = {
"cache_creation": {
"ephemeral_5m_input_tokens": 100,
"ephemeral_1h_input_tokens": 200,
},
"claude_cache_creation_5_m_tokens": 999, # 应该被忽略
"claude_cache_creation_1_h_tokens": 888, # 应该被忽略
"cache_creation_input_tokens": 777, # 应该被忽略
}
assert extract_cache_creation_tokens(usage) == 300
# === 扁平格式测试(优先级第二)===
def test_flat_new_format_still_works(self) -> None:
"""测试扁平新格式兼容性"""
usage = {
"claude_cache_creation_5_m_tokens": 100,
"claude_cache_creation_1_h_tokens": 200,
}
assert extract_cache_creation_tokens(usage) == 300
def test_new_format_5m_only(self) -> None:
"""测试只有 5 分钟缓存"""
def test_flat_new_format_with_old_format_fallback(self) -> None:
"""测试扁平格式为 0 时回退到旧格式"""
usage = {
"claude_cache_creation_5_m_tokens": 0,
"claude_cache_creation_1_h_tokens": 0,
"cache_creation_input_tokens": 549,
}
assert extract_cache_creation_tokens(usage) == 549
def test_flat_new_format_5m_only(self) -> None:
"""测试只有 5 分钟扁平缓存"""
usage = {
"claude_cache_creation_5_m_tokens": 150,
"claude_cache_creation_1_h_tokens": 0,
}
assert extract_cache_creation_tokens(usage) == 150
def test_new_format_1h_only(self) -> None:
"""测试只有 1 小时缓存"""
def test_flat_new_format_1h_only(self) -> None:
"""测试只有 1 小时扁平缓存"""
usage = {
"claude_cache_creation_5_m_tokens": 0,
"claude_cache_creation_1_h_tokens": 250,
}
assert extract_cache_creation_tokens(usage) == 250
# === 旧格式测试(优先级第三)===
def test_old_format_only(self) -> None:
"""测试只有旧格式字段"""
"""测试只有旧格式"""
usage = {
"cache_creation_input_tokens": 500,
"cache_creation_input_tokens": 549,
}
assert extract_cache_creation_tokens(usage) == 500
assert extract_cache_creation_tokens(usage) == 549
def test_both_formats_prefers_new(self) -> None:
"""测试同时存在时优先使用新格式"""
usage = {
"claude_cache_creation_5_m_tokens": 100,
"claude_cache_creation_1_h_tokens": 200,
"cache_creation_input_tokens": 999, # 应该被忽略
}
assert extract_cache_creation_tokens(usage) == 300
# === 边界情况测试 ===
def test_empty_usage(self) -> None:
"""测试空字典"""
def test_no_cache_creation_tokens(self) -> None:
"""测试没有任何缓存字段"""
usage = {}
assert extract_cache_creation_tokens(usage) == 0
def test_all_zeros(self) -> None:
"""测试所有字段都为 0"""
def test_all_formats_zero(self) -> None:
"""测试所有格式都为 0"""
usage = {
"cache_creation": {
"ephemeral_5m_input_tokens": 0,
"ephemeral_1h_input_tokens": 0,
},
"claude_cache_creation_5_m_tokens": 0,
"claude_cache_creation_1_h_tokens": 0,
"cache_creation_input_tokens": 0,
}
assert extract_cache_creation_tokens(usage) == 0
def test_partial_new_format_with_old_format_fallback(self) -> None:
"""测试新格式字段不存在时回退到旧格式"""
usage = {
"cache_creation_input_tokens": 123,
}
assert extract_cache_creation_tokens(usage) == 123
def test_new_format_zero_should_not_fallback(self) -> None:
"""测试新格式字段存在但为 0 时,不应 fallback 到旧格式"""
usage = {
"claude_cache_creation_5_m_tokens": 0,
"claude_cache_creation_1_h_tokens": 0,
"cache_creation_input_tokens": 456,
}
# 新格式字段存在,即使值为 0 也应该使用新格式(返回 0
# 而不是 fallback 到旧格式(返回 456
assert extract_cache_creation_tokens(usage) == 0
def test_unrelated_fields_ignored(self) -> None:
"""测试忽略无关字段"""
usage = {
"input_tokens": 1000,
"output_tokens": 2000,
"cache_read_input_tokens": 300,
"claude_cache_creation_5_m_tokens": 50,
"claude_cache_creation_1_h_tokens": 75,
"cache_creation": {
"ephemeral_5m_input_tokens": 50,
"ephemeral_1h_input_tokens": 75,
},
}
assert extract_cache_creation_tokens(usage) == 125