feat: 流式预读增强与自适应并发算法优化

流式预读增强： - 新增预读字节上限（64KB），防止无换行响应导致内存增长 - 预读结束后检测非 SSE 格式的错误响应（HTML 页面、纯 JSON 错误） - 抽取 check_html_response 和 check_prefetched_response_error 到 utils.py 自适应并发算法优化（边界记忆 + 渐进探测）： - 缩容策略：从乘性减少改为边界 -1，一次 429 即可收敛到真实限制附近 - 扩容策略：普通扩容不超过已知边界，探测性扩容可谨慎突破（每次 +1） - 仅在并发限制 429 时记录边界，避免 RPM/UNKNOWN 类型覆盖
2026-01-12 04:28:28 +08:00 · 2026-01-05 12:17:45 +08:00
parent 4fa9a1303a
commit e5f12fddd9
5 changed files with 314 additions and 60 deletions
--- a/src/services/rate_limit/adaptive_concurrency.py
+++ b/src/services/rate_limit/adaptive_concurrency.py
@@ -1,14 +1,16 @@
 """
-自适应并发调整器 - 基于滑动窗口利用率的并发限制调整
+自适应并发调整器 - 基于边界记忆的并发限制调整

-核心改进（相对于旧版基于"持续高利用率"的方案）：
- 使用滑动窗口采样，容忍并发波动
- 基于窗口内高利用率采样比例决策，而非要求连续高利用率
- 增加探测性扩容机制，长时间稳定时主动尝试扩容
+核心算法：边界记忆 + 渐进探测
+- 触发 429 时记录边界（last_concurrent_peak），这就是真实上限
+- 缩容策略：新限制 = 边界 - 1，而非乘性减少
+- 扩容策略：不超过已知边界，除非是探测性扩容
+- 探测性扩容：长时间无 429 时尝试突破边界

-AIMD 参数说明：
- 扩容：加性增加 (+INCREASE_STEP)
- 缩容：乘性减少 (*DECREASE_MULTIPLIER，默认 0.85)
+设计原则：
+1. 快速收敛：一次 429 就能找到接近真实的限制
+2. 避免过度保守：不会因为多次 429 而无限下降
+3. 安全探测：允许在稳定后尝试更高并发
 """

 from datetime import datetime, timezone
@@ -35,21 +37,21 @@ class AdaptiveConcurrencyManager:
    """
    自适应并发管理器

-    核心算法：基于滑动窗口利用率的 AIMD
-    - 滑动窗口记录最近 N 次请求的利用率
-    - 当窗口内高利用率采样比例 >= 60% 时触发扩容
-    - 遇到 429 错误时乘性减少 (*0.85)
-    - 长时间无 429 且有流量时触发探测性扩容
+    核心算法：边界记忆 + 渐进探测
+    - 触发 429 时记录边界（last_concurrent_peak = 触发时的并发数）
+    - 缩容：新限制 = 边界 - 1（快速收敛到真实限制附近）
+    - 扩容：不超过边界（即 last_concurrent_peak），允许回到边界值尝试
+    - 探测性扩容：长时间（30分钟）无 429 时，可以尝试 +1 突破边界

    扩容条件（满足任一即可）：
-    1. 滑动窗口扩容：窗口内 >= 60% 的采样利用率 >= 70%，且不在冷却期
-    2. 探测性扩容：距上次 429 超过 30 分钟，且期间有足够请求量
+    1. 利用率扩容：窗口内高利用率比例 >= 60%，且当前限制 < 边界
+    2. 探测性扩容：距上次 429 超过 30 分钟，可以尝试突破边界

    关键特性：
-    1. 滑动窗口容忍并发波动，不会因单次低利用率重置
-    2. 区分并发限制和 RPM 限制
-    3. 探测性扩容避免长期卡在低限制
-    4. 记录调整历史
+    1. 快速收敛：一次 429 就能学到接近真实的限制值
+    2. 边界保护：普通扩容不会超过已知边界
+    3. 安全探测：长时间稳定后允许尝试更高并发
+    4. 区分并发限制和 RPM 限制
    """

    # 默认配置 - 使用统一常量
@@ -59,7 +61,6 @@ class AdaptiveConcurrencyManager:

    # AIMD 参数
    INCREASE_STEP = ConcurrencyDefaults.INCREASE_STEP
-    DECREASE_MULTIPLIER = ConcurrencyDefaults.DECREASE_MULTIPLIER

    # 滑动窗口参数
    UTILIZATION_WINDOW_SIZE = ConcurrencyDefaults.UTILIZATION_WINDOW_SIZE
@@ -115,7 +116,13 @@ class AdaptiveConcurrencyManager:
        # 更新429统计
        key.last_429_at = datetime.now(timezone.utc)  # type: ignore[assignment]
        key.last_429_type = rate_limit_info.limit_type  # type: ignore[assignment]
-        key.last_concurrent_peak = current_concurrent  # type: ignore[assignment]
+        # 仅在并发限制且拿到并发数时记录边界（RPM/UNKNOWN 不应覆盖并发边界记忆）
+        if (
+            rate_limit_info.limit_type == RateLimitType.CONCURRENT
+            and current_concurrent is not None
+            and current_concurrent > 0
+        ):
+            key.last_concurrent_peak = current_concurrent  # type: ignore[assignment]

        # 遇到 429 错误，清空利用率采样窗口（重新开始收集）
        key.utilization_samples = []  # type: ignore[assignment]
@@ -207,6 +214,9 @@ class AdaptiveConcurrencyManager:

        current_limit = int(key.learned_max_concurrent or self.DEFAULT_INITIAL_LIMIT)

+        # 获取已知边界（上次触发 429 时的并发数）
+        known_boundary = key.last_concurrent_peak
+
        # 计算当前利用率
        utilization = float(current_concurrent / current_limit) if current_limit > 0 else 0.0

@@ -217,22 +227,29 @@ class AdaptiveConcurrencyManager:
        samples = self._update_utilization_window(key, now_ts, utilization)

        # 检查是否满足扩容条件
-        increase_reason = self._check_increase_conditions(key, samples, now)
+        increase_reason = self._check_increase_conditions(key, samples, now, known_boundary)

        if increase_reason and current_limit < self.MAX_CONCURRENT_LIMIT:
            old_limit = current_limit
-            new_limit = self._increase_limit(current_limit)
+            is_probe = increase_reason == "probe_increase"
+            new_limit = self._increase_limit(current_limit, known_boundary, is_probe)
+
+            # 如果没有实际增长（已达边界），跳过
+            if new_limit <= old_limit:
+                return None

            # 计算窗口统计用于日志
            avg_util = sum(s["util"] for s in samples) / len(samples) if samples else 0
            high_util_count = sum(1 for s in samples if s["util"] >= self.UTILIZATION_THRESHOLD)
            high_util_ratio = high_util_count / len(samples) if samples else 0

+            boundary_info = f"边界: {known_boundary}" if known_boundary else "无边界"
            logger.info(
                f"[INCREASE] {increase_reason}: Key {key.id[:8]}... | "
                f"窗口采样: {len(samples)} | "
                f"平均利用率: {avg_util:.1%} | "
                f"高利用率比例: {high_util_ratio:.1%} | "
+                f"{boundary_info} | "
                f"调整: {old_limit} -> {new_limit}"
            )

@@ -246,13 +263,14 @@ class AdaptiveConcurrencyManager:
                high_util_ratio=round(high_util_ratio, 2),
                sample_count=len(samples),
                current_concurrent=current_concurrent,
+                known_boundary=known_boundary,
            )

            # 更新限制
            key.learned_max_concurrent = new_limit  # type: ignore[assignment]

            # 如果是探测性扩容，更新探测时间
-            if increase_reason == "probe_increase":
+            if is_probe:
                key.last_probe_increase_at = now  # type: ignore[assignment]

            # 扩容后清空采样窗口，重新开始收集
@@ -303,7 +321,11 @@ class AdaptiveConcurrencyManager:
        return samples

    def _check_increase_conditions(
-        self, key: ProviderAPIKey, samples: List[Dict[str, Any]], now: datetime
+        self,
+        key: ProviderAPIKey,
+        samples: List[Dict[str, Any]],
+        now: datetime,
+        known_boundary: Optional[int] = None,
    ) -> Optional[str]:
        """
        检查是否满足扩容条件
@@ -312,6 +334,7 @@ class AdaptiveConcurrencyManager:
            key: API Key对象
            samples: 利用率采样列表
            now: 当前时间
+            known_boundary: 已知边界（触发 429 时的并发数）

        Returns:
            扩容原因（如果满足条件），否则返回 None
@@ -320,15 +343,25 @@ class AdaptiveConcurrencyManager:
        if self._is_in_cooldown(key):
            return None

-        # 条件1：滑动窗口扩容
+        current_limit = int(key.learned_max_concurrent or self.DEFAULT_INITIAL_LIMIT)
+
+        # 条件1：滑动窗口扩容（不超过边界）
        if len(samples) >= self.MIN_SAMPLES_FOR_DECISION:
            high_util_count = sum(1 for s in samples if s["util"] >= self.UTILIZATION_THRESHOLD)
            high_util_ratio = high_util_count / len(samples)

            if high_util_ratio >= self.HIGH_UTILIZATION_RATIO:
-                return "high_utilization"
+                # 检查是否还有扩容空间（边界保护）
+                if known_boundary:
+                    # 允许扩容到边界值（而非 boundary - 1），因为缩容时已经 -1 了
+                    if current_limit < known_boundary:
+                        return "high_utilization"
+                    # 已达边界，不触发普通扩容
+                else:
+                    # 无边界信息，允许扩容
+                    return "high_utilization"

-        # 条件2：探测性扩容（长时间无 429 且有流量）
+        # 条件2：探测性扩容（长时间无 429 且有流量，可以突破边界）
        if self._should_probe_increase(key, samples, now):
            return "probe_increase"

@@ -406,32 +439,65 @@ class AdaptiveConcurrencyManager:
        current_concurrent: Optional[int] = None,
    ) -> int:
        """
-        减少并发限制
+        减少并发限制（基于边界记忆策略）

        策略：
-        - 如果知道当前并发数，设置为当前并发的70%
-        - 否则，使用乘性减少
+        - 如果知道触发 429 时的并发数，新限制 = 并发数 - 1
+        - 这样可以快速收敛到真实限制附近，而不会过度保守
+        - 例如：真实限制 8，触发时并发 8 -> 新限制 7（而非 8*0.85=6）
        """
-        if current_concurrent:
-            # 基于当前并发数减少
-            new_limit = max(
-                int(current_concurrent * self.DECREASE_MULTIPLIER), self.MIN_CONCURRENT_LIMIT
-            )
+        if current_concurrent is not None and current_concurrent > 0:
+            # 边界记忆策略：新限制 = 触发边界 - 1
+            candidate = current_concurrent - 1
        else:
-            # 乘性减少
-            new_limit = max(
-                int(current_limit * self.DECREASE_MULTIPLIER), self.MIN_CONCURRENT_LIMIT
-            )
+            # 没有并发信息时，保守减少 1
+            candidate = current_limit - 1
+
+        # 保证不会“缩容变扩容”（例如 current_concurrent > current_limit 的异常场景）
+        candidate = min(candidate, current_limit - 1)
+
+        new_limit = max(candidate, self.MIN_CONCURRENT_LIMIT)

        return new_limit

-    def _increase_limit(self, current_limit: int) -> int:
+    def _increase_limit(
+        self,
+        current_limit: int,
+        known_boundary: Optional[int] = None,
+        is_probe: bool = False,
+    ) -> int:
        """
-        增加并发限制
+        增加并发限制（考虑边界保护）

-        策略：加性增加 (+1)
+        策略：
+        - 普通扩容：每次 +INCREASE_STEP，但不超过 known_boundary
+          （因为缩容时已经 -1 了，这里允许回到边界值尝试）
+        - 探测性扩容：每次只 +1，可以突破边界，但要谨慎
+
+        Args:
+            current_limit: 当前限制
+            known_boundary: 已知边界（last_concurrent_peak），即触发 429 时的并发数
+            is_probe: 是否是探测性扩容（可以突破边界）
        """
-        new_limit = min(current_limit + self.INCREASE_STEP, self.MAX_CONCURRENT_LIMIT)
+        if is_probe:
+            # 探测模式：每次只 +1，谨慎突破边界
+            new_limit = current_limit + 1
+        else:
+            # 普通模式：每次 +INCREASE_STEP
+            new_limit = current_limit + self.INCREASE_STEP
+
+            # 边界保护：普通扩容不超过 known_boundary（允许回到边界值尝试）
+            if known_boundary:
+                if new_limit > known_boundary:
+                    new_limit = known_boundary
+
+        # 全局上限保护
+        new_limit = min(new_limit, self.MAX_CONCURRENT_LIMIT)
+
+        # 确保有增长（否则返回原值表示不扩容）
+        if new_limit <= current_limit:
+            return current_limit
+
        return new_limit

    def _record_adjustment(
@@ -503,11 +569,16 @@ class AdaptiveConcurrencyManager:
        if key.last_probe_increase_at:
            last_probe_at_str = cast(datetime, key.last_probe_increase_at).isoformat()

+        # 边界信息
+        known_boundary = key.last_concurrent_peak
+
        return {
            "adaptive_mode": is_adaptive,
            "max_concurrent": key.max_concurrent,  # NULL=自适应，数字=固定限制
            "effective_limit": effective_limit,  # 当前有效限制
            "learned_limit": key.learned_max_concurrent,  # 学习到的限制
+            # 边界记忆相关
+            "known_boundary": known_boundary,  # 触发 429 时的并发数（已知上限）
            "concurrent_429_count": int(key.concurrent_429_count or 0),
            "rpm_429_count": int(key.rpm_429_count or 0),
            "last_429_at": last_429_at_str,