Mirror of https://github.com/fawney19/Aether.git (synced 2026-01-05 09:12:27 +08:00)
refactor(handler): optimize stream processing and telemetry pipeline
- Enhance stream context for better token and latency tracking
- Refactor stream processor for improved performance metrics
- Improve telemetry integration with first_byte_time_ms support
- Add comprehensive stream context unit tests
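The diff below leans on StreamContext helpers (record_first_byte_time, append_text, chunk_count) that this commit does not show. As a rough sketch of the contract those calls appear to assume — everything beyond the names actually used in the diff, including the clock choice and the first_byte_time_ms field layout, is a guess for illustration, not Aether's actual class:

import time
from dataclasses import dataclass, field
from typing import Optional


@dataclass
class StreamContextSketch:
    """Hypothetical stand-in for Aether's StreamContext (illustration only)."""

    chunk_count: int = 0
    status_code: int = 200
    error_message: Optional[str] = None
    first_byte_time_ms: Optional[float] = None  # surfaced to telemetry per the commit message
    _text_parts: list = field(default_factory=list)

    def record_first_byte_time(self, start_time: float) -> None:
        # Assumes start_time was taken from the same monotonic clock.
        if self.first_byte_time_ms is None:
            self.first_byte_time_ms = (time.monotonic() - start_time) * 1000.0

    def append_text(self, text: str) -> None:
        # Collected only when the processor was built with collect_text=True.
        self._text_parts.append(text)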
@@ -9,7 +9,9 @@
 """
 
 import asyncio
+import codecs
 import json
+import time
 from typing import Any, AsyncGenerator, Callable, Optional
 
 import httpx
@@ -36,6 +38,8 @@ class StreamProcessor:
         request_id: str,
         default_parser: ResponseParser,
         on_streaming_start: Optional[Callable[[], None]] = None,
+        *,
+        collect_text: bool = False,
     ):
         """
         Initialize the stream processor
@@ -48,6 +52,7 @@ class StreamProcessor:
         self.request_id = request_id
         self.default_parser = default_parser
         self.on_streaming_start = on_streaming_start
+        self.collect_text = collect_text
 
     def get_parser_for_provider(self, ctx: StreamContext) -> ResponseParser:
         """
@@ -112,9 +117,10 @@ class StreamProcessor:
             )
 
             # Extract text
-            text = parser.extract_text_content(data)
-            if text:
-                ctx.collected_text += text
+            if self.collect_text:
+                text = parser.extract_text_content(data)
+                if text:
+                    ctx.append_text(text)
 
             # Check for completion
             event_type = event_name or data.get("type", "")
@@ -123,7 +129,7 @@ class StreamProcessor:
 
     async def prefetch_and_check_error(
         self,
-        line_iterator: Any,
+        byte_iterator: Any,
         provider: Provider,
         endpoint: ProviderEndpoint,
         ctx: StreamContext,
@@ -136,97 +142,126 @@ class StreamProcessor:
         This case must be detected before the stream starts emitting output, so that retry logic can be triggered.
 
         Args:
-            line_iterator: line iterator
+            byte_iterator: byte-stream iterator
             provider: Provider object
             endpoint: Endpoint object
             ctx: streaming context
             max_prefetch_lines: maximum number of lines to prefetch
 
         Returns:
-            List of prefetched lines
+            List of prefetched byte chunks
 
         Raises:
            EmbeddedErrorException: if an embedded error is detected
         """
-        prefetched_lines: list = []
+        prefetched_chunks: list = []
         parser = self.get_parser_for_provider(ctx)
+        buffer = b""
+        line_count = 0
+        should_stop = False
+        # Use an incremental decoder for UTF-8 characters that span chunks
+        decoder = codecs.getincrementaldecoder("utf-8")(errors="replace")
 
         try:
-            line_count = 0
-            async for line in line_iterator:
-                prefetched_lines.append(line)
-                line_count += 1
-
-                normalized_line = line.rstrip("\r")
-                if not normalized_line or normalized_line.startswith(":"):
-                    if line_count >= max_prefetch_lines:
-                        break
-                    continue
-
-                # Try to parse SSE data
-                data_str = normalized_line
-                if normalized_line.startswith("data: "):
-                    data_str = normalized_line[6:]
-
-                if data_str == "[DONE]":
-                    break
-
-                try:
-                    data = json.loads(data_str)
-                except json.JSONDecodeError:
-                    if line_count >= max_prefetch_lines:
-                        break
-                    continue
-
-                # Use the parser to check for an error response
-                if isinstance(data, dict) and parser.is_error_response(data):
-                    parsed = parser.parse_response(data, 200)
-                    logger.warning(
-                        f" [{self.request_id}] Embedded error detected: "
-                        f"Provider={provider.name}, "
-                        f"error_type={parsed.error_type}, "
-                        f"message={parsed.error_message}"
-                    )
-                    raise EmbeddedErrorException(
-                        provider_name=str(provider.name),
-                        error_code=(
-                            int(parsed.error_type)
-                            if parsed.error_type and parsed.error_type.isdigit()
-                            else None
-                        ),
-                        error_message=parsed.error_message,
-                        error_status=parsed.error_type,
-                    )
-
-                # Valid data prefetched with no error; stop prefetching
-                break
+            async for chunk in byte_iterator:
+                prefetched_chunks.append(chunk)
+                buffer += chunk
+
+                # Try to parse the buffer line by line
+                while b"\n" in buffer:
+                    line_bytes, buffer = buffer.split(b"\n", 1)
+                    try:
+                        # The incremental decoder correctly handles multi-byte characters split across chunks
+                        line = decoder.decode(line_bytes + b"\n", False).rstrip("\r\n")
+                    except Exception as e:
+                        logger.warning(
+                            f"[{self.request_id}] UTF-8 decoding failed during prefetch: {e}, "
+                            f"bytes={line_bytes[:50]!r}"
+                        )
+                        continue
+
+                    line_count += 1
+
+                    # Skip empty lines and comment lines
+                    if not line or line.startswith(":"):
+                        if line_count >= max_prefetch_lines:
+                            should_stop = True
+                            break
+                        continue
+
+                    # Try to parse SSE data
+                    data_str = line
+                    if line.startswith("data: "):
+                        data_str = line[6:]
+
+                    if data_str == "[DONE]":
+                        should_stop = True
+                        break
+
+                    try:
+                        data = json.loads(data_str)
+                    except json.JSONDecodeError:
+                        if line_count >= max_prefetch_lines:
+                            should_stop = True
+                            break
+                        continue
+
+                    # Use the parser to check for an error response
+                    if isinstance(data, dict) and parser.is_error_response(data):
+                        parsed = parser.parse_response(data, 200)
+                        logger.warning(
+                            f" [{self.request_id}] Embedded error detected: "
+                            f"Provider={provider.name}, "
+                            f"error_type={parsed.error_type}, "
+                            f"message={parsed.error_message}"
+                        )
+                        raise EmbeddedErrorException(
+                            provider_name=str(provider.name),
+                            error_code=(
+                                int(parsed.error_type)
+                                if parsed.error_type and parsed.error_type.isdigit()
+                                else None
+                            ),
+                            error_message=parsed.error_message,
+                            error_status=parsed.error_type,
+                        )
+
+                    # Valid data prefetched with no error; stop prefetching
+                    should_stop = True
+                    break
+
+                if should_stop or line_count >= max_prefetch_lines:
+                    break
 
         except EmbeddedErrorException:
             raise
         except Exception as e:
             logger.debug(f" [{self.request_id}] Exception while prefetching the stream: {e}")
 
-        return prefetched_lines
+        return prefetched_chunks
 
     async def create_response_stream(
         self,
         ctx: StreamContext,
-        line_iterator: Any,
+        byte_iterator: Any,
         response_ctx: Any,
         http_client: httpx.AsyncClient,
-        prefetched_lines: Optional[list] = None,
+        prefetched_chunks: Optional[list] = None,
+        *,
+        start_time: Optional[float] = None,
     ) -> AsyncGenerator[bytes, None]:
         """
         Create the response stream generator
 
-        A unified stream generator supporting both the prefetched and non-prefetched cases.
+        Parses SSE data out of the byte stream and forwards it; supports prefetched data.
 
         Args:
             ctx: streaming context
-            line_iterator: line iterator
+            byte_iterator: byte-stream iterator
             response_ctx: HTTP response context manager
             http_client: HTTP client
-            prefetched_lines: list of prefetched lines (optional)
+            prefetched_chunks: list of prefetched byte chunks (optional)
+            start_time: request start time, used to compute TTFB (optional)
 
         Yields:
             Encoded response data chunks
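Why the prefetch path now consumes raw bytes with an incremental decoder: decoding chunk by chunk with bytes.decode() corrupts any multi-byte UTF-8 character that happens to straddle a chunk boundary, while codecs' incremental decoder buffers the partial sequence between calls. A standalone stdlib illustration (not code from this repo):

import codecs

# The 3-byte UTF-8 character "中" (b"\xe4\xb8\xad") split across two chunks.
chunks = [b"data: \xe4\xb8", b"\xad\n"]

# Naive per-chunk decoding loses the split character:
print(chunks[0].decode("utf-8", errors="replace"))  # mojibake: trailing U+FFFD

# The incremental decoder carries the partial sequence across calls:
decoder = codecs.getincrementaldecoder("utf-8")(errors="replace")
text = "".join(decoder.decode(c, False) for c in chunks) + decoder.decode(b"", True)
print(text)  # data: 中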
@@ -234,25 +269,82 @@ class StreamProcessor:
         try:
             sse_parser = SSEEventParser()
             streaming_started = False
+            buffer = b""
+            # Use an incremental decoder for UTF-8 characters that span chunks
+            decoder = codecs.getincrementaldecoder("utf-8")(errors="replace")
 
             # Handle prefetched data
-            if prefetched_lines:
+            if prefetched_chunks:
                 if not streaming_started and self.on_streaming_start:
                     self.on_streaming_start()
                     streaming_started = True
 
-                for line in prefetched_lines:
-                    for chunk in self._process_line(ctx, sse_parser, line):
-                        yield chunk
+                for chunk in prefetched_chunks:
+                    # Record time to first byte (TTFB) - before yielding
+                    if start_time is not None:
+                        ctx.record_first_byte_time(start_time)
+                        start_time = None  # record only once
+
+                    # Forward the raw data to the client
+                    yield chunk
+
+                    buffer += chunk
+                    # Process complete lines in the buffer
+                    while b"\n" in buffer:
+                        line_bytes, buffer = buffer.split(b"\n", 1)
+                        try:
+                            # The incremental decoder correctly handles multi-byte characters split across chunks
+                            line = decoder.decode(line_bytes + b"\n", False)
+                            self._process_line(ctx, sse_parser, line)
+                        except Exception as e:
+                            # Decoding failed; log a warning and keep going
+                            logger.warning(
+                                f"[{self.request_id}] UTF-8 decoding failed: {e}, "
+                                f"bytes={line_bytes[:50]!r}"
+                            )
+                            continue
 
             # Handle the rest of the stream
-            async for line in line_iterator:
+            async for chunk in byte_iterator:
                 if not streaming_started and self.on_streaming_start:
                     self.on_streaming_start()
                     streaming_started = True
 
-                for chunk in self._process_line(ctx, sse_parser, line):
-                    yield chunk
+                # Record time to first byte (TTFB) - before yielding (if there was no prefetched data)
+                if start_time is not None:
+                    ctx.record_first_byte_time(start_time)
+                    start_time = None  # record only once
+
+                # Pass the raw data through
+                yield chunk
+
+                buffer += chunk
+                # Process complete lines in the buffer
+                while b"\n" in buffer:
+                    line_bytes, buffer = buffer.split(b"\n", 1)
+                    try:
+                        # The incremental decoder correctly handles multi-byte characters split across chunks
+                        line = decoder.decode(line_bytes + b"\n", False)
+                        self._process_line(ctx, sse_parser, line)
+                    except Exception as e:
+                        # Decoding failed; log a warning and keep going
+                        logger.warning(
+                            f"[{self.request_id}] UTF-8 decoding failed: {e}, "
+                            f"bytes={line_bytes[:50]!r}"
+                        )
+                        continue
+
+            # Process any remaining buffered data (an unterminated final line)
+            if buffer:
+                try:
+                    # final=True flushes any trailing incomplete character
+                    line = decoder.decode(buffer, True)
+                    self._process_line(ctx, sse_parser, line)
+                except Exception as e:
+                    logger.warning(
+                        f"[{self.request_id}] Failed to process the remaining buffer: {e}, "
+                        f"bytes={buffer[:50]!r}"
+                    )
 
             # Process remaining events
             for event in sse_parser.flush():
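A structural shift in this hunk is worth calling out: the old generator re-encoded and emitted whatever _process_line returned, whereas the new one forwards upstream bytes verbatim and parses a buffered copy purely for telemetry, so proxying can no longer perturb the wire format. A minimal sketch of that pass-through-plus-observer pattern — the function and callback names are invented for illustration:

import codecs
from typing import AsyncGenerator, AsyncIterator, Callable


async def passthrough_with_observer(
    byte_iterator: AsyncIterator[bytes],
    observe_line: Callable[[str], None],
) -> AsyncGenerator[bytes, None]:
    """Forward raw bytes unchanged; feed decoded lines to an observer on the side."""
    decoder = codecs.getincrementaldecoder("utf-8")(errors="replace")
    buffer = b""
    async for chunk in byte_iterator:
        yield chunk  # the client sees exactly what upstream sent
        buffer += chunk
        while b"\n" in buffer:
            line_bytes, buffer = buffer.split(b"\n", 1)
            observe_line(decoder.decode(line_bytes + b"\n", False))
    if buffer:  # flush an unterminated final line
        observe_line(decoder.decode(buffer, True))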
@@ -268,7 +360,7 @@ class StreamProcessor:
         ctx: StreamContext,
         sse_parser: SSEEventParser,
         line: str,
-    ) -> list[bytes]:
+    ) -> None:
         """
         Process a single line of data
 
@@ -276,26 +368,17 @@ class StreamProcessor:
             ctx: streaming context
             sse_parser: SSE parser
             line: raw line data
-
-        Returns:
-            List of data chunks to send
         """
-        result: list[bytes] = []
-        normalized_line = line.rstrip("\r")
+        # SSEEventParser expects newline-stripped single lines; strip CR/LF uniformly here
+        # so an empty line is not mistaken for "\n", which would break event-boundary parsing.
+        normalized_line = line.rstrip("\r\n")
         events = sse_parser.feed_line(normalized_line)
 
-        if normalized_line == "":
-            for event in events:
-                self.handle_sse_event(ctx, event.get("event"), event.get("data") or "")
-            result.append(b"\n")
-        else:
+        if normalized_line != "":
             ctx.chunk_count += 1
-            result.append((line + "\n").encode("utf-8"))
-
-            for event in events:
-                self.handle_sse_event(ctx, event.get("event"), event.get("data") or "")
 
-        return result
+        for event in events:
+            self.handle_sse_event(ctx, event.get("event"), event.get("data") or "")
 
     async def create_monitored_stream(
         self,
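_process_line now returns None because forwarding happens at the byte layer; its only remaining job is to feed the SSE parser, for which a blank line marks an event boundary. The CR/LF-stripping comment can be demonstrated with a toy parser that mimics the feed_line/flush shape used above (the toy is illustrative, not Aether's SSEEventParser):

class ToySSEParser:
    """Toy SSE accumulator: collects event:/data: fields; a blank line closes the event."""

    def __init__(self) -> None:
        self._event: dict = {}

    def feed_line(self, line: str) -> list[dict]:
        if line == "":  # blank line = event boundary
            done, self._event = ([self._event] if self._event else []), {}
            return done
        if line.startswith("event: "):
            self._event["event"] = line[7:]
        elif line.startswith("data: "):
            self._event["data"] = self._event.get("data", "") + line[6:]
        return []

    def flush(self) -> list[dict]:
        done, self._event = ([self._event] if self._event else []), {}
        return done


# With CRLF framing, a "blank" line arrives as "\r\n"; without rstrip("\r\n") it would
# never compare equal to "" and the event would never close.
p = ToySSEParser()
for raw in ["event: message\r\n", 'data: {"x": 1}\r\n', "\r\n"]:
    print(p.feed_line(raw.rstrip("\r\n")))
# [], [], [{'event': 'message', 'data': '{"x": 1}'}]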
@@ -317,16 +400,26 @@ class StreamProcessor:
             Response data chunks
         """
         try:
+            # Disconnect-check frequency: every await adds scheduling overhead, and checking
+            # too often makes the stream emit in stop-and-go bursts. Throttle the check by
+            # time interval to balance stopping upstream reads promptly against smooth throughput.
+            next_disconnect_check_at = 0.0
+            disconnect_check_interval_s = 0.25
+
             async for chunk in stream_generator:
-                if await is_disconnected():
-                    logger.warning(f"ID:{self.request_id} | Client disconnected")
-                    ctx.status_code = 499  # Client Closed Request
-                    ctx.error_message = "client_disconnected"
-                    break
+                now = time.monotonic()
+                if now >= next_disconnect_check_at:
+                    next_disconnect_check_at = now + disconnect_check_interval_s
+                    if await is_disconnected():
+                        logger.warning(f"ID:{self.request_id} | Client disconnected")
+                        ctx.status_code = 499  # Client Closed Request
+                        ctx.error_message = "client_disconnected"
+
+                        break
                 yield chunk
         except asyncio.CancelledError:
             ctx.status_code = 499
             ctx.error_message = "client_disconnected"
 
             raise
         except Exception as e:
             ctx.status_code = 500
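The throttling added here trades disconnect-detection latency (up to ~250 ms) for smoother throughput, since awaiting is_disconnected() on every chunk adds scheduler overhead. The pattern in isolation, as a generic sketch rather than the full create_monitored_stream:

import time
from typing import AsyncGenerator, AsyncIterator, Awaitable, Callable


async def throttled_guard(
    chunks: AsyncIterator[bytes],
    is_disconnected: Callable[[], Awaitable[bool]],
    interval_s: float = 0.25,
) -> AsyncGenerator[bytes, None]:
    next_check_at = 0.0  # 0.0 guarantees a check on the first chunk
    async for chunk in chunks:
        now = time.monotonic()
        if now >= next_check_at:
            next_check_at = now + interval_s
            if await is_disconnected():
                break  # stop reading from upstream promptly
        yield chunk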