feat: add daily token limit with actual usage tracking (#171)

* feat: add daily token limit with actual usage tracking

- Add DAILY_TOKEN_LIMIT env var for configurable daily token limit
- Track actual tokens from Bedrock API response metadata (not estimates)
- Server sends inputTokens + cachedInputTokens + outputTokens via messageMetadata
- Client increments token count in onFinish callback with actual usage
- Add NaN guards against corrupted localStorage values (see the sketch after this list)
- Add token limit toast notification with quota display
- Remove client-side token estimation (was blocking legitimate requests)
- Switch to js-tiktoken for client compatibility (pure JS, no WASM)
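
A minimal sketch of the client-side bookkeeping described above, assuming
a date-keyed localStorage entry and the limits endpoint shown at the end
of this diff; all identifiers here are illustrative, not the repo's
actual names:

    // Sketch only: hypothetical helpers for the daily token counter.
    const DAILY_TOKENS_KEY = "daily-token-usage"

    interface DailyUsage {
      date: string // e.g. "2025-12-08"; a new date resets the counter
      tokens: number
    }

    function readDailyUsage(): DailyUsage {
      const today = new Date().toISOString().slice(0, 10)
      try {
        const raw = JSON.parse(localStorage.getItem(DAILY_TOKENS_KEY) ?? "null")
        const tokens = Number(raw?.tokens)
        // NaN guard: corrupted values fall back to zero instead of breaking the check
        if (raw?.date === today && Number.isFinite(tokens) && tokens >= 0) {
          return { date: today, tokens }
        }
      } catch {
        // malformed JSON: start fresh
      }
      return { date: today, tokens: 0 }
    }

    function incrementDailyTokens(inputTokens: number, outputTokens: number): void {
      const usage = readDailyUsage()
      usage.tokens +=
        (Number.isFinite(inputTokens) ? inputTokens : 0) +
        (Number.isFinite(outputTokens) ? outputTokens : 0)
      localStorage.setItem(DAILY_TOKENS_KEY, JSON.stringify(usage))
    }

    function isDailyTokenLimitReached(dailyTokenLimit: number): boolean {
      // 0 means "no limit", matching the parseInt(... || "0") default in the config route
      if (!dailyTokenLimit) return false
      return readDailyUsage().tokens >= dailyTokenLimit
    }

The onFinish callback would call incrementDailyTokens with the inputTokens
and outputTokens fields the server attaches via messageMetadata in the last
hunk of the chat route diff below; the exact wiring depends on the AI SDK
version in use.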

* feat: add TPM (tokens per minute) rate limiting

- Add 50k tokens/min client-side rate limit
- Track tokens per minute with automatic minute rollover
- Check TPM limit after daily limits pass
- Show toast when rate limit reached
- Add NaN guards for corrupted localStorage values (a sketch of the minute-bucket tracking follows this list)
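
A rough sketch of the minute-bucket idea, again with illustrative names;
the check here is read-only, and the matching writer appears after the
race-condition note below:

    // Sketch only: tokens-per-minute tracking with automatic minute rollover.
    // tpmLimit is assumed to come from the config endpoint (TPM_LIMIT, 0 = disabled).
    const TPM_KEY = "tpm-usage"

    interface TPMBucket {
      minute: number // epoch minute the bucket belongs to
      tokens: number
    }

    const currentMinute = () => Math.floor(Date.now() / 60_000)

    function readTPMBucket(): TPMBucket {
      try {
        const raw = JSON.parse(localStorage.getItem(TPM_KEY) ?? "null")
        const minute = Number(raw?.minute)
        const tokens = Number(raw?.tokens)
        // NaN guard for corrupted localStorage values
        if (Number.isFinite(minute) && Number.isFinite(tokens)) {
          return { minute, tokens }
        }
      } catch {
        // fall through to an empty bucket
      }
      return { minute: currentMinute(), tokens: 0 }
    }

    // Only consulted after the daily request/token limits pass.
    function isTPMLimitReached(tpmLimit: number): boolean {
      if (!tpmLimit) return false
      const bucket = readTPMBucket()
      // A bucket from an earlier minute has effectively rolled over to zero
      if (bucket.minute !== currentMinute()) return false
      return bucket.tokens >= tpmLimit
    }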

* feat: make TPM limit configurable via TPM_LIMIT env var

* chore: restore cache debug logs

* fix: prevent race condition in TPM tracking

checkTPMLimit was resetting the TPM count to 0 on every check, which
overwrote the count saved by incrementTPMCount. Now checkTPMLimit only
reads the stored value, and incrementTPMCount handles all writes (see
the sketch below).
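
Sketched with the illustrative names from the TPM block above, the fixed
split of responsibilities looks roughly like this:

    // Sketch only: incrementTPMCount is the sole writer, so a concurrent
    // checkTPMLimit-style read can no longer clobber the count by resetting it to 0.
    function incrementTPMCount(tokens: number): void {
      const now = currentMinute()
      const bucket = readTPMBucket()
      const delta = Number.isFinite(tokens) ? tokens : 0
      const next: TPMBucket =
        bucket.minute === now
          ? { minute: now, tokens: bucket.tokens + delta } // same minute: accumulate
          : { minute: now, tokens: delta } // new minute: roll the bucket over
      localStorage.setItem(TPM_KEY, JSON.stringify(next))
    }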

* chore: improve TPM limit error message clarity

Author:    Dayuan Jiang
Date:      2025-12-08 18:56:34 +09:00
Committer: GitHub
Commit:    622829b903 (parent 728dda5267)

7 changed files with 285 additions and 66 deletions


@@ -189,32 +189,11 @@ async function handleChatRequest(req: Request): Promise<Response> {
    const textPart = lastMessage.parts?.find((p: any) => p.type === "text")
    const filePart = lastMessage.parts?.find((p: any) => p.type === "file")
    console.log("[Cache DEBUG] textPart?.text:", textPart?.text)
    console.log("[Cache DEBUG] hasFilePart:", !!filePart)
    const cached = findCachedResponse(textPart?.text || "", !!filePart)
    console.log("[Cache DEBUG] cached found:", !!cached)
    if (cached) {
      console.log(
        "[Cache] Returning cached response for:",
        textPart?.text,
      )
      return createCachedStreamResponse(cached.xml)
    } else {
      console.log("[Cache DEBUG] No cache match - checking why...")
      console.log(
        "[Cache DEBUG] Exact promptText:",
        JSON.stringify(textPart?.text),
      )
    }
  } else {
    console.log("[Cache DEBUG] Skipping cache check - conditions not met")
    if (!isFirstMessage)
      console.log("[Cache DEBUG] Reason: not first message")
    if (!isEmptyDiagram)
      console.log("[Cache DEBUG] Reason: diagram not empty")
  }
  // === CACHE CHECK END ===
@@ -243,28 +222,6 @@ ${lastMessageText}
  // Convert UIMessages to ModelMessages and add system message
  const modelMessages = convertToModelMessages(messages)
  // Debug: log raw messages to see what's coming in
  console.log(
    "[DEBUG] Raw UI messages:",
    JSON.stringify(
      messages.map((m: any, i: number) => ({
        index: i,
        role: m.role,
        partsCount: m.parts?.length,
        parts: m.parts?.map((p: any) => ({
          type: p.type,
          toolName: p.toolName,
          toolCallId: p.toolCallId,
          state: p.state,
          inputType: p.input ? typeof p.input : undefined,
          input: p.input,
        })),
      })),
      null,
      2,
    ),
  )
  // Fix tool call inputs for Bedrock API (requires JSON objects, not strings)
  const fixedMessages = fixToolCallInputs(modelMessages)
@@ -383,14 +340,8 @@ ${lastMessageText}
      }
      return null
    },
    onFinish: ({ text, usage, providerMetadata }) => {
      console.log(
        "[Cache] Full providerMetadata:",
        JSON.stringify(providerMetadata, null, 2),
      )
      console.log("[Cache] Usage:", JSON.stringify(usage, null, 2))
    onFinish: ({ text, usage }) => {
      // Pass usage to Langfuse (Bedrock streaming doesn't auto-report tokens to telemetry)
      // AI SDK uses inputTokens/outputTokens, Langfuse expects promptTokens/completionTokens
      setTraceOutput(text, {
        promptTokens: usage?.inputTokens,
        completionTokens: usage?.outputTokens,
@@ -476,7 +427,28 @@ IMPORTANT: Keep edits concise:
    }),
  })
  return result.toUIMessageStreamResponse()
  return result.toUIMessageStreamResponse({
    messageMetadata: ({ part }) => {
      if (part.type === "finish") {
        const usage = (part as any).totalUsage
        if (!usage) {
          console.warn(
            "[messageMetadata] No usage data in finish part",
          )
          return undefined
        }
        // Total input = non-cached + cached (these are separate counts)
        // Note: cacheWriteInputTokens is not available on finish part
        const totalInputTokens =
          (usage.inputTokens ?? 0) + (usage.cachedInputTokens ?? 0)
        return {
          inputTokens: totalInputTokens,
          outputTokens: usage.outputTokens ?? 0,
        }
      }
      return undefined
    },
  })
}
// Wrap handler with error handling


@@ -9,5 +9,7 @@ export async function GET() {
  return NextResponse.json({
    accessCodeRequired: accessCodes.length > 0,
    dailyRequestLimit: parseInt(process.env.DAILY_REQUEST_LIMIT || "0", 10),
    dailyTokenLimit: parseInt(process.env.DAILY_TOKEN_LIMIT || "0", 10),
    tpmLimit: parseInt(process.env.TPM_LIMIT || "0", 10),
  })
}
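
On the client, these values would be read roughly like this (a sketch: the
field names follow the diff above, while the route path and all other
identifiers are assumptions):

    // Sketch only: fetch the limits exposed by the route above.
    // "/api/config" is an assumed path; the field names match the JSON in the diff.
    interface LimitsConfig {
      accessCodeRequired: boolean
      dailyRequestLimit: number // 0 = no daily request limit
      dailyTokenLimit: number // 0 = no daily token limit
      tpmLimit: number // 0 = no tokens-per-minute limit
    }

    async function fetchLimitsConfig(): Promise<LimitsConfig> {
      const res = await fetch("/api/config")
      if (!res.ok) throw new Error(`config request failed: ${res.status}`)
      return (await res.json()) as LimitsConfig
    }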