feat: support cache control for Claude

Bing Zhu 2025-08-28 23:06:50 +08:00
parent 2875af774e
commit c45e28a806


@@ -962,6 +962,84 @@ export function resetAnthropicClient(): void {
  * 4. Fallback region (us-east5)
  */
+/**
+ * Manage cache control to ensure it doesn't exceed Claude's 4 cache block limit
+ * Priority:
+ * 1. System prompts (high priority)
+ * 2. Long documents or reference materials (high priority)
+ * 3. Reusable context (medium priority)
+ * 4. Short messages or one-time content (no caching)
+ */
+function applyCacheControlWithLimits(
+  systemBlocks: TextBlockParam[],
+  messageParams: MessageParam[]
+): { systemBlocks: TextBlockParam[]; messageParams: MessageParam[] } {
+  if (!PROMPT_CACHING_ENABLED) {
+    return { systemBlocks, messageParams }
+  }
+
+  const maxCacheBlocks = 4
+  let usedCacheBlocks = 0
+
+  // 1. Prioritize adding cache to system prompts (highest priority)
+  const processedSystemBlocks = systemBlocks.map((block, index) => {
+    if (usedCacheBlocks < maxCacheBlocks && block.text.length > 1000) {
+      usedCacheBlocks++
+      return {
+        ...block,
+        cache_control: { type: 'ephemeral' as const }
+      }
+    }
+    const { cache_control, ...blockWithoutCache } = block
+    return blockWithoutCache
+  })
+
+  // 2. Add cache to message content based on priority
+  const processedMessageParams = messageParams.map((message, messageIndex) => {
+    if (Array.isArray(message.content)) {
+      const processedContent = message.content.map((contentBlock, blockIndex) => {
+        // Determine whether this content block should be cached
+        const shouldCache =
+          usedCacheBlocks < maxCacheBlocks &&
+          contentBlock.type === 'text' &&
+          typeof contentBlock.text === 'string' &&
+          (
+            // Long documents (over 2000 characters)
+            contentBlock.text.length > 2000 ||
+            // Last content block of the last message (may be important context)
+            (messageIndex === messageParams.length - 1 &&
+              blockIndex === message.content.length - 1 &&
+              contentBlock.text.length > 500)
+          )
+
+        if (shouldCache) {
+          usedCacheBlocks++
+          return {
+            ...contentBlock,
+            cache_control: { type: 'ephemeral' as const }
+          }
+        }
+        // Remove existing cache_control
+        const { cache_control, ...blockWithoutCache } = contentBlock as any
+        return blockWithoutCache
+      })
+
+      return {
+        ...message,
+        content: processedContent
+      }
+    }
+    return message
+  })
+
+  return {
+    systemBlocks: processedSystemBlocks,
+    messageParams: processedMessageParams
+  }
+}
 export function userMessageToMessageParam(
   message: UserMessage,
   addCache = false,
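As a quick check on the limiter above (an editorial sketch, not part of this commit): feed five oversized system blocks through applyCacheControlWithLimits and at most four come back with an ephemeral cache_control marker. The contents below are hypothetical, the types are assumed to come from @anthropic-ai/sdk, and PROMPT_CACHING_ENABLED is assumed to be true.

import type { TextBlockParam, MessageParam } from '@anthropic-ai/sdk/resources/messages'

// Five system blocks, each long enough (> 1000 chars) to qualify for caching.
const blocks: TextBlockParam[] = Array.from({ length: 5 }, (_, i) => ({
  type: 'text',
  text: `system prompt section ${i} `.repeat(100), // ~2400 characters
}))

const { systemBlocks } = applyCacheControlWithLimits(blocks, [])
// Only the first four blocks receive the marker; a fifth would exceed
// Claude's 4 cache block limit, so its cache_control is stripped instead.
console.log(systemBlocks.filter(b => 'cache_control' in b).length) // 4

// Message side: a long reference document (> 2000 chars) qualifies,
// a short question does not.
const messages: MessageParam[] = [
  {
    role: 'user',
    content: [
      { type: 'text', text: 'short question' }, // left uncached
      { type: 'text', text: 'reference doc '.repeat(200) }, // cached: > 2000 chars
    ],
  },
]
const { messageParams } = applyCacheControlWithLimits([], messages)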
@@ -974,23 +1052,13 @@ export function userMessageToMessageParam(
         {
           type: 'text',
           text: message.message.content,
-          ...(PROMPT_CACHING_ENABLED
-            ? { cache_control: { type: 'ephemeral' } }
-            : {}),
         },
       ],
     }
   } else {
     return {
       role: 'user',
-      content: message.message.content.map((_, i) => ({
-        ..._,
-        ...(i === message.message.content.length - 1
-          ? PROMPT_CACHING_ENABLED
-            ? { cache_control: { type: 'ephemeral' } }
-            : {}
-          : {}),
-      })),
+      content: message.message.content.map((_) => ({ ..._ })),
     }
   }
 }
@@ -1012,25 +1080,13 @@ export function assistantMessageToMessageParam(
         {
           type: 'text',
           text: message.message.content,
-          ...(PROMPT_CACHING_ENABLED
-            ? { cache_control: { type: 'ephemeral' } }
-            : {}),
         },
       ],
     }
   } else {
     return {
       role: 'assistant',
-      content: message.message.content.map((_, i) => ({
-        ..._,
-        ...(i === message.message.content.length - 1 &&
-          _.type !== 'thinking' &&
-          _.type !== 'redacted_thinking'
-          ? PROMPT_CACHING_ENABLED
-            ? { cache_control: { type: 'ephemeral' } }
-            : {}
-          : {}),
-      })),
+      content: message.message.content.map((_) => ({ ..._ })),
     }
   }
 }
@@ -1383,9 +1439,6 @@ async function queryAnthropicNative(
   const system: TextBlockParam[] = splitSysPromptPrefix(systemPrompt).map(
     _ => ({
-      ...(PROMPT_CACHING_ENABLED
-        ? { cache_control: { type: 'ephemeral' } }
-        : {}),
       text: _,
       type: 'text',
     }),
@@ -1404,6 +1457,10 @@
   )
   const anthropicMessages = addCacheBreakpoints(messages)
+
+  // apply cache control
+  const { systemBlocks: processedSystem, messageParams: processedMessages } =
+    applyCacheControlWithLimits(system, anthropicMessages)
   const startIncludingRetries = Date.now()
   // Log the system prompt construction process
@@ -1426,8 +1483,8 @@
   const params: Anthropic.Beta.Messages.MessageCreateParams = {
     model,
     max_tokens: getMaxTokensFromProfile(modelProfile),
-    messages: anthropicMessages,
-    system,
+    messages: processedMessages,
+    system: processedSystem,
     tools: toolSchemas.length > 0 ? toolSchemas : undefined,
     tool_choice: toolSchemas.length > 0 ? { type: 'auto' } : undefined,
   }
@@ -1450,6 +1507,7 @@
       : null,
     maxTokens: params.max_tokens,
     temperature: MAIN_QUERY_TEMPERATURE,
+    params: params,
     messageCount: params.messages?.length || 0,
     streamMode: true,
     toolsCount: toolSchemas.length,
@@ -1609,6 +1667,10 @@
     }
   }, { signal })
+
+  debugLogger.api('ANTHROPIC_API_CALL_SUCCESS', {
+    content: response.content
+  })
   const ttftMs = Date.now() - start
   const durationMs = Date.now() - startIncludingRetries
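To confirm the breakpoints actually take effect, the Messages API reports cache activity in the response's usage object; a follow-up log along these lines would surface it (a suggestion, not part of this commit; the event name is made up):

// cache_creation_input_tokens: tokens written to the cache on this call.
// cache_read_input_tokens: tokens served from a previously cached prefix.
debugLogger.api('ANTHROPIC_CACHE_USAGE', {
  cacheCreated: response.usage.cache_creation_input_tokens,
  cacheRead: response.usage.cache_read_input_tokens,
})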