diff --git a/app/llm.py b/app/llm.py
index 18a13af..47e18ab 100644
--- a/app/llm.py
+++ b/app/llm.py
@@ -59,6 +59,7 @@ class LLM:
 
             # Add token counting related attributes
             self.total_input_tokens = 0
+            self.total_completion_tokens = 0
             self.max_input_tokens = (
                 llm_config.max_input_tokens
                 if hasattr(llm_config, "max_input_tokens")
@@ -129,12 +130,15 @@ class LLM:
 
         return token_count
 
-    def update_token_count(self, input_tokens: int) -> None:
+    def update_token_count(self, input_tokens: int, completion_tokens: int = 0) -> None:
         """Update token counts"""
         # Only track tokens if max_input_tokens is set
         self.total_input_tokens += input_tokens
+        self.total_completion_tokens += completion_tokens
         logger.info(
-            f"Token usage: Input={input_tokens}, Cumulative Input={self.total_input_tokens}"
+            f"Token usage: Input={input_tokens}, Completion={completion_tokens}, "
+            f"Cumulative Input={self.total_input_tokens}, Cumulative Completion={self.total_completion_tokens}, "
+            f"Total={input_tokens + completion_tokens}, Cumulative Total={self.total_input_tokens + self.total_completion_tokens}"
         )
 
     def check_token_limit(self, input_tokens: int) -> bool:
@@ -271,7 +275,9 @@ class LLM:
                     raise ValueError("Empty or invalid response from LLM")
 
                 # Update token counts
-                self.update_token_count(response.usage.prompt_tokens)
+                self.update_token_count(
+                    response.usage.prompt_tokens, response.usage.completion_tokens
+                )
 
                 return response.choices[0].message.content
 
@@ -282,9 +288,11 @@ class LLM:
             response = await self.client.chat.completions.create(**params)
 
             collected_messages = []
+            completion_text = ""
             async for chunk in response:
                 chunk_message = chunk.choices[0].delta.content or ""
                 collected_messages.append(chunk_message)
+                completion_text += chunk_message
                 print(chunk_message, end="", flush=True)
 
             print()  # Newline after streaming
@@ -292,6 +300,13 @@ class LLM:
             if not full_response:
                 raise ValueError("Empty response from streaming LLM")
 
+            # Estimate completion tokens for streaming responses
+            completion_tokens = self.count_tokens(completion_text)
+            logger.info(
+                f"Estimated completion tokens for streaming response: {completion_tokens}"
+            )
+            self.total_completion_tokens += completion_tokens
+
             return full_response
 
         except TokenLimitExceeded:
@@ -412,7 +427,9 @@ class LLM:
                 raise ValueError("Invalid or empty response from LLM")
 
             # Update token counts
-            self.update_token_count(response.usage.prompt_tokens)
+            self.update_token_count(
+                response.usage.prompt_tokens, response.usage.completion_tokens
+            )
 
             return response.choices[0].message
 
diff --git a/app/tool/file_saver.py b/app/tool/file_saver.py
index 96d64b3..7d92a02 100644
--- a/app/tool/file_saver.py
+++ b/app/tool/file_saver.py
@@ -2,8 +2,8 @@ import os
 
 import aiofiles
 
-from app.tool.base import BaseTool
 from app.config import WORKSPACE_ROOT
+from app.tool.base import BaseTool
 
 
 class FileSaver(BaseTool):