LiteLLM Minor Fixes & Improvements (10/09/2024) (#6139)
* fix(utils.py): don't return 'none' response headers

Fixes #6123

* fix(vertex_and_google_ai_studio_gemini.py): support parsing out additional properties and strict value for tool calls

Fixes #6136

* fix(cost_calculator.py): set default character value to none

Fixes #6133 (comment)

* fix(google.py): fix cost per token / cost per char conversion

Fixes #6133 (comment)

* build(model_prices_and_context_window.json): update gemini pricing

Fixes #6133

* build(model_prices_and_context_window.json): update gemini pricing

* fix(litellm_logging.py): fix streaming caching logging when 'turn_off_message_logging' enabled

Stores unredacted response in cache

* build(model_prices_and_context_window.json): update gemini-1.5-flash pricing

* fix(cost_calculator.py): fix default prompt_character count logic

Fixes error in gemini cost calculation

* fix(cost_calculator.py): fix cost calc for tts models
krrishdholakia authored Oct 10, 2024
1 parent 60baa65 commit 6005450
Showing 16 changed files with 761 additions and 507 deletions.
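Note on the Gemini tool-call fix (#6136): the vertex_and_google_ai_studio_gemini.py diff is not reproduced below. As a rough, hypothetical sketch of the kind of transform involved (not the actual LiteLLM code), OpenAI-style tool schemas can carry JSON-schema keys such as "additionalProperties" and "strict" that Gemini function declarations do not accept, so they have to be parsed out of the tool definition before the request is sent to Vertex AI / Google AI Studio. The helper name below is illustrative only.

from typing import Any, Dict

def _strip_unsupported_schema_fields(schema: Dict[str, Any]) -> Dict[str, Any]:
    """Recursively drop JSON-schema keys that Gemini's schema dialect rejects."""
    cleaned: Dict[str, Any] = {}
    for key, value in schema.items():
        if key in ("additionalProperties", "strict"):
            continue  # OpenAI-specific keys; assumed unsupported by Gemini
        if isinstance(value, dict):
            cleaned[key] = _strip_unsupported_schema_fields(value)
        elif isinstance(value, list):
            cleaned[key] = [
                _strip_unsupported_schema_fields(v) if isinstance(v, dict) else v
                for v in value
            ]
        else:
            cleaned[key] = value
    return cleaned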
31 changes: 18 additions & 13 deletions litellm/cost_calculator.py
@@ -87,8 +87,8 @@ def cost_per_token(
     custom_llm_provider: Optional[str] = None,
     region_name=None,
     ### CHARACTER PRICING ###
-    prompt_characters: int = 0,
-    completion_characters: int = 0,
+    prompt_characters: Optional[int] = None,
+    completion_characters: Optional[int] = None,
     ### PROMPT CACHING PRICING ### - used for anthropic
     cache_creation_input_tokens: Optional[int] = 0,
     cache_read_input_tokens: Optional[int] = 0,
@@ -201,13 +201,24 @@ def cost_per_token(
         model = model_without_prefix

     # see this https://learn.microsoft.com/en-us/azure/ai-services/openai/concepts/models
-    print_verbose(f"Looking up model={model} in model_cost_map")
+    print_verbose(
+        f"Looking up model={model} in model_cost_map, custom_llm_provider={custom_llm_provider}, call_type={call_type}"
+    )
     if call_type == "speech" or call_type == "aspeech":
+        if prompt_characters is None:
+            raise ValueError(
+                "prompt_characters must be provided for tts calls. prompt_characters={}, model={}, custom_llm_provider={}, call_type={}".format(
+                    prompt_characters,
+                    model,
+                    custom_llm_provider,
+                    call_type,
+                )
+            )
         prompt_cost, completion_cost = _generic_cost_per_character(
             model=model_without_prefix,
             custom_llm_provider=custom_llm_provider,
             prompt_characters=prompt_characters,
-            completion_characters=completion_characters,
+            completion_characters=0,
             custom_prompt_cost=None,
             custom_completion_cost=0,
         )
@@ -232,10 +243,6 @@ def cost_per_token(
         cost_router = google_cost_router(
             model=model_without_prefix,
             custom_llm_provider=custom_llm_provider,
-            prompt_characters=prompt_characters,
-            completion_characters=completion_characters,
-            prompt_tokens=prompt_tokens,
-            completion_tokens=completion_tokens,
             call_type=call_type,
         )
         if cost_router == "cost_per_character":
@@ -542,9 +549,9 @@ def completion_cost(
         model = "dall-e-2"  # for dall-e-2, azure expects an empty model name
     # Handle Inputs to completion_cost
     prompt_tokens = 0
-    prompt_characters = 0
+    prompt_characters: Optional[int] = None
     completion_tokens = 0
-    completion_characters = 0
+    completion_characters: Optional[int] = None
     cache_creation_input_tokens: Optional[int] = None
     cache_read_input_tokens: Optional[int] = None
     if completion_response is not None and (
@@ -721,10 +728,8 @@ def completion_cost(
             prompt_string = litellm.utils.get_formatted_prompt(
                 data={"messages": messages}, call_type="completion"
             )
-        else:
-            prompt_string = ""

-        prompt_characters = litellm.utils._count_characters(text=prompt_string)
+            prompt_characters = litellm.utils._count_characters(text=prompt_string)
         if completion_response is not None and isinstance(
             completion_response, ModelResponse
         ):
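Why the Optional[int] change above matters: with a default of 0, a caller that never counted characters was indistinguishable from a genuinely empty prompt, so character-priced models could silently yield a $0 prompt cost, and the speech/aspeech path had no way to detect a missing count. A minimal sketch of the resulting routing decision (simplified, assumed function name):

from typing import Optional

def choose_pricing_path(prompt_characters: Optional[int]) -> str:
    # None now means "characters were never counted", which is distinct
    # from a legitimately empty prompt of 0 characters.
    if prompt_characters is None:
        return "cost_per_token"  # fall back to token-based pricing
    return "cost_per_character"

assert choose_pricing_path(None) == "cost_per_token"
assert choose_pricing_path(0) == "cost_per_character"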
38 changes: 20 additions & 18 deletions litellm/litellm_core_utils/litellm_logging.py
@@ -901,14 +901,19 @@ def success_handler(
                     complete_streaming_response = None
             else:
                 self.sync_streaming_chunks.append(result)

+        _caching_complete_streaming_response: Optional[
+            Union[ModelResponse, TextCompletionResponse]
+        ] = None
         if complete_streaming_response is not None:
             verbose_logger.debug(
                 "Logging Details LiteLLM-Success Call streaming complete"
             )
             self.model_call_details["complete_streaming_response"] = (
                 complete_streaming_response
             )
+            _caching_complete_streaming_response = copy.deepcopy(
+                complete_streaming_response
+            )
             self.model_call_details["response_cost"] = (
                 self._response_cost_calculator(result=complete_streaming_response)
             )
@@ -937,6 +942,20 @@ def success_handler(
         else:
             callbacks = litellm.success_callback

+        ## STREAMING CACHING ##
+        if "cache" in callbacks and litellm.cache is not None:
+            # this only logs streaming once, complete_streaming_response exists i.e when stream ends
+            print_verbose("success_callback: reaches cache for logging!")
+            kwargs = self.model_call_details
+            if self.stream and _caching_complete_streaming_response is not None:
+                print_verbose(
+                    "success_callback: reaches cache for logging, there is a complete_streaming_response. Adding to cache"
+                )
+                result = _caching_complete_streaming_response
+                # only add to cache once we have a complete streaming response
+                litellm.cache.add_cache(result, **kwargs)
+
         ## REDACT MESSAGES ##
         result = redact_message_input_output_from_logging(
             model_call_details=(
                 self.model_call_details
@@ -1302,23 +1321,6 @@ def success_handler(
                             end_time=end_time,
                             print_verbose=print_verbose,
                         )
-                    if callback == "cache" and litellm.cache is not None:
-                        # this only logs streaming once, complete_streaming_response exists i.e when stream ends
-                        print_verbose("success_callback: reaches cache for logging!")
-                        kwargs = self.model_call_details
-                        if self.stream:
-                            if "complete_streaming_response" not in kwargs:
-                                print_verbose(
-                                    f"success_callback: reaches cache for logging, there is no complete_streaming_response. Kwargs={kwargs}\n\n"
-                                )
-                                pass
-                            else:
-                                print_verbose(
-                                    "success_callback: reaches cache for logging, there is a complete_streaming_response. Adding to cache"
-                                )
-                                result = kwargs["complete_streaming_response"]
-                                # only add to cache once we have a complete streaming response
-                                litellm.cache.add_cache(result, **kwargs)
                     if callback == "athina" and athinaLogger is not None:
                         deep_copy = {}
                         for k, v in self.model_call_details.items():
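The net effect of the logging change above: the cache write now happens before message redaction, using a deep copy taken while the complete streaming response is still intact, so enabling litellm.turn_off_message_logging no longer stores redacted placeholders in the cache. A simplified sketch of the new ordering (assumed function shape, not the real Logging class):

import copy

def handle_success(response, kwargs, cache, redact):
    # Snapshot the complete, unredacted response before redaction runs.
    cached_response = copy.deepcopy(response)
    cache.add_cache(cached_response, **kwargs)
    # Redaction now only affects what the remaining success callbacks
    # see, not what was stored in the cache.
    response = redact(response)
    return response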
157 changes: 83 additions & 74 deletions litellm/litellm_core_utils/llm_cost_calc/google.py
@@ -32,10 +32,6 @@ def _is_above_128k(tokens: float) -> bool:
 def cost_router(
     model: str,
     custom_llm_provider: str,
-    prompt_tokens: float,
-    completion_tokens: float,
-    prompt_characters: float,
-    completion_characters: float,
     call_type: Union[Literal["embedding", "aembedding"], str],
 ) -> Literal["cost_per_character", "cost_per_token"]:
     """
@@ -66,8 +62,8 @@ def cost_per_character(
     custom_llm_provider: str,
     prompt_tokens: float,
     completion_tokens: float,
-    prompt_characters: float,
-    completion_characters: float,
+    prompt_characters: Optional[float] = None,
+    completion_characters: Optional[float] = None,
 ) -> Tuple[float, float]:
     """
     Calculates the cost per character for a given VertexAI model, input messages, and response object.
@@ -94,87 +90,100 @@ def cost_per_character(
     )

     ## CALCULATE INPUT COST
-    try:
-        if (
-            _is_above_128k(tokens=prompt_characters * 4)  # 1 token = 4 char
-            and model not in models_without_dynamic_pricing
-        ):
-            ## check if character pricing, else default to token pricing
-            assert (
-                "input_cost_per_character_above_128k_tokens" in model_info
-                and model_info["input_cost_per_character_above_128k_tokens"] is not None
-            ), "model info for model={} does not have 'input_cost_per_character_above_128k_tokens'-pricing for > 128k tokens\nmodel_info={}".format(
-                model, model_info
-            )
-            prompt_cost = (
-                prompt_characters
-                * model_info["input_cost_per_character_above_128k_tokens"]
-            )
-        else:
-            assert (
-                "input_cost_per_character" in model_info
-                and model_info["input_cost_per_character"] is not None
-            ), "model info for model={} does not have 'input_cost_per_character'-pricing\nmodel_info={}".format(
-                model, model_info
-            )
-            prompt_cost = prompt_characters * model_info["input_cost_per_character"]
-    except Exception as e:
-        verbose_logger.exception(
-            "litellm.litellm_core_utils.llm_cost_calc.google.cost_per_character(): Defaulting to (cost_per_token * 4) calculation for prompt_cost. Exception occured - {}".format(
-                str(e)
-            )
-        )
-        initial_prompt_cost, _ = cost_per_token(
-            model=model,
-            custom_llm_provider=custom_llm_provider,
-            prompt_tokens=prompt_tokens,
-            completion_tokens=completion_tokens,
-        )
-
-        prompt_cost = initial_prompt_cost * 4
-
-    ## CALCULATE OUTPUT COST
-    try:
-        if (
-            _is_above_128k(tokens=completion_characters * 4)  # 1 token = 4 char
-            and model not in models_without_dynamic_pricing
-        ):
-            assert (
-                "output_cost_per_character_above_128k_tokens" in model_info
-                and model_info["output_cost_per_character_above_128k_tokens"]
-                is not None
-            ), "model info for model={} does not have 'output_cost_per_character_above_128k_tokens' pricing\nmodel_info={}".format(
-                model, model_info
-            )
-            completion_cost = (
-                completion_tokens
-                * model_info["output_cost_per_character_above_128k_tokens"]
-            )
-        else:
-            assert (
-                "output_cost_per_character" in model_info
-                and model_info["output_cost_per_character"] is not None
-            ), "model info for model={} does not have 'output_cost_per_character'-pricing\nmodel_info={}".format(
-                model, model_info
-            )
-            completion_cost = (
-                completion_tokens * model_info["output_cost_per_character"]
-            )
-    except Exception as e:
-        verbose_logger.exception(
-            "litellm.litellm_core_utils.llm_cost_calc.google.cost_per_character(): \
-            Defaulting to (cost_per_token * 4) calculation for completion_cost\nException occured - {}".format(
-                str(e)
-            )
-        )
-        _, initial_completion_cost = cost_per_token(
-            model=model,
-            custom_llm_provider=custom_llm_provider,
-            prompt_tokens=prompt_tokens,
-            completion_tokens=completion_tokens,
-        )
-
-        completion_cost = initial_completion_cost * 4
+    if prompt_characters is None:
+        prompt_cost, _ = cost_per_token(
+            model=model,
+            custom_llm_provider=custom_llm_provider,
+            prompt_tokens=prompt_tokens,
+            completion_tokens=completion_tokens,
+        )
+    else:
+        try:
+            if (
+                _is_above_128k(tokens=prompt_characters * 4)  # 1 token = 4 char
+                and model not in models_without_dynamic_pricing
+            ):
+                ## check if character pricing, else default to token pricing
+                assert (
+                    "input_cost_per_character_above_128k_tokens" in model_info
+                    and model_info["input_cost_per_character_above_128k_tokens"]
+                    is not None
+                ), "model info for model={} does not have 'input_cost_per_character_above_128k_tokens'-pricing for > 128k tokens\nmodel_info={}".format(
+                    model, model_info
+                )
+                prompt_cost = (
+                    prompt_characters
+                    * model_info["input_cost_per_character_above_128k_tokens"]
+                )
+            else:
+                assert (
+                    "input_cost_per_character" in model_info
+                    and model_info["input_cost_per_character"] is not None
+                ), "model info for model={} does not have 'input_cost_per_character'-pricing\nmodel_info={}".format(
+                    model, model_info
+                )
+                prompt_cost = prompt_characters * model_info["input_cost_per_character"]
+        except Exception as e:
+            verbose_logger.debug(
+                "litellm.litellm_core_utils.llm_cost_calc.google.py::cost_per_character(): Exception occured - {}\nDefaulting to None".format(
+                    str(e)
+                )
+            )
+            prompt_cost, _ = cost_per_token(
+                model=model,
+                custom_llm_provider=custom_llm_provider,
+                prompt_tokens=prompt_tokens,
+                completion_tokens=completion_tokens,
+            )

+    ## CALCULATE OUTPUT COST
+    if completion_characters is None:
+        _, completion_cost = cost_per_token(
+            model=model,
+            custom_llm_provider=custom_llm_provider,
+            prompt_tokens=prompt_tokens,
+            completion_tokens=completion_tokens,
+        )
+    else:
+        try:
+            if (
+                _is_above_128k(tokens=completion_characters * 4)  # 1 token = 4 char
+                and model not in models_without_dynamic_pricing
+            ):
+                assert (
+                    "output_cost_per_character_above_128k_tokens" in model_info
+                    and model_info["output_cost_per_character_above_128k_tokens"]
+                    is not None
+                ), "model info for model={} does not have 'output_cost_per_character_above_128k_tokens' pricing\nmodel_info={}".format(
+                    model, model_info
+                )
+                completion_cost = (
+                    completion_tokens
+                    * model_info["output_cost_per_character_above_128k_tokens"]
+                )
+            else:
+                assert (
+                    "output_cost_per_character" in model_info
+                    and model_info["output_cost_per_character"] is not None
+                ), "model info for model={} does not have 'output_cost_per_character'-pricing\nmodel_info={}".format(
+                    model, model_info
+                )
+                completion_cost = (
+                    completion_characters * model_info["output_cost_per_character"]
+                )
+        except Exception as e:
+            verbose_logger.debug(
+                "litellm.litellm_core_utils.llm_cost_calc.google.py::cost_per_character(): Exception occured - {}\nDefaulting to None".format(
+                    str(e)
+                )
+            )
+            _, completion_cost = cost_per_token(
+                model=model,
+                custom_llm_provider=custom_llm_provider,
+                prompt_tokens=prompt_tokens,
+                completion_tokens=completion_tokens,
+            )

     return prompt_cost, completion_cost


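The restructuring above also retires the old exception-path fallback that multiplied the token-based cost by 4 (via the "1 token = 4 char" heuristic); when character counts are absent, the code now routes straight to token pricing. A worked comparison with an illustrative, non-current rate:

input_cost_per_token = 0.00000125  # assumed rate, for illustration only
prompt_tokens = 1000

# Old exception-path fallback: token-based cost scaled by 4.
old_fallback_cost = (prompt_tokens * input_cost_per_token) * 4  # 0.005

# New behavior when prompt_characters is None: plain token pricing.
new_fallback_cost = prompt_tokens * input_cost_per_token  # 0.00125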
7 changes: 3 additions & 4 deletions litellm/litellm_core_utils/llm_cost_calc/utils.py
@@ -17,9 +17,8 @@ def _generic_cost_per_character(
     custom_completion_cost: Optional[float],
 ) -> Tuple[Optional[float], Optional[float]]:
     """
-    Generic function to help calculate cost per character.
-    """
-    """
+    Calculates cost per character for aspeech/speech calls.
+
     Calculates the cost per character for a given model, input messages, and response object.

     Input:
@@ -29,7 +28,7 @@ def _generic_cost_per_character(
     - completion_characters: float, the number of output characters

     Returns:
-    Tuple[Optional[float], Optional[float]] - prompt_cost_in_usd, completion_cost_in_usd. 
+    Tuple[Optional[float], Optional[float]] - prompt_cost_in_usd, completion_cost_in_usd.
     - returns None if not able to calculate cost.

     Raises:
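For reference, a hypothetical call into _generic_cost_per_character for a speech request (assumed model name and provider; whether pricing exists for them depends on the model cost map, and the function returns None values when it cannot calculate a cost):

from litellm.litellm_core_utils.llm_cost_calc.utils import (
    _generic_cost_per_character,
)

input_text = "Hello, this is a test of the text-to-speech cost path."
prompt_cost, completion_cost = _generic_cost_per_character(
    model="tts-1",  # assumed entry in the pricing map
    custom_llm_provider="openai",
    prompt_characters=len(input_text),  # characters sent to the TTS model
    completion_characters=0,  # pinned to 0 upstream: TTS output is audio
    custom_prompt_cost=None,  # use the rate from the model cost map
    custom_completion_cost=0,
)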
