LiteLLM Minor Fixes & Improvements (10/09/2024) (#6139)
* fix(utils.py): don't return 'none' response headers

Fixes #6123

* fix(vertex_and_google_ai_studio_gemini.py): support parsing out additional properties and strict value for tool calls

Fixes #6136

* fix(cost_calculator.py): set default character value to none

Fixes #6133 (comment)

* fix(google.py): fix cost per token / cost per char conversion

Fixes #6133 (comment)

* build(model_prices_and_context_window.json): update gemini pricing

Fixes #6133

* build(model_prices_and_context_window.json): update gemini pricing

* fix(litellm_logging.py): fix streaming caching logging when 'turn_off_message_logging' enabled

Stores unredacted response in cache

* build(model_prices_and_context_window.json): update gemini-1.5-flash pricing

* fix(cost_calculator.py): fix default prompt_character count logic

Fixes error in gemini cost calculation

* fix(cost_calculator.py): fix cost calc for tts models
krrishdholakia authored Oct 10, 2024
1 parent 60baa65 commit 6005450
Showing 16 changed files with 761 additions and 507 deletions.
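Note on the Gemini tool-call fix (#6136): the vertex_and_google_ai_studio_gemini.py diff is not reproduced below. As a rough, hypothetical sketch of the kind of transform involved (not the actual LiteLLM code), OpenAI-style tool schemas can carry JSON-schema keys such as "additionalProperties" and "strict" that Gemini function declarations do not accept, so they have to be parsed out of the tool definition before the request is sent to Vertex AI / Google AI Studio. The helper name below is illustrative only.

from typing import Any, Dict

def _strip_unsupported_schema_fields(schema: Dict[str, Any]) -> Dict[str, Any]:
    """Recursively drop JSON-schema keys that Gemini's schema dialect rejects."""
    cleaned: Dict[str, Any] = {}
    for key, value in schema.items():
        if key in ("additionalProperties", "strict"):
            continue  # OpenAI-specific keys; assumed unsupported by Gemini
        if isinstance(value, dict):
            cleaned[key] = _strip_unsupported_schema_fields(value)
        elif isinstance(value, list):
            cleaned[key] = [
                _strip_unsupported_schema_fields(v) if isinstance(v, dict) else v
                for v in value
            ]
        else:
            cleaned[key] = value
    return cleaned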
31 changes: 18 additions & 13 deletions litellm/cost_calculator.py
@@ -87,8 +87,8 @@ def cost_per_token(
     custom_llm_provider: Optional[str] = None,
     region_name=None,
     ### CHARACTER PRICING ###
-    prompt_characters: int = 0,
-    completion_characters: int = 0,
+    prompt_characters: Optional[int] = None,
+    completion_characters: Optional[int] = None,
     ### PROMPT CACHING PRICING ### - used for anthropic
     cache_creation_input_tokens: Optional[int] = 0,
     cache_read_input_tokens: Optional[int] = 0,
@@ -201,13 +201,24 @@ def cost_per_token(
         model = model_without_prefix

     # see this https://learn.microsoft.com/en-us/azure/ai-services/openai/concepts/models
-    print_verbose(f"Looking up model={model} in model_cost_map")
+    print_verbose(
+        f"Looking up model={model} in model_cost_map, custom_llm_provider={custom_llm_provider}, call_type={call_type}"
+    )
     if call_type == "speech" or call_type == "aspeech":
+        if prompt_characters is None:
+            raise ValueError(
+                "prompt_characters must be provided for tts calls. prompt_characters={}, model={}, custom_llm_provider={}, call_type={}".format(
+                    prompt_characters,
+                    model,
+                    custom_llm_provider,
+                    call_type,
+                )
+            )
         prompt_cost, completion_cost = _generic_cost_per_character(
             model=model_without_prefix,
             custom_llm_provider=custom_llm_provider,
             prompt_characters=prompt_characters,
-            completion_characters=completion_characters,
+            completion_characters=0,
             custom_prompt_cost=None,
             custom_completion_cost=0,
         )
@@ -232,10 +243,6 @@ def cost_per_token(
         cost_router = google_cost_router(
             model=model_without_prefix,
             custom_llm_provider=custom_llm_provider,
-            prompt_characters=prompt_characters,
-            completion_characters=completion_characters,
-            prompt_tokens=prompt_tokens,
-            completion_tokens=completion_tokens,
             call_type=call_type,
         )
         if cost_router == "cost_per_character":
@@ -542,9 +549,9 @@ def completion_cost(
         model = "dall-e-2"  # for dall-e-2, azure expects an empty model name
     # Handle Inputs to completion_cost
     prompt_tokens = 0
-    prompt_characters = 0
+    prompt_characters: Optional[int] = None
     completion_tokens = 0
-    completion_characters = 0
+    completion_characters: Optional[int] = None
     cache_creation_input_tokens: Optional[int] = None
     cache_read_input_tokens: Optional[int] = None
     if completion_response is not None and (
@@ -721,10 +728,8 @@ def completion_cost(
             prompt_string = litellm.utils.get_formatted_prompt(
                 data={"messages": messages}, call_type="completion"
             )
-        else:
-            prompt_string = ""

-        prompt_characters = litellm.utils._count_characters(text=prompt_string)
+            prompt_characters = litellm.utils._count_characters(text=prompt_string)
         if completion_response is not None and isinstance(
             completion_response, ModelResponse
         ):
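Why the Optional[int] change above matters: with a default of 0, a caller that never counted characters was indistinguishable from a genuinely empty prompt, so character-priced models could silently yield a $0 prompt cost, and the speech/aspeech path had no way to detect a missing count. A minimal sketch of the resulting routing decision (simplified, assumed function name):

from typing import Optional

def choose_pricing_path(prompt_characters: Optional[int]) -> str:
    # None now means "characters were never counted", which is distinct
    # from a legitimately empty prompt of 0 characters.
    if prompt_characters is None:
        return "cost_per_token"  # fall back to token-based pricing
    return "cost_per_character"

assert choose_pricing_path(None) == "cost_per_token"
assert choose_pricing_path(0) == "cost_per_character"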
38 changes: 20 additions & 18 deletions litellm/litellm_core_utils/litellm_logging.py
@@ -901,14 +901,19 @@ def success_handler(
                     complete_streaming_response = None
             else:
                 self.sync_streaming_chunks.append(result)

+        _caching_complete_streaming_response: Optional[
+            Union[ModelResponse, TextCompletionResponse]
+        ] = None
         if complete_streaming_response is not None:
             verbose_logger.debug(
                 "Logging Details LiteLLM-Success Call streaming complete"
             )
             self.model_call_details["complete_streaming_response"] = (
                 complete_streaming_response
             )
+            _caching_complete_streaming_response = copy.deepcopy(
+                complete_streaming_response
+            )
             self.model_call_details["response_cost"] = (
                 self._response_cost_calculator(result=complete_streaming_response)
             )
@@ -937,6 +942,20 @@ def success_handler(
         else:
             callbacks = litellm.success_callback

+        ## STREAMING CACHING ##
+        if "cache" in callbacks and litellm.cache is not None:
+            # this only logs streaming once, complete_streaming_response exists i.e when stream ends
+            print_verbose("success_callback: reaches cache for logging!")
+            kwargs = self.model_call_details
+            if self.stream and _caching_complete_streaming_response is not None:
+                print_verbose(
+                    "success_callback: reaches cache for logging, there is a complete_streaming_response. Adding to cache"
+                )
+                result = _caching_complete_streaming_response
+                # only add to cache once we have a complete streaming response
+                litellm.cache.add_cache(result, **kwargs)
+
         ## REDACT MESSAGES ##
         result = redact_message_input_output_from_logging(
             model_call_details=(
                 self.model_call_details
@@ -1302,23 +1321,6 @@ def success_handler(
                             end_time=end_time,
                             print_verbose=print_verbose,
                         )
-                    if callback == "cache" and litellm.cache is not None:
-                        # this only logs streaming once, complete_streaming_response exists i.e when stream ends
-                        print_verbose("success_callback: reaches cache for logging!")
-                        kwargs = self.model_call_details
-                        if self.stream:
-                            if "complete_streaming_response" not in kwargs:
-                                print_verbose(
-                                    f"success_callback: reaches cache for logging, there is no complete_streaming_response. Kwargs={kwargs}\n\n"
-                                )
-                                pass
-                            else:
-                                print_verbose(
-                                    "success_callback: reaches cache for logging, there is a complete_streaming_response. Adding to cache"
-                                )
-                                result = kwargs["complete_streaming_response"]
-                                # only add to cache once we have a complete streaming response
-                                litellm.cache.add_cache(result, **kwargs)
                     if callback == "athina" and athinaLogger is not None:
                         deep_copy = {}
                         for k, v in self.model_call_details.items():
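The net effect of the logging change above: the cache write now happens before message redaction, using a deep copy taken while the complete streaming response is still intact, so enabling litellm.turn_off_message_logging no longer stores redacted placeholders in the cache. A simplified sketch of the new ordering (assumed function shape, not the real Logging class):

import copy

def handle_success(response, kwargs, cache, redact):
    # Snapshot the complete, unredacted response before redaction runs.
    cached_response = copy.deepcopy(response)
    cache.add_cache(cached_response, **kwargs)
    # Redaction now only affects what the remaining success callbacks
    # see, not what was stored in the cache.
    response = redact(response)
    return response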
157 changes: 83 additions & 74 deletions litellm/litellm_core_utils/llm_cost_calc/google.py
@@ -32,10 +32,6 @@ def _is_above_128k(tokens: float) -> bool:
 def cost_router(
     model: str,
     custom_llm_provider: str,
-    prompt_tokens: float,
-    completion_tokens: float,
-    prompt_characters: float,
-    completion_characters: float,
     call_type: Union[Literal["embedding", "aembedding"], str],
 ) -> Literal["cost_per_character", "cost_per_token"]:
     """
@@ -66,8 +62,8 @@ def cost_per_character(
     custom_llm_provider: str,
     prompt_tokens: float,
     completion_tokens: float,
-    prompt_characters: float,
-    completion_characters: float,
+    prompt_characters: Optional[float] = None,
+    completion_characters: Optional[float] = None,
 ) -> Tuple[float, float]:
     """
     Calculates the cost per character for a given VertexAI model, input messages, and response object.
@@ -94,87 +90,100 @@ def cost_per_character(
     )

     ## CALCULATE INPUT COST
-    try:
-        if (
-            _is_above_128k(tokens=prompt_characters * 4)  # 1 token = 4 char
-            and model not in models_without_dynamic_pricing
-        ):
-            ## check if character pricing, else default to token pricing
-            assert (
-                "input_cost_per_character_above_128k_tokens" in model_info
-                and model_info["input_cost_per_character_above_128k_tokens"] is not None
-            ), "model info for model={} does not have 'input_cost_per_character_above_128k_tokens'-pricing for > 128k tokens\nmodel_info={}".format(
-                model, model_info
-            )
-            prompt_cost = (
-                prompt_characters
-                * model_info["input_cost_per_character_above_128k_tokens"]
-            )
-        else:
-            assert (
-                "input_cost_per_character" in model_info
-                and model_info["input_cost_per_character"] is not None
-            ), "model info for model={} does not have 'input_cost_per_character'-pricing\nmodel_info={}".format(
-                model, model_info
-            )
-            prompt_cost = prompt_characters * model_info["input_cost_per_character"]
-    except Exception as e:
-        verbose_logger.exception(
-            "litellm.litellm_core_utils.llm_cost_calc.google.cost_per_character(): Defaulting to (cost_per_token * 4) calculation for prompt_cost. Exception occured - {}".format(
-                str(e)
-            )
-        )
-        initial_prompt_cost, _ = cost_per_token(
-            model=model,
-            custom_llm_provider=custom_llm_provider,
-            prompt_tokens=prompt_tokens,
-            completion_tokens=completion_tokens,
-        )
-
-        prompt_cost = initial_prompt_cost * 4
-
-    ## CALCULATE OUTPUT COST
-    try:
-        if (
-            _is_above_128k(tokens=completion_characters * 4)  # 1 token = 4 char
-            and model not in models_without_dynamic_pricing
-        ):
-            assert (
-                "output_cost_per_character_above_128k_tokens" in model_info
-                and model_info["output_cost_per_character_above_128k_tokens"]
-                is not None
-            ), "model info for model={} does not have 'output_cost_per_character_above_128k_tokens' pricing\nmodel_info={}".format(
-                model, model_info
-            )
-            completion_cost = (
-                completion_tokens
-                * model_info["output_cost_per_character_above_128k_tokens"]
-            )
-        else:
-            assert (
-                "output_cost_per_character" in model_info
-                and model_info["output_cost_per_character"] is not None
-            ), "model info for model={} does not have 'output_cost_per_character'-pricing\nmodel_info={}".format(
-                model, model_info
-            )
-            completion_cost = (
-                completion_tokens * model_info["output_cost_per_character"]
-            )
-    except Exception as e:
-        verbose_logger.exception(
-            "litellm.litellm_core_utils.llm_cost_calc.google.cost_per_character(): \
-            Defaulting to (cost_per_token * 4) calculation for completion_cost\nException occured - {}".format(
-                str(e)
-            )
-        )
-        _, initial_completion_cost = cost_per_token(
-            model=model,
-            custom_llm_provider=custom_llm_provider,
-            prompt_tokens=prompt_tokens,
-            completion_tokens=completion_tokens,
-        )
-
-        completion_cost = initial_completion_cost * 4
+    if prompt_characters is None:
+        prompt_cost, _ = cost_per_token(
+            model=model,
+            custom_llm_provider=custom_llm_provider,
+            prompt_tokens=prompt_tokens,
+            completion_tokens=completion_tokens,
+        )
+    else:
+        try:
+            if (
+                _is_above_128k(tokens=prompt_characters * 4)  # 1 token = 4 char
+                and model not in models_without_dynamic_pricing
+            ):
+                ## check if character pricing, else default to token pricing
+                assert (
+                    "input_cost_per_character_above_128k_tokens" in model_info
+                    and model_info["input_cost_per_character_above_128k_tokens"]
+                    is not None
+                ), "model info for model={} does not have 'input_cost_per_character_above_128k_tokens'-pricing for > 128k tokens\nmodel_info={}".format(
+                    model, model_info
+                )
+                prompt_cost = (
+                    prompt_characters
+                    * model_info["input_cost_per_character_above_128k_tokens"]
+                )
+            else:
+                assert (
+                    "input_cost_per_character" in model_info
+                    and model_info["input_cost_per_character"] is not None
+                ), "model info for model={} does not have 'input_cost_per_character'-pricing\nmodel_info={}".format(
+                    model, model_info
+                )
+                prompt_cost = prompt_characters * model_info["input_cost_per_character"]
+        except Exception as e:
+            verbose_logger.debug(
+                "litellm.litellm_core_utils.llm_cost_calc.google.py::cost_per_character(): Exception occured - {}\nDefaulting to None".format(
+                    str(e)
+                )
+            )
+            prompt_cost, _ = cost_per_token(
+                model=model,
+                custom_llm_provider=custom_llm_provider,
+                prompt_tokens=prompt_tokens,
+                completion_tokens=completion_tokens,
+            )

+    ## CALCULATE OUTPUT COST
+    if completion_characters is None:
+        _, completion_cost = cost_per_token(
+            model=model,
+            custom_llm_provider=custom_llm_provider,
+            prompt_tokens=prompt_tokens,
+            completion_tokens=completion_tokens,
+        )
+    else:
+        try:
+            if (
+                _is_above_128k(tokens=completion_characters * 4)  # 1 token = 4 char
+                and model not in models_without_dynamic_pricing
+            ):
+                assert (
+                    "output_cost_per_character_above_128k_tokens" in model_info
+                    and model_info["output_cost_per_character_above_128k_tokens"]
+                    is not None
+                ), "model info for model={} does not have 'output_cost_per_character_above_128k_tokens' pricing\nmodel_info={}".format(
+                    model, model_info
+                )
+                completion_cost = (
+                    completion_tokens
+                    * model_info["output_cost_per_character_above_128k_tokens"]
+                )
+            else:
+                assert (
+                    "output_cost_per_character" in model_info
+                    and model_info["output_cost_per_character"] is not None
+                ), "model info for model={} does not have 'output_cost_per_character'-pricing\nmodel_info={}".format(
+                    model, model_info
+                )
+                completion_cost = (
+                    completion_characters * model_info["output_cost_per_character"]
+                )
+        except Exception as e:
+            verbose_logger.debug(
+                "litellm.litellm_core_utils.llm_cost_calc.google.py::cost_per_character(): Exception occured - {}\nDefaulting to None".format(
+                    str(e)
+                )
+            )
+            _, completion_cost = cost_per_token(
+                model=model,
+                custom_llm_provider=custom_llm_provider,
+                prompt_tokens=prompt_tokens,
+                completion_tokens=completion_tokens,
+            )

     return prompt_cost, completion_cost


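The restructuring above also retires the old exception-path fallback that multiplied the token-based cost by 4 (via the "1 token = 4 char" heuristic); when character counts are absent, the code now routes straight to token pricing. A worked comparison with an illustrative, non-current rate:

input_cost_per_token = 0.00000125  # assumed rate, for illustration only
prompt_tokens = 1000

# Old exception-path fallback: token-based cost scaled by 4.
old_fallback_cost = (prompt_tokens * input_cost_per_token) * 4  # 0.005

# New behavior when prompt_characters is None: plain token pricing.
new_fallback_cost = prompt_tokens * input_cost_per_token  # 0.00125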
7 changes: 3 additions & 4 deletions litellm/litellm_core_utils/llm_cost_calc/utils.py
@@ -17,9 +17,8 @@ def _generic_cost_per_character(
     custom_completion_cost: Optional[float],
 ) -> Tuple[Optional[float], Optional[float]]:
     """
-    Generic function to help calculate cost per character.
-    """
-    """
+    Calculates cost per character for aspeech/speech calls.
+
     Calculates the cost per character for a given model, input messages, and response object.

     Input:
@@ -29,7 +28,7 @@ def _generic_cost_per_character(
     - completion_characters: float, the number of output characters

     Returns:
-    Tuple[Optional[float], Optional[float]] - prompt_cost_in_usd, completion_cost_in_usd. 
+    Tuple[Optional[float], Optional[float]] - prompt_cost_in_usd, completion_cost_in_usd.
     - returns None if not able to calculate cost.

     Raises:
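For reference, a hypothetical call into _generic_cost_per_character for a speech request (assumed model name and provider; whether pricing exists for them depends on the model cost map, and the function returns None values when it cannot calculate a cost):

from litellm.litellm_core_utils.llm_cost_calc.utils import (
    _generic_cost_per_character,
)

input_text = "Hello, this is a test of the text-to-speech cost path."
prompt_cost, completion_cost = _generic_cost_per_character(
    model="tts-1",  # assumed entry in the pricing map
    custom_llm_provider="openai",
    prompt_characters=len(input_text),  # characters sent to the TTS model
    completion_characters=0,  # pinned to 0 upstream: TTS output is audio
    custom_prompt_cost=None,  # use the rate from the model cost map
    custom_completion_cost=0,
)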
