# llm.py (forked from BerriAI/liteLLM-proxy)
from typing import Dict
from collections import defaultdict
import threading

import backoff
import openai.error
from fastapi import HTTPException

import litellm
import litellm.exceptions
from litellm.caching import Cache

from utils import getenv

litellm.telemetry = False
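# Cache LLM responses in Redis; connection details come from environment variables.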
litellm.cache = Cache(
    type="redis",
    host=getenv("REDISHOST", ""),
    port=getenv("REDISPORT", ""),
    password=getenv("REDISPASSWORD", ""),
)

cost_dict: Dict[str, Dict[str, float]] = defaultdict(dict)
cost_dict_lock = threading.Lock()
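
# Persist budget state on a background thread so request handling is not blocked on I/O.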
def _update_costs_thread(budget_manager: litellm.BudgetManager):
    thread = threading.Thread(target=budget_manager.save_data)
    thread.start()

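# Sentinel exceptions that tell the backoff decorators below which retry policy to apply.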
class RetryConstantError(Exception):
    pass


class RetryExpoError(Exception):
    pass


class UnknownLLMError(Exception):
    pass

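
# Routing config for litellm.completion_with_config: default fallback models,
# models that require a moderation pass, and per-model fallbacks for
# context-window-exceeded errors.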
config = {
    "function": "completion",
    "default_fallback_models": ["gpt-3.5-turbo", "claude-instant-1", "j2-ultra"],
    "available_models": litellm.utils.get_valid_models(),
    "adapt_to_prompt_size": True,
    "model": {
        "claude-instant-1": {"needs_moderation": True},
        "claude-2": {"needs_moderation": True},
        "gpt-3.5-turbo": {
            "error_handling": {
                "ContextWindowExceededError": {"fallback_model": "gpt-3.5-turbo-16k"}
            }
        },
        "gpt-3.5-turbo-0613": {
            "error_handling": {
                "ContextWindowExceededError": {"fallback_model": "gpt-3.5-turbo-16k-0613"}
            }
        },
        "gpt-4": {
            "error_handling": {
                "ContextWindowExceededError": {"fallback_model": "claude-2"}
            }
        },
    },
}

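
# Translate provider exceptions into the retry sentinels: transient server-side
# failures retry at a constant interval, rate limits back off exponentially, and
# caller errors (bad request, auth, etc.) are re-raised unchanged.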
def handle_llm_exception(e: Exception):
    if isinstance(
        e,
        (
            openai.error.APIError,
            openai.error.TryAgain,
            openai.error.Timeout,
            openai.error.ServiceUnavailableError,
        ),
    ):
        raise RetryConstantError from e
    elif isinstance(e, openai.error.RateLimitError):
        raise RetryExpoError from e
    elif isinstance(
        e,
        (
            openai.error.APIConnectionError,
            openai.error.InvalidRequestError,
            openai.error.AuthenticationError,
            openai.error.PermissionError,
            openai.error.InvalidAPIType,
            openai.error.SignatureVerificationError,
        ),
    ):
        raise e
    else:
        raise UnknownLLMError from e

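
# Two stacked retry policies: up to 3 constant-interval retries for transient
# errors, and exponential backoff with full jitter (capped at 100s) for rate limits.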
@backoff.on_exception(
    wait_gen=backoff.constant,
    exception=RetryConstantError,
    max_tries=3,
    interval=3,
)
@backoff.on_exception(
    wait_gen=backoff.expo,
    exception=RetryExpoError,
    jitter=backoff.full_jitter,
    max_value=100,
    factor=1.5,
)
def completion(**kwargs) -> litellm.ModelResponse:
    user_key = kwargs.pop("user_key")
    master_key = kwargs.pop("master_key")
    budget_manager: litellm.BudgetManager = kwargs.pop("budget_manager")
    model = str(kwargs.get("model", ""))

    def _completion(override_model=None):
        try:
            if override_model is not None:
                kwargs["model"] = override_model
            kwargs["config"] = config
            if user_key == master_key:
                # The master key bypasses the budget check.
                response = litellm.completion_with_config(**kwargs)
            else:
                if budget_manager.get_current_cost(
                    user=user_key
                ) > budget_manager.get_total_budget(user=user_key):
                    raise HTTPException(
                        status_code=429, detail={"error": "budget exceeded"}
                    )
                response = litellm.completion_with_config(**kwargs)
            # Cost accounting needs the full response object, so skip it for streams.
            if "stream" not in kwargs or kwargs["stream"] is not True:
                print(f"user_key: {user_key}")
                print(f"master_key: {master_key}")
                if user_key != master_key:  # no budget on master key
                    # Update the user's spend, then persist it off-thread.
                    budget_manager.update_cost(completion_obj=response, user=user_key)
                    _update_costs_thread(budget_manager)  # non-blocking
            return response
        except Exception as e:
            handle_llm_exception(e)

    return _completion()
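
# Example call (a minimal sketch; the key values and BudgetManager setup below are
# hypothetical and would normally be supplied by the proxy's FastAPI request layer):
#
#   budget_manager = litellm.BudgetManager(project_name="litellm-proxy")
#   response = completion(
#       model="gpt-3.5-turbo",
#       messages=[{"role": "user", "content": "Hello"}],
#       user_key="sk-user-example",           # caller's API key (hypothetical)
#       master_key=getenv("MASTER_KEY", ""),  # proxy admin key (hypothetical env var)
#       budget_manager=budget_manager,
#   )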