refactor: model pass_max_tokens (#493)

pull/494/head
sigoden authored 4 weeks ago; committed by GitHub
parent 1c6c740381
commit 7762cd6bed

@@ -746,8 +746,8 @@
messages: messages,
stream: true,
};
-const { max_output_token, need_max_tokens } = retrieveModel(this.models, chat.model_id);
-if (!body["max_tokens"] && need_max_tokens) {
+const { max_output_token, pass_max_tokens } = retrieveModel(this.models, chat.model_id);
+if (!body["max_tokens"] && pass_max_tokens) {
body["max_tokens"] = max_output_token;
};
return body;
@@ -819,14 +819,14 @@
function retrieveModel(models, id) {
const model = models.find(model => model.id === id);
if (!model) return {};
-const max_output_token = model.max_output_tokens || model["max_output_tokens?"] || null;
-const need_max_tokens = !!model.max_output_tokens;
+const max_output_token = model.max_output_tokens;
const supports_vision = !!model.supports_vision;
+const pass_max_tokens = !!model.pass_max_tokens;
return {
id,
max_output_token,
-need_max_tokens,
supports_vision,
+pass_max_tokens,
}
}

@@ -939,8 +939,8 @@
body[body_key || setting_key] = this.settings[setting_key];
}
});
-const { max_output_token, need_max_tokens } = this.currentModel;
-if (!body["max_tokens"] && need_max_tokens) {
+const { max_output_token, pass_max_tokens } = this.currentModel;
+if (!body["max_tokens"] && pass_max_tokens) {
body["max_tokens"] = max_output_token;
};
return body;
@@ -1013,14 +1013,14 @@
function retrieveModel(models, id) {
const model = models.find(model => model.id === id);
if (!model) return {};
-const max_output_token = model.max_output_tokens || model["max_output_tokens?"] || null;
-const need_max_tokens = !!model.max_output_tokens;
+const max_output_token = model.max_output_tokens;
const supports_vision = !!model.supports_vision;
+const pass_max_tokens = !!model.pass_max_tokens;
return {
id,
max_output_token,
-need_max_tokens,
supports_vision,
+pass_max_tokens,
}
}

@@ -33,7 +33,6 @@ clients:
# models:
# - name: xxxx # The model name
# max_input_tokens: 100000
-# max_output_tokens: 4096
# supports_vision: true
# extra_fields: # Set custom parameters, will merge with the body json
# key: value

@@ -8,44 +8,45 @@
models:
- name: gpt-3.5-turbo
max_input_tokens: 16385
-max_output_tokens?: 4096
+max_output_tokens: 4096
input_price: 0.5
output_price: 1.5
- name: gpt-3.5-turbo-1106
max_input_tokens: 16385
-max_output_tokens?: 4096
+max_output_tokens: 4096
input_price: 1
output_price: 2
- name: gpt-4-turbo
max_input_tokens: 128000
-max_output_tokens?: 4096
+max_output_tokens: 4096
input_price: 10
output_price: 30
supports_vision: true
- name: gpt-4-turbo-preview
max_input_tokens: 128000
-max_output_tokens?: 4096
+max_output_tokens: 4096
input_price: 10
output_price: 30
- name: gpt-4-1106-preview
max_input_tokens: 128000
-max_output_tokens?: 4096
+max_output_tokens: 4096
input_price: 10
output_price: 30
- name: gpt-4-vision-preview
max_input_tokens: 128000
max_output_tokens: 4096
+pass_max_tokens: true
input_price: 10
output_price: 30
supports_vision: true
- name: gpt-4
max_input_tokens: 8192
-max_output_tokens?: 4096
+max_output_tokens: 4096
input_price: 30
output_price: 60
- name: gpt-4-32k
max_input_tokens: 32768
-max_output_tokens?: 4096
+max_output_tokens: 4096
input_price: 60
output_price: 120
@@ -59,18 +60,18 @@
models:
- name: gemini-1.0-pro-latest
max_input_tokens: 30720
-max_output_tokens?: 2048
+max_output_tokens: 2048
input_price: 0.5
output_price: 1.5
- name: gemini-1.0-pro-vision-latest
max_input_tokens: 12288
-max_output_tokens?: 4096
+max_output_tokens: 4096
input_price: 0.5
output_price: 1.5
supports_vision: true
- name: gemini-1.5-pro-latest
max_input_tokens: 1048576
-max_output_tokens?: 8192
+max_output_tokens: 8192
input_price: 7
output_price: 21
supports_vision: true
@@ -85,18 +86,21 @@
- name: claude-3-opus-20240229
max_input_tokens: 200000
max_output_tokens: 4096
+pass_max_tokens: true
input_price: 15
output_price: 75
supports_vision: true
- name: claude-3-sonnet-20240229
max_input_tokens: 200000
max_output_tokens: 4096
+pass_max_tokens: true
input_price: 3
output_price: 15
supports_vision: true
- name: claude-3-haiku-20240307
max_input_tokens: 200000
max_output_tokens: 4096
+pass_max_tokens: true
input_price: 0.25
output_price: 1.25
supports_vision: true
@@ -140,12 +144,12 @@
models:
- name: command-r
max_input_tokens: 128000
-max_output_tokens?: 4000
+max_output_tokens: 4000
input_price: 0.5
output_price: 1.5
- name: command-r-plus
max_input_tokens: 128000
-max_output_tokens?: 4000
+max_output_tokens: 4000
input_price: 3
output_price: 15
@@ -159,28 +163,28 @@
models:
- name: llama-3-sonar-small-32k-chat
max_input_tokens: 32768
-max_output_tokens?: 32768
+max_output_tokens: 32768
input_price: 0.2
output_price: 0.2
- name: llama-3-sonar-large-32k-chat
max_input_tokens: 32768
-max_output_tokens?: 32768
+max_output_tokens: 32768
input_price: 0.6
output_price: 0.6
- name: llama-3-8b-instruct
max_input_tokens: 8192
-max_output_tokens?: 8192
+max_output_tokens: 8192
input_price: 0.2
output_price: 0.2
- name: llama-3-70b-instruct
max_input_tokens: 8192
-max_output_tokens?: 8192
+max_output_tokens: 8192
input_price: 1
output_price: 1
- name: mixtral-8x7b-instruct
max_input_tokens: 16384
-max_output_tokens?: 16384
+max_output_tokens: 16384
input_price: 0.6
output_price: 0.6
@@ -195,22 +199,22 @@
models:
- name: llama3-8b-8192
max_input_tokens: 8192
-max_output_tokens?: 8192
+max_output_tokens: 8192
input_price: 0.05
output_price: 0.10
- name: llama3-70b-8192
max_input_tokens: 8192
-max_output_tokens?: 8192
+max_output_tokens: 8192
input_price: 0.59
output_price: 0.79
- name: mixtral-8x7b-32768
max_input_tokens: 32768
-max_output_tokens?: 32768
+max_output_tokens: 32768
input_price: 0.27
output_price: 0.27
- name: gemma-7b-it
max_input_tokens: 8192
-max_output_tokens?: 8192
+max_output_tokens: 8192
input_price: 0.10
output_price: 0.10
@@ -224,18 +228,18 @@
models:
- name: gemini-1.0-pro
max_input_tokens: 24568
-max_output_tokens?: 8192
+max_output_tokens: 8192
input_price: 0.125
output_price: 0.375
- name: gemini-1.0-pro-vision
max_input_tokens: 14336
-max_output_tokens?: 2048
+max_output_tokens: 2048
input_price: 0.125
output_price: 0.375
supports_vision: true
- name: gemini-1.5-pro-preview-0409
max_input_tokens: 1000000
-max_output_tokens?: 8192
+max_output_tokens: 8192
input_price: 2.5
output_price: 7.5
supports_vision: true
@@ -250,18 +254,21 @@
- name: claude-3-opus@20240229
max_input_tokens: 200000
max_output_tokens: 4096
+pass_max_tokens: true
input_price: 15
output_price: 75
supports_vision: true
- name: claude-3-sonnet@20240229
max_input_tokens: 200000
max_output_tokens: 4096
+pass_max_tokens: true
input_price: 3
output_price: 15
supports_vision: true
- name: claude-3-haiku@20240307
max_input_tokens: 200000
max_output_tokens: 4096
+pass_max_tokens: true
input_price: 0.25
output_price: 1.25
supports_vision: true
@@ -277,44 +284,52 @@
- name: anthropic.claude-3-opus-20240229-v1:0
max_input_tokens: 200000
max_output_tokens: 4096
+pass_max_tokens: true
input_price: 15
output_price: 75
supports_vision: true
- name: anthropic.claude-3-sonnet-20240229-v1:0
max_input_tokens: 200000
max_output_tokens: 4096
+pass_max_tokens: true
input_price: 3
output_price: 15
supports_vision: true
- name: anthropic.claude-3-haiku-20240307-v1:0
max_input_tokens: 200000
max_output_tokens: 4096
+pass_max_tokens: true
input_price: 0.25
output_price: 1.25
supports_vision: true
- name: meta.llama3-8b-instruct-v1:0
max_input_tokens: 8192
max_output_tokens: 4096
+pass_max_tokens: true
input_price: 0.4
output_price: 0.6
- name: meta.llama3-70b-instruct-v1:0
max_input_tokens: 8192
max_output_tokens: 4096
+pass_max_tokens: true
input_price: 2.65
output_price: 3.5
- name: mistral.mistral-7b-instruct-v0:2
max_input_tokens: 32000
max_output_tokens: 8192
+pass_max_tokens: true
input_price: 0.15
output_price: 0.2
- name: mistral.mixtral-8x7b-instruct-v0:1
max_input_tokens: 32000
max_output_tokens: 8192
+pass_max_tokens: true
input_price: 0.45
output_price: 0.7
- name: mistral.mistral-large-2402-v1:0
max_input_tokens: 32000
max_output_tokens: 8192
+pass_max_tokens: true
input_price: 8
output_price: 2.4
@@ -328,21 +343,27 @@
- name: '@cf/meta/llama-3-8b-instruct'
max_input_tokens: 4096
max_output_tokens: 4096
+pass_max_tokens: true
- name: '@cf/mistral/mistral-7b-instruct-v0.2-lora'
max_input_tokens: 4096
max_output_tokens: 4096
+pass_max_tokens: true
- name: '@cf/google/gemma-7b-it-lora'
max_input_tokens: 4096
max_output_tokens: 4096
+pass_max_tokens: true
- name: '@cf/qwen/qwen1.5-14b-chat-awq'
max_input_tokens: 4096
max_output_tokens: 4096
+pass_max_tokens: true
- name: '@hf/thebloke/deepseek-coder-6.7b-instruct-awq'
max_input_tokens: 4096
max_output_tokens: 4096
+pass_max_tokens: true
- name: '@hf/nexusflow/starling-lm-7b-beta'
max_input_tokens: 4096
max_output_tokens: 4096
+pass_max_tokens: true
- platform: replicate
# docs:
@@ -354,21 +375,25 @@
- name: meta/meta-llama-3-70b-instruct
max_input_tokens: 8192
max_output_tokens: 4096
+pass_max_tokens: true
input_price: 0.65
output_price: 2.75
- name: meta/meta-llama-3-8b-instruct
max_input_tokens: 8192
max_output_tokens: 4096
+pass_max_tokens: true
input_price: 0.05
output_price: 0.25
- name: mistralai/mistral-7b-instruct-v0.2
max_input_tokens: 32000
max_output_tokens: 8192
+pass_max_tokens: true
input_price: 0.05
output_price: 0.25
- name: mistralai/mixtral-8x7b-instruct-v0.1
max_input_tokens: 32000
max_output_tokens: 8192
+pass_max_tokens: true
input_price: 0.3
output_price: 1
@@ -382,26 +407,31 @@
- name: ernie-4.0-8k-preview
max_input_tokens: 5120
max_output_tokens: 2048
+pass_max_tokens: true
input_price: 16.8
output_price: 16.8
- name: ernie-3.5-8k-preview
max_input_tokens: 5120
max_output_tokens: 2048
+pass_max_tokens: true
input_price: 1.68
output_price: 1.68
- name: ernie-speed-128k
max_input_tokens: 124000
max_output_tokens: 4096
+pass_max_tokens: true
input_price: 0.56
output_price: 1.12
- name: ernie-lite-8k
max_input_tokens: 7168
max_output_tokens: 2048
+pass_max_tokens: true
input_price: 0.42
output_price: 0.84
- name: ernie-tiny-8k
max_input_tokens: 7168
max_output_tokens: 2048
+pass_max_tokens: true
input_price: 0.14
output_price: 0.14
@@ -414,22 +444,22 @@
models:
- name: qwen-turbo
max_input_tokens: 6000
-max_output_tokens?: 1500
+max_output_tokens: 1500
input_price: 1.12
output_price: 1.12
- name: qwen-plus
max_input_tokens: 30000
-max_output_tokens?: 2000
+max_output_tokens: 2000
input_price: 2.8
output_price: 2.8
- name: qwen-max
max_input_tokens: 6000
-max_output_tokens?: 2000
+max_output_tokens: 2000
input_price: 16.8
output_price: 16.8
- name: qwen-max-longcontext
max_input_tokens: 28000
-max_output_tokens?: 2000
+max_output_tokens: 2000
- name: qwen-vl-plus
input_price: 1.12
output_price: 1.12
@@ -686,16 +716,22 @@
supports_vision: true
- name: anthropic/claude-3-opus
max_input_tokens: 200000
+max_output_tokens: 4096
+pass_max_tokens: true
input_price: 15
output_price: 75
supports_vision: true
- name: anthropic/claude-3-sonnet
max_input_tokens: 200000
+max_output_tokens: 4096
+pass_max_tokens: true
input_price: 3
output_price: 15
supports_vision: true
- name: anthropic/claude-3-haiku
max_input_tokens: 200000
+max_output_tokens: 4096
+pass_max_tokens: true
input_price: 0.25
output_price: 1.25
supports_vision: true

@@ -172,7 +172,7 @@ async fn send_message_streaming(
let data: Value = decode_chunk(message.payload()).ok_or_else(|| {
anyhow!("Invalid chunk data: {}", hex_encode(message.payload()))
})?;
-debug!("bedrock chunk: {data}");
+// debug!("bedrock chunk: {data}");
match model_category {
ModelCategory::Anthropic => {
if let Some(typ) = data["type"].as_str() {
@@ -235,7 +235,7 @@ fn meta_llama_build_body(data: SendData, model: &Model, pt: PromptFormat) -> Res
let prompt = generate_prompt(&messages, pt)?;
let mut body = json!({ "prompt": prompt });
-if let Some(v) = model.max_output_tokens {
+if let Some(v) = model.max_tokens_param() {
body["max_gen_len"] = v.into();
}
if let Some(v) = temperature {
@@ -258,7 +258,7 @@ fn mistral_build_body(data: SendData, model: &Model) -> Result<Value> {
let prompt = generate_prompt(&messages, MISTRAL_PROMPT_FORMAT)?;
let mut body = json!({ "prompt": prompt });
-if let Some(v) = model.max_output_tokens {
+if let Some(v) = model.max_tokens_param() {
body["max_tokens"] = v.into();
}
if let Some(v) = temperature {

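Every client builder in this commit repeats the guard shown in the hunks above and below: the request body only gains a max-tokens field when max_tokens_param() returns Some. A minimal self-contained Rust sketch of that pattern (build_body_sketch is a hypothetical stand-in for the per-client builders; serde_json assumed as a dependency):

use serde_json::json;

// Hypothetical helper mirroring the guard used by the client modules:
// the body only gains a max-tokens field when the model opts in.
fn build_body_sketch(max_tokens_param: Option<isize>) -> serde_json::Value {
    let mut body = json!({ "prompt": "hello" });
    if let Some(v) = max_tokens_param {
        body["max_tokens"] = v.into();
    }
    body
}

fn main() {
    // pass_max_tokens unset: the field is simply omitted.
    assert_eq!(build_body_sketch(None), json!({ "prompt": "hello" }));
    // pass_max_tokens set: the configured limit is forwarded.
    assert_eq!(
        build_body_sketch(Some(4096)),
        json!({ "prompt": "hello", "max_tokens": 4096 })
    );
}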
@@ -142,7 +142,7 @@ pub fn claude_build_body(data: SendData, model: &Model) -> Result<Value> {
if let Some(v) = system_message {
body["system"] = v.into();
}
-if let Some(v) = model.max_output_tokens {
+if let Some(v) = model.max_tokens_param() {
body["max_tokens"] = v.into();
}
if let Some(v) = temperature {

@@ -88,7 +88,7 @@ fn build_body(data: SendData, model: &Model) -> Result<Value> {
"messages": messages,
});
-if let Some(v) = model.max_output_tokens {
+if let Some(v) = model.max_tokens_param() {
body["max_tokens"] = v.into();
}
if let Some(v) = temperature {

@@ -135,7 +135,7 @@ fn build_body(data: SendData, model: &Model) -> Result<Value> {
body["chat_history"] = messages.into();
}
-if let Some(v) = model.max_output_tokens {
+if let Some(v) = model.max_tokens_param() {
body["max_tokens"] = v.into();
}
if let Some(v) = temperature {

@@ -128,7 +128,7 @@ fn build_body(data: SendData, model: &Model) -> Value {
"messages": messages,
});
-if let Some(v) = model.max_output_tokens {
+if let Some(v) = model.max_tokens_param() {
body["max_output_tokens"] = v.into();
}
if let Some(v) = temperature {

@@ -14,11 +14,11 @@ pub struct Model {
pub name: String,
pub max_input_tokens: Option<usize>,
pub max_output_tokens: Option<isize>,
-pub ref_max_output_tokens: Option<isize>,
+pub pass_max_tokens: bool,
pub input_price: Option<f64>,
pub output_price: Option<f64>,
-pub extra_fields: Option<serde_json::Map<String, serde_json::Value>>,
pub capabilities: ModelCapabilities,
+pub extra_fields: Option<serde_json::Map<String, serde_json::Value>>,
}
impl Default for Model {
@@ -32,13 +32,13 @@
Self {
client_name: client_name.into(),
name: name.into(),
-extra_fields: None,
max_input_tokens: None,
max_output_tokens: None,
-ref_max_output_tokens: None,
+pass_max_tokens: false,
input_price: None,
output_price: None,
capabilities: ModelCapabilities::Text,
+extra_fields: None,
}
}
@@ -49,8 +49,7 @@
let mut model = Model::new(client_name, &v.name);
model
.set_max_input_tokens(v.max_input_tokens)
-.set_max_output_tokens(v.max_output_tokens)
-.set_ref_max_output_tokens(v.ref_max_output_tokens)
+.set_max_tokens(v.max_output_tokens, v.pass_max_tokens)
.set_input_price(v.input_price)
.set_output_price(v.output_price)
.set_supports_vision(v.supports_vision)
@@ -97,7 +96,7 @@
pub fn description(&self) -> String {
let max_input_tokens = format_option_value(&self.max_input_tokens);
-let max_output_tokens = format_option_value(&self.show_max_output_tokens());
+let max_output_tokens = format_option_value(&self.max_output_tokens);
let input_price = format_option_value(&self.input_price);
let output_price = format_option_value(&self.output_price);
let vision = if self.capabilities.contains(ModelCapabilities::Vision) {
@@ -115,8 +114,12 @@
self.capabilities.contains(ModelCapabilities::Vision)
}
-pub fn show_max_output_tokens(&self) -> Option<isize> {
-self.max_output_tokens.or(self.ref_max_output_tokens)
+pub fn max_tokens_param(&self) -> Option<isize> {
+if self.pass_max_tokens {
+self.max_output_tokens
+} else {
+None
+}
}
pub fn set_max_input_tokens(&mut self, max_input_tokens: Option<usize>) -> &mut Self {
@@ -127,19 +130,16 @@
self
}
-pub fn set_max_output_tokens(&mut self, max_output_tokens: Option<isize>) -> &mut Self {
+pub fn set_max_tokens(
+&mut self,
+max_output_tokens: Option<isize>,
+pass_max_tokens: bool,
+) -> &mut Self {
match max_output_tokens {
None | Some(0) => self.max_output_tokens = None,
_ => self.max_output_tokens = max_output_tokens,
}
-self
-}
-pub fn set_ref_max_output_tokens(&mut self, ref_max_output_tokens: Option<isize>) -> &mut Self {
-match ref_max_output_tokens {
-None | Some(0) => self.ref_max_output_tokens = None,
-_ => self.ref_max_output_tokens = ref_max_output_tokens,
-}
+self.pass_max_tokens = pass_max_tokens;
self
}
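For reference, the net behavior of the reworked setter and getter, as a standalone sketch (the struct is trimmed to the two relevant fields; the real Model carries many more):

#[derive(Default)]
struct Model {
    max_output_tokens: Option<isize>,
    pass_max_tokens: bool,
}

impl Model {
    // Mirrors the diff: 0 is treated as "unset".
    fn set_max_tokens(&mut self, max_output_tokens: Option<isize>, pass_max_tokens: bool) -> &mut Self {
        match max_output_tokens {
            None | Some(0) => self.max_output_tokens = None,
            _ => self.max_output_tokens = max_output_tokens,
        }
        self.pass_max_tokens = pass_max_tokens;
        self
    }

    // Only yields a value when the model opts in to sending max_tokens.
    fn max_tokens_param(&self) -> Option<isize> {
        if self.pass_max_tokens {
            self.max_output_tokens
        } else {
            None
        }
    }
}

fn main() {
    let mut m = Model::default();
    m.set_max_tokens(Some(4096), false);
    assert_eq!(m.max_tokens_param(), None); // limit stays informational
    m.set_max_tokens(Some(4096), true);
    assert_eq!(m.max_tokens_param(), Some(4096)); // limit is forwarded to the API
}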
@@ -237,12 +237,12 @@ pub struct ModelConfig {
pub name: String,
pub max_input_tokens: Option<usize>,
pub max_output_tokens: Option<isize>,
-#[serde(rename = "max_output_tokens?")]
-pub ref_max_output_tokens: Option<isize>,
pub input_price: Option<f64>,
pub output_price: Option<f64>,
#[serde(default)]
pub supports_vision: bool,
+#[serde(default)]
+pub pass_max_tokens: bool,
pub extra_fields: Option<serde_json::Map<String, serde_json::Value>>,
}
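A quick sanity check of the new config shape; this sketch assumes serde and serde_yaml as dependencies and trims ModelConfig to three fields (the real struct has more):

use serde::Deserialize;

#[derive(Deserialize, Debug)]
struct ModelConfig {
    name: String,
    max_output_tokens: Option<isize>,
    #[serde(default)]
    pass_max_tokens: bool,
}

fn main() -> Result<(), serde_yaml::Error> {
    // Omitted pass_max_tokens defaults to false: the limit stays informational.
    let info: ModelConfig = serde_yaml::from_str("name: gpt-4-turbo\nmax_output_tokens: 4096")?;
    assert!(!info.pass_max_tokens);

    // Models that require the parameter (e.g. the claude entries above) opt in.
    let required: ModelConfig = serde_yaml::from_str(
        "name: claude-3-opus-20240229\nmax_output_tokens: 4096\npass_max_tokens: true",
    )?;
    assert!(required.pass_max_tokens && required.max_output_tokens == Some(4096));
    Ok(())
}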

@@ -159,7 +159,7 @@ fn build_body(data: SendData, model: &Model) -> Result<Value> {
"options": {},
});
-if let Some(v) = model.max_output_tokens {
+if let Some(v) = model.max_tokens_param() {
body["options"]["num_predict"] = v.into();
}
if let Some(v) = temperature {

@@ -90,7 +90,7 @@ pub fn openai_build_body(data: SendData, model: &Model) -> Value {
"messages": messages,
});
-if let Some(v) = model.max_output_tokens {
+if let Some(v) = model.max_tokens_param() {
body["max_tokens"] = v.into();
}
if let Some(v) = temperature {

@@ -173,7 +173,7 @@ fn build_body(data: SendData, model: &Model, is_vl: bool) -> Result<(Value, bool
parameters["incremental_output"] = true.into();
}
-if let Some(v) = model.max_output_tokens {
+if let Some(v) = model.max_tokens_param() {
parameters["max_tokens"] = v.into();
}
if let Some(v) = temperature {

@@ -148,7 +148,7 @@ fn build_body(data: SendData, model: &Model) -> Result<Value> {
"prompt_template": "{prompt}"
});
-if let Some(v) = model.max_output_tokens {
+if let Some(v) = model.max_tokens_param() {
input["max_tokens"] = v.into();
input["max_new_tokens"] = v.into();
}

@@ -201,7 +201,7 @@ pub(crate) fn gemini_build_body(
body["safetySettings"] = safety_settings;
}
-if let Some(v) = model.max_output_tokens {
+if let Some(v) = model.max_tokens_param() {
body["generationConfig"]["maxOutputTokens"] = v.into();
}
if let Some(v) = temperature {

@@ -422,7 +422,7 @@ impl Config {
(
"max_output_tokens",
self.model
-.max_output_tokens
+.max_tokens_param()
.map(|v| format!("{v} (current model)"))
.unwrap_or_else(|| "-".into()),
),
@@ -523,7 +523,7 @@ impl Config {
(values, args[0])
} else if args.len() == 2 {
let values = match args[0] {
-"max_output_tokens" => match self.model.show_max_output_tokens() {
+"max_output_tokens" => match self.model.max_output_tokens {
Some(v) => vec![v.to_string()],
None => vec![],
},
@@ -564,7 +564,7 @@ impl Config {
match key {
"max_output_tokens" => {
let value = parse_value(value)?;
-self.model.set_max_output_tokens(value);
+self.model.set_max_tokens(value, true);
}
"temperature" => {
let value = parse_value(value)?;

@@ -93,7 +93,7 @@ impl Server {
"id": id,
"max_input_tokens": model.max_input_tokens,
"max_output_tokens": model.max_output_tokens,
-"max_output_tokens?": model.ref_max_output_tokens,
+"pass_max_tokens": model.pass_max_tokens,
"input_price": model.input_price,
"output_price": model.output_price,
"supports_vision": model.supports_vision(),
@@ -244,7 +244,7 @@ impl Server {
let mut client = init_client(&config)?;
if max_tokens.is_some() {
-client.model_mut().set_max_output_tokens(max_tokens);
+client.model_mut().set_max_tokens(max_tokens, true);
}
let abort = create_abort_signal();
let http_client = client.build_client()?;
