Lower --max_batch_size and --inference_max_length defaults to 2048

pull/97/head
Aleksandr Borzunov 2 years ago
parent eb580973da
commit 5578378202

@@ -39,13 +39,13 @@ def main():
                         help='server will use this many processes to handle incoming requests')
     parser.add_argument('--min_batch_size', type=int, default=1,
                         help='Minimum required batch size for all operations (in total tokens)')
-    parser.add_argument('--max_batch_size', type=int, default=16384,
+    parser.add_argument('--max_batch_size', type=int, default=2048,
                         help='The total number of tokens in the same batch will not exceed this value')
     parser.add_argument('--prefetch_batches', type=int, default=1, required=False,
                         help='Pre-form this many subsequent batches while GPU is processing the current one')
     parser.add_argument('--sender_threads', type=int, default=1, required=False,
                         help='Use this many threads to pass results/exceptions from Runtime to Pools')
-    parser.add_argument('--inference_max_length', type=int, default=16384,
+    parser.add_argument('--inference_max_length', type=int, default=2048,
                         help='Maximum total sequence length permitted per inference, defaults to 16384 tokens')
     parser.add_argument('--cache_dir', type=str, default=None,
                         help='Path to a directory in which a downloaded pretrained model configuration should be cached if the standard cache should not be used.')
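
The practical effect of the lowered CLI defaults can be reproduced with a minimal argparse sketch. The flag names, types, and help strings below are taken from the diff above; the surrounding parser setup is illustrative only, not the actual server entry point:

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--max_batch_size', type=int, default=2048,
                        help='The total number of tokens in the same batch will not exceed this value')
    parser.add_argument('--inference_max_length', type=int, default=2048,
                        help='Maximum total sequence length permitted per inference')

    # With no flags given, the new, more conservative limits apply.
    args = parser.parse_args([])
    assert args.max_batch_size == 2048 and args.inference_max_length == 2048

    # Operators who relied on the previous behaviour can restore it explicitly.
    args = parser.parse_args(['--max_batch_size', '16384', '--inference_max_length', '16384'])
    assert args.max_batch_size == 16384 and args.inference_max_length == 16384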

@@ -49,8 +49,8 @@ class Server:
         block_indices: Optional[str] = None,
         num_handlers: int = 8,
         min_batch_size: int = 1,
-        max_batch_size: int = 4096,
-        inference_max_length: int = 4096,
+        max_batch_size: int = 2048,
+        inference_max_length: int = 2048,
         torch_dtype: str = "auto",
         revision: str = "main",
         cache_dir: Optional[str] = None,
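
The Server keyword defaults change in the same way. A stand-in dataclass (the name ServerLimits is hypothetical; the real Server constructor takes many more parameters than this hunk shows) illustrates what callers that omit these keyword arguments now receive:

    from dataclasses import dataclass
    from typing import Optional

    @dataclass
    class ServerLimits:
        block_indices: Optional[str] = None
        num_handlers: int = 8
        min_batch_size: int = 1
        max_batch_size: int = 2048        # was 4096 before this commit
        inference_max_length: int = 2048  # was 4096 before this commit
        torch_dtype: str = "auto"
        revision: str = "main"
        cache_dir: Optional[str] = None

    # Callers that never passed these kwargs now get the lower limits...
    assert ServerLimits().max_batch_size == 2048

    # ...while the previous values can still be requested explicitly.
    big = ServerLimits(max_batch_size=4096, inference_max_length=4096)
    assert big.inference_max_length == 4096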
