python: do not print GPU name with verbose=False, expose this info via properties (#2222)

* llamamodel: only print device used in verbose mode

Signed-off-by: Jared Van Bortel <jared@nomic.ai>

* python: expose backend and device via GPT4All properties

Signed-off-by: Jared Van Bortel <jared@nomic.ai>

* backend: const correctness fixes

Signed-off-by: Jared Van Bortel <jared@nomic.ai>

* python: bump version

Signed-off-by: Jared Van Bortel <jared@nomic.ai>

* python: typing fixups

Signed-off-by: Jared Van Bortel <jared@nomic.ai>

* python: fix segfault with closed GPT4All

Signed-off-by: Jared Van Bortel <jared@nomic.ai>

---------

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
Jared Van Bortel authored 1 month ago, committed by GitHub
commit ba53ab5da0 (parent 271d752701)
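A minimal usage sketch of the API this commit exposes, assuming a locally available GGUF model (the filename below is only a placeholder, not something shipped with the bindings):

from gpt4all import GPT4All

# With verbose=False the backend no longer prints the Vulkan device name to stderr;
# the same information is now available via properties instead.
model = GPT4All("example-model.Q4_0.gguf", device="gpu", verbose=False)  # placeholder model name
print(model.backend)  # "kompute" when the Vulkan backend is active, otherwise "cpu" or "metal"
print(model.device)   # the GPU device name, or None for backends other than Kompute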

@@ -364,8 +364,10 @@ bool LLamaModel::loadModel(const std::string &modelPath, int n_ctx, int ngl)
     d_ptr->end_tokens = {llama_token_eos(d_ptr->model)};

 #ifdef GGML_USE_KOMPUTE
-    if (usingGPUDevice() && ggml_vk_has_device()) {
-        std::cerr << "llama.cpp: using Vulkan on " << ggml_vk_current_device().name << std::endl;
+    if (usingGPUDevice()) {
+        if (llama_verbose()) {
+            std::cerr << "llama.cpp: using Vulkan on " << ggml_vk_current_device().name << std::endl;
+        }
         d_ptr->backend_name = "kompute";
     }
 #endif
@@ -558,7 +560,7 @@ bool LLamaModel::initializeGPUDevice(int device, std::string *unavail_reason) co
 #endif
 }

-bool LLamaModel::hasGPUDevice()
+bool LLamaModel::hasGPUDevice() const
 {
 #if defined(GGML_USE_KOMPUTE)
     return d_ptr->device != -1;
@@ -567,10 +569,12 @@ bool LLamaModel::hasGPUDevice()
 #endif
 }

-bool LLamaModel::usingGPUDevice()
+bool LLamaModel::usingGPUDevice() const
 {
 #if defined(GGML_USE_KOMPUTE)
-    return hasGPUDevice() && d_ptr->model_params.n_gpu_layers > 0;
+    bool hasDevice = hasGPUDevice() && d_ptr->model_params.n_gpu_layers > 0;
+    assert(!hasDevice || ggml_vk_has_device());
+    return hasDevice;
 #elif defined(GGML_USE_METAL)
     return true;
 #else
@@ -578,6 +582,19 @@ bool LLamaModel::usingGPUDevice()
 #endif
 }

+const char *LLamaModel::backendName() const {
+    return d_ptr->backend_name;
+}
+
+const char *LLamaModel::gpuDeviceName() const {
+#if defined(GGML_USE_KOMPUTE)
+    if (usingGPUDevice()) {
+        return ggml_vk_current_device().name;
+    }
+#endif
+    return nullptr;
+}
+
 void llama_batch_add(
         struct llama_batch & batch,
         llama_token id,

@@ -33,8 +33,10 @@ public:
     std::vector<GPUDevice> availableGPUDevices(size_t memoryRequired) const override;
     bool initializeGPUDevice(size_t memoryRequired, const std::string &name) const override;
     bool initializeGPUDevice(int device, std::string *unavail_reason = nullptr) const override;
-    bool hasGPUDevice() override;
-    bool usingGPUDevice() override;
+    bool hasGPUDevice() const override;
+    bool usingGPUDevice() const override;
+    const char *backendName() const override;
+    const char *gpuDeviceName() const override;
     size_t embeddingSize() const override;

     // user-specified prefix

@@ -144,8 +144,10 @@ public:
         return false;
     }

-    virtual bool hasGPUDevice() { return false; }
-    virtual bool usingGPUDevice() { return false; }
+    virtual bool hasGPUDevice() const { return false; }
+    virtual bool usingGPUDevice() const { return false; }
+    virtual const char *backendName() const { return "cpu"; }
+    virtual const char *gpuDeviceName() const { return nullptr; }

     void setProgressCallback(ProgressCallback callback) { m_progressCallback = callback; }

@@ -283,6 +283,18 @@ bool llmodel_gpu_init_gpu_device_by_int(llmodel_model model, int device)

 bool llmodel_has_gpu_device(llmodel_model model)
 {
-    auto *wrapper = static_cast<LLModelWrapper *>(model);
+    const auto *wrapper = static_cast<LLModelWrapper *>(model);
     return wrapper->llModel->hasGPUDevice();
 }
+
+const char *llmodel_model_backend_name(llmodel_model model)
+{
+    const auto *wrapper = static_cast<LLModelWrapper *>(model);
+    return wrapper->llModel->backendName();
+}
+
+const char *llmodel_model_gpu_device_name(llmodel_model model)
+{
+    const auto *wrapper = static_cast<LLModelWrapper *>(model);
+    return wrapper->llModel->gpuDeviceName();
+}

@@ -295,6 +295,16 @@ bool llmodel_gpu_init_gpu_device_by_int(llmodel_model model, int device);
  */
 bool llmodel_has_gpu_device(llmodel_model model);

+/**
+ * @return The name of the llama.cpp backend currently in use. One of "cpu", "kompute", or "metal".
+ */
+const char *llmodel_model_backend_name(llmodel_model model);
+
+/**
+ * @return The name of the GPU device currently in use, or NULL for backends other than Kompute.
+ */
+const char *llmodel_model_gpu_device_name(llmodel_model model);
+
 #ifdef __cplusplus
 }
 #endif
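For context, a rough sketch of consuming these two C entry points directly through ctypes. The library name and the pre-loaded model handle are assumptions for illustration only; the Python bindings resolve the real library path and handle themselves, as the hunks below show. The point to note is that with restype set to c_char_p, a NULL return from llmodel_model_gpu_device_name surfaces in Python as None, which is what the bindings rely on.

from __future__ import annotations

import ctypes

# Assumption: "libllmodel.so" is the backend library name on this platform.
lib = ctypes.CDLL("libllmodel.so")
lib.llmodel_model_backend_name.argtypes = [ctypes.c_void_p]
lib.llmodel_model_backend_name.restype = ctypes.c_char_p
lib.llmodel_model_gpu_device_name.argtypes = [ctypes.c_void_p]
lib.llmodel_model_gpu_device_name.restype = ctypes.c_char_p

def describe(model_handle: int) -> tuple[str, str | None]:
    """Return (backend, device) for an already-loaded llmodel handle."""
    backend = lib.llmodel_model_backend_name(model_handle).decode()  # never NULL per the header comment
    dev = lib.llmodel_model_gpu_device_name(model_handle)            # NULL comes back as None
    return backend, None if dev is None else dev.decode()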

@@ -9,7 +9,7 @@ import sys
 import threading
 from enum import Enum
 from queue import Queue
-from typing import TYPE_CHECKING, Any, Callable, Generic, Iterable, NoReturn, TypeVar, overload
+from typing import TYPE_CHECKING, Any, Callable, Generic, Iterable, Literal, NoReturn, TypeVar, overload

 if sys.version_info >= (3, 9):
     import importlib.resources as importlib_resources
@@ -158,6 +158,12 @@ llmodel.llmodel_gpu_init_gpu_device_by_int.restype = ctypes.c_bool
 llmodel.llmodel_has_gpu_device.argtypes = [ctypes.c_void_p]
 llmodel.llmodel_has_gpu_device.restype = ctypes.c_bool

+llmodel.llmodel_model_backend_name.argtypes = [ctypes.c_void_p]
+llmodel.llmodel_model_backend_name.restype = ctypes.c_char_p
+
+llmodel.llmodel_model_gpu_device_name.argtypes = [ctypes.c_void_p]
+llmodel.llmodel_model_gpu_device_name.restype = ctypes.c_char_p
+
 ResponseCallbackType = Callable[[int, str], bool]
 RawResponseCallbackType = Callable[[int, bytes], bool]
 EmbCancelCallbackType: TypeAlias = 'Callable[[list[int], str], bool]'
@@ -224,6 +230,19 @@ class LLModel:
     def _raise_closed(self) -> NoReturn:
         raise ValueError("Attempted operation on a closed LLModel")

+    @property
+    def backend(self) -> Literal["cpu", "kompute", "metal"]:
+        if self.model is None:
+            self._raise_closed()
+        return llmodel.llmodel_model_backend_name(self.model).decode()
+
+    @property
+    def device(self) -> str | None:
+        if self.model is None:
+            self._raise_closed()
+        dev = llmodel.llmodel_model_gpu_device_name(self.model)
+        return None if dev is None else dev.decode()
+
     @staticmethod
     def list_gpus(mem_required: int = 0) -> list[str]:
         """
@@ -333,22 +352,23 @@ class LLModel:
     @overload
     def generate_embeddings(
-        self, text: str, prefix: str, dimensionality: int, do_mean: bool, atlas: bool, cancel_cb: EmbCancelCallbackType,
+        self, text: str, prefix: str | None, dimensionality: int, do_mean: bool, atlas: bool,
+        cancel_cb: EmbCancelCallbackType | None,
     ) -> EmbedResult[list[float]]: ...
     @overload
     def generate_embeddings(
         self, text: list[str], prefix: str | None, dimensionality: int, do_mean: bool, atlas: bool,
-        cancel_cb: EmbCancelCallbackType,
+        cancel_cb: EmbCancelCallbackType | None,
     ) -> EmbedResult[list[list[float]]]: ...
     @overload
     def generate_embeddings(
         self, text: str | list[str], prefix: str | None, dimensionality: int, do_mean: bool, atlas: bool,
-        cancel_cb: EmbCancelCallbackType,
+        cancel_cb: EmbCancelCallbackType | None,
     ) -> EmbedResult[list[Any]]: ...
     def generate_embeddings(
         self, text: str | list[str], prefix: str | None, dimensionality: int, do_mean: bool, atlas: bool,
-        cancel_cb: EmbCancelCallbackType,
+        cancel_cb: EmbCancelCallbackType | None,
     ) -> EmbedResult[list[Any]]:
         if not text:
             raise ValueError("text must not be None or empty")
@@ -368,11 +388,11 @@ class LLModel:
         for i, t in enumerate(text):
             c_texts[i] = t.encode()

-        def wrap_cancel_cb(batch_sizes: ctypes.POINTER(ctypes.c_uint), n_batch: int, backend: bytes) -> bool:
+        def wrap_cancel_cb(batch_sizes: Any, n_batch: int, backend: bytes) -> bool:
             assert cancel_cb is not None
             return cancel_cb(batch_sizes[:n_batch], backend.decode())

-        cancel_cb_wrapper = EmbCancelCallback(0x0 if cancel_cb is None else wrap_cancel_cb)
+        cancel_cb_wrapper = EmbCancelCallback() if cancel_cb is None else EmbCancelCallback(wrap_cancel_cb)

         # generate the embeddings
         embedding_ptr = llmodel.llmodel_embed(
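The cancel_cb fixup above leans on a ctypes detail: calling a function-pointer type with no arguments yields a NULL function pointer, which is the idiomatic way to express "no callback" to C code. A small self-contained sketch, with an illustrative prototype rather than the exact EmbCancelCallback one:

import ctypes

# Illustrative callback prototype in the spirit of EmbCancelCallback, not copied from the bindings.
CancelCB = ctypes.CFUNCTYPE(ctypes.c_bool, ctypes.POINTER(ctypes.c_uint), ctypes.c_uint, ctypes.c_char_p)

null_cb = CancelCB()  # no-argument construction -> NULL function pointer; bool(null_cb) is False

def my_cancel(batch_sizes, n_batch, backend):
    # Arbitrary example policy: cancel once more than 4 batches have been submitted.
    return n_batch > 4

real_cb = CancelCB(my_cancel)  # wrapping a Python callable; keep this object alive while C may call it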

@@ -226,6 +226,16 @@ class GPT4All:
         """Delete the model instance and free associated system resources."""
         self.model.close()

+    @property
+    def backend(self) -> Literal["cpu", "kompute", "metal"]:
+        """The name of the llama.cpp backend currently in use. One of "cpu", "kompute", or "metal"."""
+        return self.model.backend
+
+    @property
+    def device(self) -> str | None:
+        """The name of the GPU device currently in use, or None for backends other than Kompute."""
+        return self.model.device
+
     @property
     def current_chat_session(self) -> list[MessageType] | None:
         return None if self._history is None else list(self._history)
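With the segfault fix, reading these properties on a closed model fails loudly instead of dereferencing a freed handle. A sketch of the expected behaviour (the model filename is again a placeholder):

from gpt4all import GPT4All

model = GPT4All("example-model.Q4_0.gguf")  # placeholder model name
print(model.backend, model.device)

model.close()
try:
    _ = model.backend  # the underlying LLModel handle is gone
except ValueError as exc:
    print(exc)         # "Attempted operation on a closed LLModel"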

@@ -68,7 +68,7 @@ def get_long_description():

 setup(
     name=package_name,
-    version="2.5.2",
+    version="2.6.0",
     description="Python bindings for GPT4All",
     long_description=get_long_description(),
     long_description_content_type="text/markdown",
