python: add list_gpus to the GPT4All API (#2194)

Other changes:
* fix memory leak in llmodel_available_gpu_devices
* drop model argument from llmodel_available_gpu_devices
* breaking: make GPT4All/Embed4All arguments past model_name keyword-only

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
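
A quick sketch of what this changes for callers; the model file name is only a placeholder, any GGUF model supported by GPT4All works the same way:

    from gpt4all import GPT4All

    # New: enumerate GPU devices without constructing a model.
    print(GPT4All.list_gpus())   # raises ValueError if the backend finds no GPU at all

    # Breaking: everything after model_name must now be passed by keyword.
    model = GPT4All("orca-mini-3b-gguf2-q4_0.gguf", allow_download=True)  # OK
    # GPT4All("orca-mini-3b-gguf2-q4_0.gguf", None, None, True)           # now a TypeError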
parent 790320e170
commit 1b84a48c47

@@ -213,9 +213,9 @@ LLModel *LLModel::Implementation::constructDefaultLlama() {
     return llama.get();
 }
 
-std::vector<LLModel::GPUDevice> LLModel::Implementation::availableGPUDevices() {
+std::vector<LLModel::GPUDevice> LLModel::Implementation::availableGPUDevices(size_t memoryRequired) {
     auto *llama = constructDefaultLlama();
-    if (llama) { return llama->availableGPUDevices(0); }
+    if (llama) { return llama->availableGPUDevices(memoryRequired); }
     return {};
 }

@@ -38,7 +38,7 @@ public:
         std::string_view buildVariant() const { return m_buildVariant; }
 
         static LLModel *construct(const std::string &modelPath, std::string buildVariant = "auto", int n_ctx = 2048);
-        static std::vector<GPUDevice> availableGPUDevices();
+        static std::vector<GPUDevice> availableGPUDevices(size_t memoryRequired = 0);
         static int32_t maxContextLength(const std::string &modelPath);
         static int32_t layerCount(const std::string &modelPath);
         static bool isEmbeddingModel(const std::string &modelPath);

@@ -4,6 +4,7 @@
 #include <cerrno>
 #include <cstring>
 #include <iostream>
+#include <memory>
 #include <optional>
 #include <utility>
@@ -221,28 +222,45 @@ const char *llmodel_get_implementation_search_path()
     return LLModel::Implementation::implementationsSearchPath().c_str();
 }
 
-struct llmodel_gpu_device* llmodel_available_gpu_devices(llmodel_model model, size_t memoryRequired, int* num_devices)
-{
-    auto *wrapper = static_cast<LLModelWrapper *>(model);
-    std::vector<LLModel::GPUDevice> devices = wrapper->llModel->availableGPUDevices(memoryRequired);
+// RAII wrapper around a C-style struct
+struct llmodel_gpu_device_cpp: llmodel_gpu_device {
+    llmodel_gpu_device_cpp() = default;
 
-    // Set the num_devices
-    *num_devices = devices.size();
+    llmodel_gpu_device_cpp(const llmodel_gpu_device_cpp &) = delete;
+    llmodel_gpu_device_cpp(      llmodel_gpu_device_cpp &&) = delete;
 
-    if (*num_devices == 0) return nullptr;  // Return nullptr if no devices are found
+    const llmodel_gpu_device_cpp &operator=(const llmodel_gpu_device_cpp &) = delete;
+          llmodel_gpu_device_cpp &operator=(      llmodel_gpu_device_cpp &&) = delete;
 
-    // Allocate memory for the output array
-    struct llmodel_gpu_device* output = (struct llmodel_gpu_device*) malloc(*num_devices * sizeof(struct llmodel_gpu_device));
+    ~llmodel_gpu_device_cpp() {
+        free(const_cast<char *>(name));
+        free(const_cast<char *>(vendor));
+    }
+};
 
-    for (int i = 0; i < *num_devices; i++) {
-        output[i].index = devices[i].index;
-        output[i].type = devices[i].type;
-        output[i].heapSize = devices[i].heapSize;
-        output[i].name = strdup(devices[i].name.c_str()); // Convert std::string to char* and allocate memory
-        output[i].vendor = strdup(devices[i].vendor.c_str()); // Convert std::string to char* and allocate memory
+static_assert(sizeof(llmodel_gpu_device_cpp) == sizeof(llmodel_gpu_device));
+
+struct llmodel_gpu_device *llmodel_available_gpu_devices(size_t memoryRequired, int *num_devices)
+{
+    static thread_local std::unique_ptr<llmodel_gpu_device_cpp[]> c_devices;
+
+    auto devices = LLModel::Implementation::availableGPUDevices(memoryRequired);
+    *num_devices = devices.size();
+
+    if (devices.empty()) { return nullptr; /* no devices */ }
+
+    c_devices = std::make_unique<llmodel_gpu_device_cpp[]>(devices.size());
+    for (unsigned i = 0; i < devices.size(); i++) {
+        const auto &dev = devices[i];
+        auto &cdev = c_devices[i];
+        cdev.index    = dev.index;
+        cdev.type     = dev.type;
+        cdev.heapSize = dev.heapSize;
+        cdev.name     = strdup(dev.name.c_str());
+        cdev.vendor   = strdup(dev.vendor.c_str());
     }
 
-    return output;
+    return c_devices.get();
 }
 
 bool llmodel_gpu_init_gpu_device_by_string(llmodel_model model, size_t memoryRequired, const char *device)

@@ -48,9 +48,9 @@ struct llmodel_prompt_context {
 };
 
 struct llmodel_gpu_device {
-    int index = 0;
-    int type = 0;           // same as VkPhysicalDeviceType
-    size_t heapSize = 0;
+    int index;
+    int type;               // same as VkPhysicalDeviceType
+    size_t heapSize;
     const char * name;
     const char * vendor;
 };
@@ -241,9 +241,10 @@ const char *llmodel_get_implementation_search_path();
 
 /**
  * Get a list of available GPU devices given the memory required.
+ * @param memoryRequired The minimum amount of VRAM, in bytes
  * @return A pointer to an array of llmodel_gpu_device's whose number is given by num_devices.
  */
-struct llmodel_gpu_device* llmodel_available_gpu_devices(llmodel_model model, size_t memoryRequired, int* num_devices);
+struct llmodel_gpu_device* llmodel_available_gpu_devices(size_t memoryRequired, int* num_devices);
 
 /**
  * Initializes a GPU device based on a specified string criterion.

@@ -138,7 +138,7 @@ llmodel.llmodel_threadCount.restype = ctypes.c_int32
 
 llmodel.llmodel_set_implementation_search_path(str(MODEL_LIB_PATH).encode())
 
-llmodel.llmodel_available_gpu_devices.argtypes = [ctypes.c_void_p, ctypes.c_size_t, ctypes.POINTER(ctypes.c_int32)]
+llmodel.llmodel_available_gpu_devices.argtypes = [ctypes.c_size_t, ctypes.POINTER(ctypes.c_int32)]
 llmodel.llmodel_available_gpu_devices.restype = ctypes.POINTER(LLModelGPUDevice)
 
 llmodel.llmodel_gpu_init_gpu_device_by_string.argtypes = [ctypes.c_void_p, ctypes.c_size_t, ctypes.c_char_p]
@@ -214,13 +214,22 @@ class LLModel:
     def _raise_closed(self) -> NoReturn:
         raise ValueError("Attempted operation on a closed LLModel")
 
-    def _list_gpu(self, mem_required: int) -> list[LLModelGPUDevice]:
-        assert self.model is not None
+    @staticmethod
+    def list_gpus(mem_required: int = 0) -> list[str]:
+        """
+        List the names of the available GPU devices with at least `mem_required` bytes of VRAM.
+
+        Args:
+            mem_required: The minimum amount of VRAM, in bytes
+
+        Returns:
+            A list of strings representing the names of the available GPU devices.
+        """
         num_devices = ctypes.c_int32(0)
-        devices_ptr = llmodel.llmodel_available_gpu_devices(self.model, mem_required, ctypes.byref(num_devices))
+        devices_ptr = llmodel.llmodel_available_gpu_devices(mem_required, ctypes.byref(num_devices))
         if not devices_ptr:
             raise ValueError("Unable to retrieve available GPU devices")
-        return devices_ptr[:num_devices.value]
+        return [d.name.decode() for d in devices_ptr[:num_devices.value]]
 
     def init_gpu(self, device: str):
         if self.model is None:
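
Because the C function no longer takes a model handle, LLModel.list_gpus is a staticmethod and can be called before any model is constructed. A small sketch of the memory filter; the 4 GiB threshold is an arbitrary example, and note that the method raises ValueError rather than returning an empty list when nothing qualifies:

    from gpt4all._pyllmodel import LLModel

    print(LLModel.list_gpus())                 # every GPU the backend can see
    try:
        print(LLModel.list_gpus(4 * 1024**3))  # only GPUs with at least 4 GiB of VRAM
    except ValueError:
        print("no GPU meets the memory requirement")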
@@ -231,23 +240,13 @@ class LLModel:
         if llmodel.llmodel_gpu_init_gpu_device_by_string(self.model, mem_required, device.encode()):
             return
 
-        # Retrieve all GPUs without considering memory requirements.
-        num_devices = ctypes.c_int32(0)
-        all_devices_ptr = llmodel.llmodel_available_gpu_devices(self.model, 0, ctypes.byref(num_devices))
-        if not all_devices_ptr:
-            raise ValueError("Unable to retrieve list of all GPU devices")
-        all_gpus = [d.name.decode() for d in all_devices_ptr[:num_devices.value]]
-
-        # Retrieve GPUs that meet the memory requirements using list_gpu
-        available_gpus = [device.name.decode() for device in self._list_gpu(mem_required)]
-
-        # Identify GPUs that are unavailable due to insufficient memory or features
+        all_gpus = self.list_gpus()
+        available_gpus = self.list_gpus(mem_required)
         unavailable_gpus = set(all_gpus).difference(available_gpus)
 
-        # Formulate the error message
-        error_msg = "Unable to initialize model on GPU: '{}'.".format(device)
-        error_msg += "\nAvailable GPUs: {}.".format(available_gpus)
-        error_msg += "\nUnavailable GPUs due to insufficient memory or features: {}.".format(unavailable_gpus)
+        error_msg = "Unable to initialize model on GPU: {!r}".format(device)
+        error_msg += "\nAvailable GPUs: {}".format(available_gpus)
+        error_msg += "\nUnavailable GPUs due to insufficient memory or features: {}".format(unavailable_gpus)
         raise ValueError(error_msg)
 
     def load_model(self) -> bool:
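
With the simplified error path above, a failed GPU initialization surfaces as a single ValueError that lists both the usable and the unusable devices. A sketch of what a caller sees; the model and device names are placeholders:

    from gpt4all import GPT4All

    try:
        model = GPT4All("orca-mini-3b-gguf2-q4_0.gguf", device="NVIDIA GeForce RTX 3060")
    except ValueError as e:
        # "Unable to initialize model on GPU: 'NVIDIA GeForce RTX 3060'
        #  Available GPUs: [...]
        #  Unavailable GPUs due to insufficient memory or features: [...]"
        print(e)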

@@ -19,8 +19,7 @@ from requests.exceptions import ChunkedEncodingError
 from tqdm import tqdm
 from urllib3.exceptions import IncompleteRead, ProtocolError
 
-from . import _pyllmodel
-from ._pyllmodel import EmbedResult as EmbedResult
+from ._pyllmodel import EmbedResult as EmbedResult, LLModel, ResponseCallbackType, empty_response_callback
 
 if TYPE_CHECKING:
     from typing_extensions import Self, TypeAlias
@@ -44,16 +44,18 @@ class Embed4All:
     MIN_DIMENSIONALITY = 64
 
-    def __init__(self, model_name: str | None = None, n_threads: int | None = None, **kwargs):
+    def __init__(self, model_name: str | None = None, *, n_threads: int | None = None, device: str | None = "cpu", **kwargs: Any):
         """
         Constructor
 
         Args:
             n_threads: number of CPU threads used by GPT4All. Default is None, then the number of threads are determined automatically.
+            device: The processing unit on which the embedding model will run. See the `GPT4All` constructor for more info.
+            kwargs: Remaining keyword arguments are passed to the `GPT4All` constructor.
         """
         if model_name is None:
             model_name = 'all-MiniLM-L6-v2.gguf2.f16.gguf'
-        self.gpt4all = GPT4All(model_name, n_threads=n_threads, **kwargs)
+        self.gpt4all = GPT4All(model_name, n_threads=n_threads, device=device, **kwargs)
 
     def __enter__(self) -> Self:
         return self
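
Embed4All now forwards a keyword-only device argument to the underlying GPT4All instance and keeps "cpu" as its default. A minimal sketch, assuming the default all-MiniLM-L6-v2 embedding model can be downloaded:

    from gpt4all import Embed4All

    embedder = Embed4All(device="gpu")   # run the embedding model on the best available GPU
    vector = embedder.embed("The quick brown fox jumps over the lazy dog")
    print(len(vector))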
@@ -157,6 +158,7 @@ class GPT4All:
     def __init__(
         self,
         model_name: str,
+        *,
         model_path: str | os.PathLike[str] | None = None,
         model_type: str | None = None,
         allow_download: bool = True,
@@ -181,7 +183,7 @@
                 - "cpu": Model will run on the central processing unit.
                 - "gpu": Model will run on the best available graphics processing unit, irrespective of its vendor.
                 - "amd", "nvidia", "intel": Model will run on the best available GPU from the specified vendor.
-                Alternatively, a specific GPU name can also be provided, and the model will run on the GPU that matches the name if it's available.
+                - A specific device name from the list returned by `GPT4All.list_gpus()`.
                 Default is "cpu".
 
                 Note: If a selected GPU device does not have sufficient RAM to accommodate the model, an error will be thrown, and the GPT4All instance will be rendered invalid. It's advised to ensure the device has enough memory before initiating the model.
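
Since the docstring now points at GPT4All.list_gpus() as the source of valid device names, selecting a specific GPU looks like the sketch below; the device names in the comment are illustrative:

    from gpt4all import GPT4All

    gpus = GPT4All.list_gpus()   # e.g. ['NVIDIA GeForce RTX 3060', 'Intel(R) UHD Graphics']
    model = GPT4All("orca-mini-3b-gguf2-q4_0.gguf", device=gpus[0])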
@@ -192,7 +194,7 @@
         self.model_type = model_type
         # Retrieve model and download if allowed
         self.config: ConfigType = self.retrieve_model(model_name, model_path=model_path, allow_download=allow_download, verbose=verbose)
-        self.model = _pyllmodel.LLModel(self.config["path"], n_ctx, ngl)
+        self.model = LLModel(self.config["path"], n_ctx, ngl)
         if device is not None and device != "cpu":
             self.model.init_gpu(device)
         self.model.load_model()
@@ -419,19 +421,19 @@ class GPT4All:
     def generate(
         self, prompt: str, *, max_tokens: int = ..., temp: float = ..., top_k: int = ..., top_p: float = ...,
         min_p: float = ..., repeat_penalty: float = ..., repeat_last_n: int = ..., n_batch: int = ...,
-        n_predict: int | None = ..., streaming: Literal[False] = ..., callback: _pyllmodel.ResponseCallbackType = ...,
+        n_predict: int | None = ..., streaming: Literal[False] = ..., callback: ResponseCallbackType = ...,
     ) -> str: ...
 
     @overload
     def generate(
         self, prompt: str, *, max_tokens: int = ..., temp: float = ..., top_k: int = ..., top_p: float = ...,
         min_p: float = ..., repeat_penalty: float = ..., repeat_last_n: int = ..., n_batch: int = ...,
-        n_predict: int | None = ..., streaming: Literal[True], callback: _pyllmodel.ResponseCallbackType = ...,
+        n_predict: int | None = ..., streaming: Literal[True], callback: ResponseCallbackType = ...,
     ) -> Iterable[str]: ...
 
     @overload
     def generate(
         self, prompt: str, *, max_tokens: int = ..., temp: float = ..., top_k: int = ..., top_p: float = ...,
         min_p: float = ..., repeat_penalty: float = ..., repeat_last_n: int = ..., n_batch: int = ...,
-        n_predict: int | None = ..., streaming: bool, callback: _pyllmodel.ResponseCallbackType = ...,
+        n_predict: int | None = ..., streaming: bool, callback: ResponseCallbackType = ...,
     ) -> Any: ...
 
     def generate(
@@ -448,7 +450,7 @@ class GPT4All:
         n_batch: int = 8,
         n_predict: int | None = None,
         streaming: bool = False,
-        callback: _pyllmodel.ResponseCallbackType = _pyllmodel.empty_response_callback,
+        callback: ResponseCallbackType = empty_response_callback,
     ) -> Any:
         """
         Generate outputs from any GPT4All model.
@@ -494,7 +496,7 @@ class GPT4All:
         if reset:
             # ingest system prompt
             self.model.prompt_model(self._history[0]["content"], "%1",
-                                    _pyllmodel.empty_response_callback,
+                                    empty_response_callback,
                                     n_batch=n_batch, n_predict=0, special=True)
             prompt_template = self._current_prompt_template.format("%1", "%2")
         else:
@@ -523,9 +525,9 @@ class GPT4All:
             output_collector = self._history
 
         def _callback_wrapper(
-            callback: _pyllmodel.ResponseCallbackType,
+            callback: ResponseCallbackType,
             output_collector: list[MessageType],
-        ) -> _pyllmodel.ResponseCallbackType:
+        ) -> ResponseCallbackType:
 
             def _callback(token_id: int, response: str) -> bool:
                 nonlocal callback, output_collector
@@ -589,6 +591,16 @@ class GPT4All:
         self._history = None
         self._current_prompt_template = "{0}"
 
+    @staticmethod
+    def list_gpus() -> list[str]:
+        """
+        List the names of the available GPU devices.
+
+        Returns:
+            A list of strings representing the names of the available GPU devices.
+        """
+        return LLModel.list_gpus()
+
     def _format_chat_prompt_template(
         self,
         messages: list[MessageType],
@@ -598,6 +610,9 @@ class GPT4All:
         """
         Helper method for building a prompt from list of messages using the self._current_prompt_template as a template for each message.
 
+        Warning:
+            This function was deprecated in version 2.3.0, and will be removed in a future release.
+
         Args:
             messages: List of dictionaries. Each dictionary should have a "role" key
                 with value of "system", "assistant", or "user" and a "content" key with a

@@ -68,7 +68,7 @@ def get_long_description():
 
 setup(
     name=package_name,
-    version="2.3.3",
+    version="2.4.0",
     description="Python bindings for GPT4All",
     long_description=get_long_description(),
     long_description_content_type="text/markdown",

@@ -36,7 +36,7 @@ Napi::Value NodeModelWrapper::GetGpuDevices(const Napi::CallbackInfo &info)
     auto env = info.Env();
     int num_devices = 0;
     auto mem_size = llmodel_required_mem(GetInference(), full_model_path.c_str(), nCtx, nGpuLayers);
-    llmodel_gpu_device *all_devices = llmodel_available_gpu_devices(GetInference(), mem_size, &num_devices);
+    llmodel_gpu_device *all_devices = llmodel_available_gpu_devices(mem_size, &num_devices);
     if (all_devices == nullptr)
     {
         Napi::Error::New(env, "Unable to retrieve list of all GPU devices").ThrowAsJavaScriptException();
