diff --git a/LLama.Web/Common/ModelOptions.cs b/LLama.Web/Common/ModelOptions.cs
index 3d88d6b31..e8b89dee8 100644
--- a/LLama.Web/Common/ModelOptions.cs
+++ b/LLama.Web/Common/ModelOptions.cs
@@ -84,7 +84,7 @@ public class ModelOptions : IModelParams
/// <summary>
/// how split tensors should be distributed across GPUs
/// </summary>
- public nint TensorSplits { get; set; }
+ public float[] TensorSplits { get; set; }
/// <summary>
/// Grouped-Query Attention
diff --git a/LLama/Abstractions/IModelParams.cs b/LLama/Abstractions/IModelParams.cs
index 40c5432b7..fdc911521 100644
--- a/LLama/Abstractions/IModelParams.cs
+++ b/LLama/Abstractions/IModelParams.cs
@@ -93,7 +93,7 @@ public interface IModelParams
/// <summary>
/// how split tensors should be distributed across GPUs
/// </summary>
- nint TensorSplits { get; set; }
+ float[]? TensorSplits { get; set; }
/// <summary>
/// Grouped-Query Attention
diff --git a/LLama/Common/ModelParams.cs b/LLama/Common/ModelParams.cs
index 72c779379..5cb810783 100644
--- a/LLama/Common/ModelParams.cs
+++ b/LLama/Common/ModelParams.cs
@@ -1,14 +1,13 @@
using LLama.Abstractions;
using System;
-using System.Collections.Generic;
-using System.Text;
namespace LLama.Common
{
/// <summary>
/// The parameters for initializing a LLama model.
/// </summary>
- public class ModelParams : IModelParams
+ public class ModelParams
+ : IModelParams
{
/// <summary>
/// Model context size (n_ctx)
@@ -85,7 +84,7 @@ public class ModelParams : IModelParams
/// <summary>
/// how split tensors should be distributed across GPUs
/// </summary>
- public nint TensorSplits { get; set; }
+ public float[]? TensorSplits { get; set; }
/// <summary>
/// Grouped-Query Attention
diff --git a/LLama/Extensions/IModelParamsExtensions.cs b/LLama/Extensions/IModelParamsExtensions.cs
new file mode 100644
index 000000000..0cd24cff9
--- /dev/null
+++ b/LLama/Extensions/IModelParamsExtensions.cs
@@ -0,0 +1,54 @@
+using System.IO;
+using System;
+using System.Buffers;
+using LLama.Abstractions;
+using LLama.Native;
+
+namespace LLama.Extensions
+{
+    internal static class IModelParamsExtensions
+    {
+        /// <summary>
+        /// Convert the given `IModelParams` into a `LLamaContextParams`
+        /// </summary>
+        /// <param name="params">the model parameters to convert</param>
+        /// <param name="result">receives the populated native context parameters</param>
+        /// <returns>a pinned handle over <c>TensorSplits</c>; dispose it once the native call has consumed <c>tensor_split</c></returns>
+        /// <exception cref="FileNotFoundException">the model file does not exist</exception>
+        /// <exception cref="ArgumentException">more than one tensor split was supplied</exception>
+        public static MemoryHandle ToLlamaContextParams(this IModelParams @params, out LLamaContextParams result)
+        {
+            if (!File.Exists(@params.ModelPath))
+                throw new FileNotFoundException($"The model file does not exist: {@params.ModelPath}");
+
+            if (@params.TensorSplits != null && @params.TensorSplits.Length != 1)
+                throw new ArgumentException("Currently multi-gpu support is not supported by both llama.cpp and LLamaSharp.");
+
+            result = NativeApi.llama_context_default_params();
+            result.n_ctx = @params.ContextSize;
+            result.n_batch = @params.BatchSize;
+            result.main_gpu = @params.MainGpu;
+            result.n_gpu_layers = @params.GpuLayerCount;
+            result.seed = @params.Seed;
+            result.f16_kv = @params.UseFp16Memory;
+            result.use_mmap = @params.UseMemorymap; // NOTE(review): was UseMemoryLock (copy-paste from the line below) — confirm property name on IModelParams
+            result.use_mlock = @params.UseMemoryLock;
+            result.logits_all = @params.Perplexity;
+            result.embedding = @params.EmbeddingMode;
+            result.low_vram = @params.LowVram;
+            result.n_gqa = @params.GroupedQueryAttention;
+            result.rms_norm_eps = @params.RmsNormEpsilon;
+            result.rope_freq_base = @params.RopeFrequencyBase;
+            result.rope_freq_scale = @params.RopeFrequencyScale;
+            result.mul_mat_q = @params.MulMatQ;
+
+            var pin = @params.TensorSplits.AsMemory().Pin(); // null array yields a default (null-pointer) handle, which llama.cpp treats as "no split"
+            unsafe
+            {
+                result.tensor_split = (nint)pin.Pointer;
+            }
+
+            return pin;
+        }
+    }
+}
diff --git a/LLama/Utils.cs b/LLama/Utils.cs
index 1454693fd..391a5cc14 100644
--- a/LLama/Utils.cs
+++ b/LLama/Utils.cs
@@ -1,12 +1,12 @@
using LLama.Abstractions;
-using LLama.Exceptions;
using LLama.Native;
using System;
using System.Collections.Generic;
-using System.IO;
using System.Linq;
using System.Runtime.InteropServices;
using System.Text;
+using LLama.Exceptions;
+using LLama.Extensions;
namespace LLama
{
@@ -15,46 +15,16 @@ public static class Utils
{
public static SafeLLamaContextHandle InitLLamaContextFromModelParams(IModelParams @params)
{
- var lparams = NativeApi.llama_context_default_params();
-
- lparams.n_ctx = @params.ContextSize;
- lparams.n_batch = @params.BatchSize;
- lparams.main_gpu = @params.MainGpu;
- lparams.n_gpu_layers = @params.GpuLayerCount;
- lparams.seed = @params.Seed;
- lparams.f16_kv = @params.UseFp16Memory;
- lparams.use_mmap = @params.UseMemoryLock;
- lparams.use_mlock = @params.UseMemoryLock;
- lparams.logits_all = @params.Perplexity;
- lparams.embedding = @params.EmbeddingMode;
- lparams.low_vram = @params.LowVram;
- lparams.n_gqa = @params.GroupedQueryAttention;
- lparams.rms_norm_eps = @params.RmsNormEpsilon;
- lparams.rope_freq_base = @params.RopeFrequencyBase;
- lparams.rope_freq_scale = @params.RopeFrequencyScale;
- lparams.mul_mat_q = @params.MulMatQ;
-
- /*
- if (@params.TensorSplits.Length != 1)
+ using (@params.ToLlamaContextParams(out var lparams))
{
- throw new ArgumentException("Currently multi-gpu support is not supported by " +
- "both llama.cpp and LLamaSharp.");
- }*/
+ var model = SafeLlamaModelHandle.LoadFromFile(@params.ModelPath, lparams);
+ var ctx = SafeLLamaContextHandle.Create(model, lparams);
- lparams.tensor_split = @params.TensorSplits;
+ if (!string.IsNullOrEmpty(@params.LoraAdapter))
+ model.ApplyLoraFromFile(@params.LoraAdapter, @params.LoraBase, @params.Threads);
- if (!File.Exists(@params.ModelPath))
- {
- throw new FileNotFoundException($"The model file does not exist: {@params.ModelPath}");
+ return ctx;
}
-
- var model = SafeLlamaModelHandle.LoadFromFile(@params.ModelPath, lparams);
- var ctx = SafeLLamaContextHandle.Create(model, lparams);
-
- if (!string.IsNullOrEmpty(@params.LoraAdapter))
- model.ApplyLoraFromFile(@params.LoraAdapter, @params.LoraBase, @params.Threads);
-
- return ctx;
}
public static IEnumerable Tokenize(SafeLLamaContextHandle ctx, string text, bool add_bos, Encoding encoding)