diff --git a/.github/prepare_release.sh b/.github/prepare_release.sh index e44099978..bfc42ed74 100755 --- a/.github/prepare_release.sh +++ b/.github/prepare_release.sh @@ -63,7 +63,7 @@ elif [[ $type == "patch" ]]; then exit 1 fi else - echo "Invalid type" + echo "Invalid type" exit 1 fi @@ -71,6 +71,7 @@ cd .. # pack the main package dotnet pack ./LLama/LLamaSharp.csproj -c Release -o ./temp/ /p:PackageVersion=$updated_version /p:Version=$updated_version; dotnet pack ./LLama.SemanticKernel/LLamaSharp.SemanticKernel.csproj -c Release -o ./temp/ /p:PackageVersion=$updated_version /p:Version=$updated_version; +dotnet pack ./LLama.KernelMemory/LLamaSharp.KernelMemory.csproj -c Release -o ./temp/ /p:PackageVersion=$updated_version /p:Version=$updated_version; # pack the backends cd temp diff --git a/.github/workflows/release-minor.yml b/.github/workflows/release-minor.yml index e54e889e8..e6fed26cf 100644 --- a/.github/workflows/release-minor.yml +++ b/.github/workflows/release-minor.yml @@ -32,6 +32,7 @@ jobs: run: | dotnet build ./LLama/LLamaSharp.csproj -c Release --no-restore dotnet build ./LLama.SemanticKernel/LLamaSharp.SemanticKernel.csproj -c Release --no-restore + dotnet build ./LLama.KernelMemory/LLamaSharp.KernelMemory.csproj -c Release --no-restore - name: Pack packages run: | diff --git a/.github/workflows/release-patch.yml b/.github/workflows/release-patch.yml index 1a59ac9cc..2e5bd9442 100644 --- a/.github/workflows/release-patch.yml +++ b/.github/workflows/release-patch.yml @@ -32,6 +32,7 @@ jobs: run: | dotnet build ./LLama/LLamaSharp.csproj -c Release --no-restore dotnet build ./LLama.SemanticKernel/LLamaSharp.SemanticKernel.csproj -c Release --no-restore + dotnet build ./LLama.KernelMemory/LLamaSharp.KernelMemory.csproj -c Release --no-restore - name: Pack packages run: | diff --git a/LLama.Examples/Program.cs b/LLama.Examples/Program.cs index d9c8a8904..0f180053f 100644 --- a/LLama.Examples/Program.cs +++ b/LLama.Examples/Program.cs @@ -7,6 +7,7 @@ Console.WriteLine("======================================================================================================"); +NativeLibraryConfig.Default.WithCuda().WithLogs(); NativeApi.llama_empty_call(); Console.WriteLine(); diff --git a/LLama.KernelMemory/LLamaSharp.KernelMemory.csproj b/LLama.KernelMemory/LLamaSharp.KernelMemory.csproj index 7fd99e2cb..c26471a1c 100644 --- a/LLama.KernelMemory/LLamaSharp.KernelMemory.csproj +++ b/LLama.KernelMemory/LLamaSharp.KernelMemory.csproj @@ -1,11 +1,12 @@ - net6.0 + net6.0;net7.0 enable enable 0.7.1 + 0.8.0 Xbotter SciSharp STACK true diff --git a/LLama.SemanticKernel/LLamaSharp.SemanticKernel.csproj b/LLama.SemanticKernel/LLamaSharp.SemanticKernel.csproj index c6ece4e7e..c752aca7d 100644 --- a/LLama.SemanticKernel/LLamaSharp.SemanticKernel.csproj +++ b/LLama.SemanticKernel/LLamaSharp.SemanticKernel.csproj @@ -10,7 +10,7 @@ enable enable - 0.7.1 + 0.8.0 Tim Miller, Xbotter SciSharp STACK true diff --git a/LLama/AntipromptProcessor.cs b/LLama/AntipromptProcessor.cs index 4d969cea2..22df99936 100644 --- a/LLama/AntipromptProcessor.cs +++ b/LLama/AntipromptProcessor.cs @@ -1,66 +1,67 @@ using System; using System.Collections.Generic; -namespace LLama; - -internal sealed class AntipromptProcessor +namespace LLama { - private int _longestAntiprompt; - private readonly List _antiprompts = new(); - - private string? _string; - - public AntipromptProcessor(IEnumerable? 
antiprompts = null) + internal sealed class AntipromptProcessor { - if (antiprompts != null) - SetAntiprompts(antiprompts); - } + private int _longestAntiprompt; + private readonly List _antiprompts = new(); - /// - /// Add an antiprompt to the collection - /// - /// - public void AddAntiprompt(string antiprompt) - { - _antiprompts.Add(antiprompt); - _longestAntiprompt = Math.Max(_longestAntiprompt, antiprompt.Length); - } + private string? _string; - /// - /// Overwrite all current antiprompts with a new set - /// - /// - public void SetAntiprompts(IEnumerable antiprompts) - { - _antiprompts.Clear(); - _antiprompts.AddRange(antiprompts); + public AntipromptProcessor(IEnumerable? antiprompts = null) + { + if (antiprompts != null) + SetAntiprompts(antiprompts); + } - _longestAntiprompt = 0; - foreach (var antiprompt in _antiprompts) + /// + /// Add an antiprompt to the collection + /// + /// + public void AddAntiprompt(string antiprompt) + { + _antiprompts.Add(antiprompt); _longestAntiprompt = Math.Max(_longestAntiprompt, antiprompt.Length); - } + } - /// - /// Add some text and check if the buffer now ends with any antiprompt - /// - /// - /// true if the text buffer ends with any antiprompt - public bool Add(string text) - { - _string += text; + /// + /// Overwrite all current antiprompts with a new set + /// + /// + public void SetAntiprompts(IEnumerable antiprompts) + { + _antiprompts.Clear(); + _antiprompts.AddRange(antiprompts); + + _longestAntiprompt = 0; + foreach (var antiprompt in _antiprompts) + _longestAntiprompt = Math.Max(_longestAntiprompt, antiprompt.Length); + } + + /// + /// Add some text and check if the buffer now ends with any antiprompt + /// + /// + /// true if the text buffer ends with any antiprompt + public bool Add(string text) + { + _string += text; - // When the string gets very long (4x antiprompt length) trim it down (to 2x antiprompt length). - // This trimming leaves a lot of extra characters because two sequences can be considered "equal" in unicode - // even with different numbers of characters. Hopefully there are enough characters here to handle all those weird circumstances! - var maxLength = Math.Max(32, _longestAntiprompt * 4); - var trimLength = Math.Max(16, _longestAntiprompt * 2); - if (_string.Length > maxLength) - _string = _string.Substring(_string.Length - trimLength); + // When the string gets very long (4x antiprompt length) trim it down (to 2x antiprompt length). + // This trimming leaves a lot of extra characters because two sequences can be considered "equal" in unicode + // even with different numbers of characters. Hopefully there are enough characters here to handle all those weird circumstances! 
+ var maxLength = Math.Max(32, _longestAntiprompt * 4); + var trimLength = Math.Max(16, _longestAntiprompt * 2); + if (_string.Length > maxLength) + _string = _string.Substring(_string.Length - trimLength); - foreach (var antiprompt in _antiprompts) - if (_string.EndsWith(antiprompt, StringComparison.CurrentCulture)) - return true; + foreach (var antiprompt in _antiprompts) + if (_string.EndsWith(antiprompt, StringComparison.CurrentCulture)) + return true; - return false; + return false; + } } } \ No newline at end of file diff --git a/LLama/LLamaSharp.Runtime.targets b/LLama/LLamaSharp.Runtime.targets index c14f0ffab..9085e5f7f 100644 --- a/LLama/LLamaSharp.Runtime.targets +++ b/LLama/LLamaSharp.Runtime.targets @@ -5,39 +5,39 @@ PreserveNewest - libllama.dll + runtimes/win-x64/native/libllama.dll PreserveNewest - libllama-cuda11.dll + runtimes/win-x64/native/cuda11/libllama.dll PreserveNewest - libllama-cuda12.dll + runtimes/win-x64/native/cuda12/libllama.dll PreserveNewest - libllama.so + runtimes/linux-x64/native/libllama.so PreserveNewest - libllama-cuda11.so + runtimes/linux-x64/native/cuda11/libllama.so PreserveNewest - libllama-cuda12.so + runtimes/linux-x64/native/cuda12/libllama.so PreserveNewest - runtimes/osx-arm64/libllama.dylib + runtimes/osx-arm64/native/libllama.dylib PreserveNewest - runtimes/osx-arm64/ggml-metal.metal + runtimes/osx-arm64/native/ggml-metal.metal PreserveNewest - runtimes/osx-x64/libllama.dylib + runtimes/osx-x64/native/libllama.dylib \ No newline at end of file diff --git a/LLama/LLamaSharp.csproj b/LLama/LLamaSharp.csproj index d525202f2..18ea30804 100644 --- a/LLama/LLamaSharp.csproj +++ b/LLama/LLamaSharp.csproj @@ -7,7 +7,7 @@ AnyCPU;x64;Arm64 True - 0.5.0 + 0.8.0 Yaohui Liu, Martin Evans, Haiping Chen SciSharp STACK true @@ -17,11 +17,11 @@ https://avatars3.githubusercontent.com/u/44989469?s=200&v=4 LLama, LLM, GPT, ChatGPT, NLP, AI, Chat Bot, SciSharp - The .NET binding of LLama.cpp, providing APIs to run the model and deploy it on Web. For model + The .NET binding of LLama.cpp, making LLM inference and deployment easy and fast. For model weights to run, please go to https://github.com/SciSharp/LLamaSharp for more information. - LLamaSharp 0.5.0 adds support for GGUF, grammar and integration with semantic-kernel. + LLamaSharp 0.8.0 supports automatically device feature detection, adds integration with kernel-memory and fixes some performance issues. MIT packages diff --git a/LLama/Native/NativeApi.Load.cs b/LLama/Native/NativeApi.Load.cs new file mode 100644 index 000000000..87f753244 --- /dev/null +++ b/LLama/Native/NativeApi.Load.cs @@ -0,0 +1,357 @@ +using LLama.Exceptions; +using Microsoft.Extensions.Logging; +using System; +using System.Collections.Generic; +using System.Diagnostics; +using System.IO; +using System.Runtime.InteropServices; +using System.Text; +using System.Text.Json; + +namespace LLama.Native +{ + public partial class NativeApi + { + static NativeApi() + { + // Try to load a preferred library, based on CPU feature detection + TryLoadLibrary(); + + try + { + llama_empty_call(); + } + catch (DllNotFoundException) + { + throw new RuntimeError("The native library cannot be correctly loaded. It could be one of the following reasons: \n" + + "1. No LLamaSharp backend was installed. Please search LLamaSharp.Backend and install one of them. \n" + + "2. You are using a device with only CPU but installed cuda backend. Please install cpu backend instead. \n" + + "3. One of the dependency of the native library is missed. 
Please use `ldd` on linux, `dumpbin` on windows and `otool`" + + "to check if all the dependency of the native library is satisfied. Generally you could find the libraries under your output folder.\n" + + "4. Try to compile llama.cpp yourself to generate a libllama library, then use `LLama.Native.NativeLibraryConfig.WithLibrary` " + + "to specify it at the very beginning of your code. For more informations about compilation, please refer to LLamaSharp repo on github.\n"); + } + llama_backend_init(false); + } + + private static void Log(string message, LogLevel level) + { + if (!enableLogging) return; + Debug.Assert(level is LogLevel.Information or LogLevel.Error or LogLevel.Warning); + ConsoleColor color; + string levelPrefix; + if (level == LogLevel.Information) + { + color = ConsoleColor.Green; + levelPrefix = "[Info]"; + } + else if (level == LogLevel.Error) + { + color = ConsoleColor.Red; + levelPrefix = "[Error]"; + } + else + { + color = ConsoleColor.Yellow; + levelPrefix = "[Error]"; + } + Console.ForegroundColor = color; + Console.WriteLine($"{loggingPrefix} {levelPrefix} {message}"); + Console.ResetColor(); + } + + private static int GetCudaMajorVersion() + { + string? cudaPath; + string version = ""; + if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows)) + { + cudaPath = Environment.GetEnvironmentVariable("CUDA_PATH"); + if (cudaPath is null) + { + return -1; + } + version = GetCudaVersionFromPath(cudaPath); + } + else if (RuntimeInformation.IsOSPlatform(OSPlatform.Linux)) + { + // Try the default first + cudaPath = "/usr/local/bin/cuda"; + version = GetCudaVersionFromPath(cudaPath); + if (string.IsNullOrEmpty(version)) + { + cudaPath = Environment.GetEnvironmentVariable("LD_LIBRARY_PATH"); + if (cudaPath is null) + { + return -1; + } + foreach (var path in cudaPath.Split(':')) + { + version = GetCudaVersionFromPath(Path.Combine(path, "..")); + if (string.IsNullOrEmpty(version)) + { + break; + } + } + } + } + + if (string.IsNullOrEmpty(version)) + { + return -1; + } + else + { + version = version.Split('.')[0]; + bool success = int.TryParse(version, out var majorVersion); + if (success) + { + return majorVersion; + } + else + { + return -1; + } + } + } + + private static string GetCudaVersionFromPath(string cudaPath) + { + try + { + string json = File.ReadAllText(Path.Combine(cudaPath, cudaVersionFile)); + using (JsonDocument document = JsonDocument.Parse(json)) + { + JsonElement root = document.RootElement; + JsonElement cublasNode = root.GetProperty("libcublas"); + JsonElement versionNode = cublasNode.GetProperty("version"); + if (versionNode.ValueKind == JsonValueKind.Undefined) + { + return string.Empty; + } + return versionNode.GetString(); + } + } + catch (Exception) + { + return string.Empty; + } + } + +#if NET6_0_OR_GREATER + private static string GetAvxLibraryPath(NativeLibraryConfig.AvxLevel avxLevel, string prefix, string suffix) + { + var avxStr = NativeLibraryConfig.AvxLevelToString(avxLevel); + if (!string.IsNullOrEmpty(avxStr)) + { + avxStr += "/"; + } + return $"{prefix}{avxStr}{libraryName}{suffix}"; + } + + private static List GetLibraryTryOrder(NativeLibraryConfig.Description configuration) + { + OSPlatform platform; + string prefix, suffix; + if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows)) + { + platform = OSPlatform.Windows; + prefix = "runtimes/win-x64/native/"; + suffix = ".dll"; + } + else if (RuntimeInformation.IsOSPlatform(OSPlatform.Linux)) + { + platform = OSPlatform.Linux; + prefix = "runtimes/linux-x64/native/"; + suffix = ".so"; + } + 
else if (RuntimeInformation.IsOSPlatform(OSPlatform.OSX)) + { + platform = OSPlatform.OSX; + suffix = ".dylib"; + if (System.Runtime.Intrinsics.Arm.ArmBase.Arm64.IsSupported) + { + prefix = "runtimes/osx-arm64/native/"; + } + else + { + prefix = "runtimes/osx-x64/native/"; + } + } + else + { + throw new RuntimeError($"Your system plarform is not supported, please open an issue in LLamaSharp."); + } + Log($"Detected OS Platform: {platform}", LogLevel.Information); + + List result = new(); + if (configuration.UseCuda && (platform == OSPlatform.Windows || platform == OSPlatform.Linux)) // no cuda on macos + { + int cudaVersion = GetCudaMajorVersion(); + + // TODO: load cuda library with avx + if (cudaVersion == -1 && !configuration.AllowFallback) + { + // if check skipped, we just try to load cuda libraries one by one. + if (configuration.SkipCheck) + { + result.Add($"{prefix}cuda12/{libraryName}{suffix}"); + result.Add($"{prefix}cuda11/{libraryName}{suffix}"); + } + else + { + throw new RuntimeError("Configured to load a cuda library but no cuda detected on your device."); + } + } + else if (cudaVersion == 11) + { + Log($"Detected cuda major version {cudaVersion}.", LogLevel.Information); + result.Add($"{prefix}cuda11/{libraryName}{suffix}"); + } + else if (cudaVersion == 12) + { + Log($"Detected cuda major version {cudaVersion}.", LogLevel.Information); + result.Add($"{prefix}cuda12/{libraryName}{suffix}"); + } + else if (cudaVersion > 0) + { + throw new RuntimeError($"Cuda version {cudaVersion} hasn't been supported by LLamaSharp, please open an issue for it."); + } + // otherwise no cuda detected but allow fallback + } + + // use cpu (or mac possibly with metal) + if (!configuration.AllowFallback && platform != OSPlatform.OSX) + { + result.Add(GetAvxLibraryPath(configuration.AvxLevel, prefix, suffix)); + } + else if (platform != OSPlatform.OSX) // in macos there's absolutely no avx + { +#if NET8_0_OR_GREATER + if (configuration.AvxLevel == NativeLibraryConfig.AvxLevel.Avx512) + { + result.Add(GetAvxLibraryPath(NativeLibraryConfig.AvxLevel.Avx512, prefix, suffix))); + result.Add(GetAvxLibraryPath(NativeLibraryConfig.AvxLevel.Avx2, prefix, suffix))); + result.Add(GetAvxLibraryPath(NativeLibraryConfig.AvxLevel.Avx, prefix, suffix))); + } + else +#endif + if (configuration.AvxLevel == NativeLibraryConfig.AvxLevel.Avx2) + { + result.Add(GetAvxLibraryPath(NativeLibraryConfig.AvxLevel.Avx2, prefix, suffix)); + result.Add(GetAvxLibraryPath(NativeLibraryConfig.AvxLevel.Avx, prefix, suffix)); + } + else if (configuration.AvxLevel == NativeLibraryConfig.AvxLevel.Avx) + { + result.Add(GetAvxLibraryPath(NativeLibraryConfig.AvxLevel.Avx, prefix, suffix)); + } + result.Add(GetAvxLibraryPath(NativeLibraryConfig.AvxLevel.None, prefix, suffix)); + } + + if (platform == OSPlatform.OSX) + { + result.Add($"{prefix}{libraryName}{suffix}"); + } + + return result; + } +#endif + + /// + /// Try to load libllama, using CPU feature detection to try and load a more specialised DLL if possible + /// + /// The library handle to unload later, or IntPtr.Zero if no library was loaded + private static IntPtr TryLoadLibrary() + { +#if NET6_0_OR_GREATER + var configuration = NativeLibraryConfig.CheckAndGatherDescription(); + enableLogging = configuration.Logging; + // We move the flag to avoid loading library when the variable is called else where. + NativeLibraryConfig.LibraryHasLoaded = true; + + if (!string.IsNullOrEmpty(configuration.Path)) + { + // When loading the user specified library, there's no fallback. 
+ var success = NativeLibrary.TryLoad(configuration.Path, out var result); + if (!success) + { + throw new RuntimeError($"Failed to load the native library [{configuration.Path}] you specified."); + } + Log($"Successfully loaded the library [{configuration.Path}] specified by user", LogLevel.Information); + return result; + } + + var libraryTryLoadOrder = GetLibraryTryOrder(configuration); + + string[] possiblePathPrefix = new string[] { + System.AppDomain.CurrentDomain.BaseDirectory, + Path.GetDirectoryName(System.Reflection.Assembly.GetExecutingAssembly().Location) ?? "" + }; + + var tryFindPath = (string filename) => + { + int i = 0; + while (!File.Exists(filename)) + { + if (i < possiblePathPrefix.Length) + { + filename = Path.Combine(possiblePathPrefix[i], filename); + i++; + } + else + { + break; + } + } + return filename; + }; + + foreach (var libraryPath in libraryTryLoadOrder) + { + var fullPath = tryFindPath(libraryPath); + var result = TryLoad(fullPath, true); + if (result is not null && result != IntPtr.Zero) + { + Log($"{fullPath} is selected and loaded successfully.", LogLevel.Information); + return result ?? IntPtr.Zero; + } + else + { + Log($"Tried to load {fullPath} but failed.", LogLevel.Information); + } + } + + if (!configuration.AllowFallback) + { + throw new RuntimeError("Failed to load the library that match your rule, please" + + " 1) check your rule." + + " 2) try to allow fallback." + + " 3) or open an issue if it's expected to be successful."); + } +#endif + + Log($"No library was loaded before calling native apis. " + + $"This is not an error under netstandard2.0 but needs attention with net6 or higher.", LogLevel.Warning); + return IntPtr.Zero; + +#if NET6_0_OR_GREATER + // Try to load a DLL from the path if supported. Returns null if nothing is loaded. + static IntPtr? TryLoad(string path, bool supported = true) + { + if (!supported) + return null; + + if (NativeLibrary.TryLoad(path, out var handle)) + return handle; + + return null; + } +#endif + } + + private const string libraryName = "libllama"; + private const string cudaVersionFile = "version.json"; + private const string loggingPrefix = "[LLamaSharp Native]"; + private static bool enableLogging = false; + } +} diff --git a/LLama/Native/NativeApi.cs b/LLama/Native/NativeApi.cs index fc4086783..074a8e9fd 100644 --- a/LLama/Native/NativeApi.cs +++ b/LLama/Native/NativeApi.cs @@ -2,7 +2,6 @@ using System.Buffers; using System.Runtime.InteropServices; using System.Text; -using LLama.Exceptions; #pragma warning disable IDE1006 // Naming Styles @@ -22,88 +21,6 @@ namespace LLama.Native /// public unsafe partial class NativeApi { - static NativeApi() - { - // Try to load a preferred library, based on CPU feature detection - TryLoadLibrary(); - - try - { - llama_empty_call(); - } - catch (DllNotFoundException) - { - throw new RuntimeError("The native library cannot be found. It could be one of the following reasons: \n" + - "1. No LLamaSharp backend was installed. Please search LLamaSharp.Backend and install one of them. \n" + - "2. You are using a device with only CPU but installed cuda backend. Please install cpu backend instead. \n" + - "3. The backend is not compatible with your system cuda environment. Please check and fix it. If the environment is " + - "expected not to be changed, then consider build llama.cpp from source or submit an issue to LLamaSharp.\n" + - "4. 
One of the dependency of the native library is missed.\n"); - } - llama_backend_init(false); - } - - /// - /// Try to load libllama, using CPU feature detection to try and load a more specialised DLL if possible - /// - /// The library handle to unload later, or IntPtr.Zero if no library was loaded - private static IntPtr TryLoadLibrary() - { -#if NET6_0_OR_GREATER - - if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows)) - { - // All of the Windows libraries, in order of preference - return TryLoad("cu12.1.0/libllama.dll") - ?? TryLoad("cu11.7.1/libllama.dll") -#if NET8_0_OR_GREATER - ?? TryLoad("avx512/libllama.dll", System.Runtime.Intrinsics.X86.Avx512.IsSupported) -#endif - ?? TryLoad("avx2/libllama.dll", System.Runtime.Intrinsics.X86.Avx2.IsSupported) - ?? TryLoad("avx/libllama.dll", System.Runtime.Intrinsics.X86.Avx.IsSupported) - ?? IntPtr.Zero; - } - - if (RuntimeInformation.IsOSPlatform(OSPlatform.Linux)) - { - // All of the Linux libraries, in order of preference - return TryLoad("cu12.1.0/libllama.so") - ?? TryLoad("cu11.7.1/libllama.so") -#if NET8_0_OR_GREATER - ?? TryLoad("avx512/libllama.so", System.Runtime.Intrinsics.X86.Avx512.IsSupported) -#endif - ?? TryLoad("avx2/libllama.so", System.Runtime.Intrinsics.X86.Avx2.IsSupported) - ?? TryLoad("avx/libllama.so", System.Runtime.Intrinsics.X86.Avx.IsSupported) - ?? IntPtr.Zero; - } - - if (RuntimeInformation.IsOSPlatform(OSPlatform.OSX)) - { - return TryLoad("runtimes/osx-arm64/libllama.dylib", System.Runtime.Intrinsics.Arm.ArmBase.Arm64.IsSupported) - ?? TryLoad("runtimes/osx-x64/libllama.dylib") - ?? IntPtr.Zero; - } -#endif - - return IntPtr.Zero; - -#if NET6_0_OR_GREATER - // Try to load a DLL from the path if supported. Returns null if nothing is loaded. - static IntPtr? TryLoad(string path, bool supported = true) - { - if (!supported) - return null; - - if (NativeLibrary.TryLoad(path, out var handle)) - return handle; - - return null; - } -#endif - } - - private const string libraryName = "libllama"; - /// /// A method that does nothing. This is a native method, calling it will force the llama native dependencies to be loaded. /// diff --git a/LLama/Native/NativeLibraryConfig.cs b/LLama/Native/NativeLibraryConfig.cs new file mode 100644 index 000000000..7af445e6d --- /dev/null +++ b/LLama/Native/NativeLibraryConfig.cs @@ -0,0 +1,201 @@ +using System; + +namespace LLama.Native +{ +#if NET6_0_OR_GREATER + /// + /// A class about configurations when loading native libraries. + /// Note that it could be configured only once before any call to llama model apis. + /// + public class NativeLibraryConfig + { + private static NativeLibraryConfig? instance; + private static readonly object lockObject = new object(); + public static NativeLibraryConfig Default + { + get + { + return GetInstance(); + } + } + + /// + /// Whether there's already a config for native library. + /// + public static bool LibraryHasLoaded { get; internal set; } = false; + + private string _libraryPath; + private bool _useCuda; + private AvxLevel _avxLevel; + private bool _allowFallback; + private bool _skipCheck; + private bool _logging; + + internal static NativeLibraryConfig GetInstance() + { + if (instance is null) + { + lock (lockObject) + { + if (instance is null) + { + instance = new NativeLibraryConfig(); + } + } + } + return instance; + } + + /// + /// Load a specified native library as backend for LLamaSharp. + /// When this method is called, all the other configurations will be ignored. 
+ /// + /// + /// + public NativeLibraryConfig WithLibrary(string libraryPath) + { + if (LibraryHasLoaded) + { + throw new InvalidOperationException("NativeLibraryConfig could be configured only once before any call to llama model apis."); + } + _libraryPath = libraryPath; + return this; + } + + /// + /// Configure whether to use cuda backend if possible. + /// + /// + /// + /// + public NativeLibraryConfig WithCuda(bool enable = true) + { + if (LibraryHasLoaded) + { + throw new InvalidOperationException("NativeLibraryConfig could be configured only once before any call to llama model apis."); + } + _useCuda = enable; + return this; + } + + /// + /// Configure the prefferred avx support level of the backend. + /// + /// + /// + /// + public NativeLibraryConfig WithAvx(AvxLevel level) + { + if (LibraryHasLoaded) + { + throw new InvalidOperationException("NativeLibraryConfig could be configured only once before any call to llama model apis."); + } + _avxLevel = level; + return this; + } + + /// + /// Configure whether to allow fallback when there's not match for preffered settings. + /// + /// + /// + /// + public NativeLibraryConfig WithAutoFallback(bool enable = true) + { + if (LibraryHasLoaded) + { + throw new InvalidOperationException("NativeLibraryConfig could be configured only once before any call to llama model apis."); + } + _allowFallback = enable; + return this; + } + + /// + /// Whether to skip the check when you don't allow fallback. This option + /// may be useful under some complex conditions. For example, you're sure + /// you have your cublas configured but LLamaSharp take it as invalid by mistake. + /// + /// + /// + /// + public NativeLibraryConfig SkipCheck(bool enable = true) + { + if (LibraryHasLoaded) + { + throw new InvalidOperationException("NativeLibraryConfig could be configured only once before any call to llama model apis."); + } + _skipCheck = enable; + return this; + } + + /// + /// Whether to output the logs to console when loading the native library with your configuration. 
+ /// + /// + /// + /// + public NativeLibraryConfig WithLogs(bool enable = true) + { + if (LibraryHasLoaded) + { + throw new InvalidOperationException("NativeLibraryConfig could be configured only once before any call to llama model apis."); + } + _logging = enable; + return this; + } + + internal static Description CheckAndGatherDescription() + { + if (Default._allowFallback && Default._skipCheck) + { + throw new ArgumentException("Cannot skip the check when fallback is allowed."); + } + return new Description(Default._libraryPath, Default._useCuda, Default._avxLevel, Default._allowFallback, Default._skipCheck, Default._logging); + } + + internal static string AvxLevelToString(AvxLevel level) + { + return level switch + { + AvxLevel.None => string.Empty, + AvxLevel.Avx => "avx", + AvxLevel.Avx2 => "avx2", +#if NET8_0_OR_GREATER + AvxLevel.Avx512 => "avx512" +#endif + _ => throw new ArgumentException($"Cannot recognize Avx level {level}") + }; + } + + + private NativeLibraryConfig() + { + _libraryPath = string.Empty; + _useCuda = true; + _avxLevel = AvxLevel.Avx2; + _allowFallback = true; + _skipCheck = false; + _logging = false; + } + + /// + /// Avx support configuration + /// + public enum AvxLevel + { + /// + None = 0, + /// + Avx = 1, + /// + Avx2 = 2, +#if NET8_0_OR_GREATER + /// + Avx512 = 3, +#endif + } + internal record Description(string Path = "", bool UseCuda = true, AvxLevel AvxLevel = AvxLevel.Avx2, + bool AllowFallback = true, bool SkipCheck = false, bool Logging = false); + } +#endif +} diff --git a/LLama/StreamingTokenDecoder.cs b/LLama/StreamingTokenDecoder.cs index c5d9683e9..f82f8c37f 100644 --- a/LLama/StreamingTokenDecoder.cs +++ b/LLama/StreamingTokenDecoder.cs @@ -6,169 +6,170 @@ using LLama.Extensions; using LLama.Native; -namespace LLama; - -/// -/// Decodes a stream of tokens into a stream of characters -/// -public sealed class StreamingTokenDecoder +namespace LLama { - private readonly SafeLlamaModelHandle _weights; - private readonly Decoder _decoder; - - private readonly List _characters = new(); - - /// - /// The number of decoded characters waiting to be read - /// - public int AvailableCharacters => _characters.Count; - - #region constructors - /// - /// Create a new decoder - /// - /// Text encoding to use - /// Model weights - public StreamingTokenDecoder(Encoding encoding, LLamaWeights weights) - : this(encoding, weights.NativeHandle) - { - } - - /// - /// Create a new decoder - /// - /// Context to retrieve encoding and model weights from - public StreamingTokenDecoder(LLamaContext context) - : this(context.Encoding, context.NativeHandle) - { - } - /// - /// Create a new decoder + /// Decodes a stream of tokens into a stream of characters /// - /// Text encoding to use - /// Context to retrieve model weights from - public StreamingTokenDecoder(Encoding encoding, SafeLLamaContextHandle context) - : this(encoding, context.ModelHandle) + public sealed class StreamingTokenDecoder { - } - - /// - /// Create a new decoder - /// - /// Text encoding to use - /// Models weights to use - public StreamingTokenDecoder(Encoding encoding, SafeLlamaModelHandle weights) - { - _weights = weights; - _decoder = encoding.GetDecoder(); - } - #endregion - - /// - /// Add a single token to the decoder - /// - /// - public void Add(int token) - { - var charsArr = ArrayPool.Shared.Rent(16); - var bytesArr = ArrayPool.Shared.Rent(16); - try + private readonly SafeLlamaModelHandle _weights; + private readonly Decoder _decoder; + + private readonly List _characters = new(); + + 
/// + /// The number of decoded characters waiting to be read + /// + public int AvailableCharacters => _characters.Count; + + #region constructors + /// + /// Create a new decoder + /// + /// Text encoding to use + /// Model weights + public StreamingTokenDecoder(Encoding encoding, LLamaWeights weights) + : this(encoding, weights.NativeHandle) { - // Convert this token into bytes - var bytesAvailable = TokenToBytes(ref bytesArr, token, _weights).Length; - - // Convert those bytes into characters - var bytesOffset = 0; - var completed = false; - while (!completed) - { - // Decode some of the bytes into the temp char buffer. Keep doing this - // until all bytes have been consumed - _decoder.Convert( - bytesArr, bytesOffset, bytesAvailable, - charsArr, 0, charsArr.Length, - false, - out var bytesUsed, out var charsUsed, out completed - ); - bytesOffset += bytesUsed; - bytesAvailable -= bytesUsed; - - // Add the decoded characters to the output buffer - _characters.AddSpan(charsArr.AsSpan(0, charsUsed)); - } } - finally + + /// + /// Create a new decoder + /// + /// Context to retrieve encoding and model weights from + public StreamingTokenDecoder(LLamaContext context) + : this(context.Encoding, context.NativeHandle) { - ArrayPool.Shared.Return(charsArr); - ArrayPool.Shared.Return(bytesArr); } - return; + /// + /// Create a new decoder + /// + /// Text encoding to use + /// Context to retrieve model weights from + public StreamingTokenDecoder(Encoding encoding, SafeLLamaContextHandle context) + : this(encoding, context.ModelHandle) + { + } - // Converts a single token into bytes, using the `bytes` array as temporary storage. - // If the `bytes` array is too small it will get a larger one from the ArrayPool. - static Span TokenToBytes(ref byte[] bytes, int token, SafeLlamaModelHandle model) + /// + /// Create a new decoder + /// + /// Text encoding to use + /// Models weights to use + public StreamingTokenDecoder(Encoding encoding, SafeLlamaModelHandle weights) { - // Try to get bytes - var l = model.TokenToSpan(token, bytes); + _weights = weights; + _decoder = encoding.GetDecoder(); + } + #endregion - // Negative length indicates that the output was too small. Expand it to twice that size and try again. - if (l < 0) + /// + /// Add a single token to the decoder + /// + /// + public void Add(int token) + { + var charsArr = ArrayPool.Shared.Rent(16); + var bytesArr = ArrayPool.Shared.Rent(16); + try { - // Return the old array to the pool and get a new one - ArrayPool.Shared.Return(bytes); - bytes = ArrayPool.Shared.Rent(-l * 2); - - // Get bytes, this time it can't fail - l = model.TokenToSpan(token, bytes); + // Convert this token into bytes + var bytesAvailable = TokenToBytes(ref bytesArr, token, _weights).Length; + + // Convert those bytes into characters + var bytesOffset = 0; + var completed = false; + while (!completed) + { + // Decode some of the bytes into the temp char buffer. 
Keep doing this + // until all bytes have been consumed + _decoder.Convert( + bytesArr, bytesOffset, bytesAvailable, + charsArr, 0, charsArr.Length, + false, + out var bytesUsed, out var charsUsed, out completed + ); + bytesOffset += bytesUsed; + bytesAvailable -= bytesUsed; + + // Add the decoded characters to the output buffer + _characters.AddSpan(charsArr.AsSpan(0, charsUsed)); + } + } + finally + { + ArrayPool.Shared.Return(charsArr); + ArrayPool.Shared.Return(bytesArr); } - Debug.Assert(l >= 0); - return new Span(bytes, 0, l); + return; + + // Converts a single token into bytes, using the `bytes` array as temporary storage. + // If the `bytes` array is too small it will get a larger one from the ArrayPool. + static Span TokenToBytes(ref byte[] bytes, int token, SafeLlamaModelHandle model) + { + // Try to get bytes + var l = model.TokenToSpan(token, bytes); + + // Negative length indicates that the output was too small. Expand it to twice that size and try again. + if (l < 0) + { + // Return the old array to the pool and get a new one + ArrayPool.Shared.Return(bytes); + bytes = ArrayPool.Shared.Rent(-l * 2); + + // Get bytes, this time it can't fail + l = model.TokenToSpan(token, bytes); + } + + Debug.Assert(l >= 0); + return new Span(bytes, 0, l); + } } - } - /// - /// Add all tokens in the given enumerable - /// - /// - public void AddRange(IEnumerable tokens) - { - foreach (var item in tokens) - Add(item); - } + /// + /// Add all tokens in the given enumerable + /// + /// + public void AddRange(IEnumerable tokens) + { + foreach (var item in tokens) + Add(item); + } - /// - /// Read all decoded characters and clear the buffer - /// - /// - public void Read(List dest) - { - dest.AddRange(_characters); - _characters.Clear(); - } + /// + /// Read all decoded characters and clear the buffer + /// + /// + public void Read(List dest) + { + dest.AddRange(_characters); + _characters.Clear(); + } - /// - /// Read all decoded characters as a string and clear the buffer - /// - /// - public string Read() - { - if (_characters.Count == 0) - return ""; + /// + /// Read all decoded characters as a string and clear the buffer + /// + /// + public string Read() + { + if (_characters.Count == 0) + return ""; - var str = string.Join("", _characters); - _characters.Clear(); - return str; - } + var str = string.Join("", _characters); + _characters.Clear(); + return str; + } - /// - /// Set the decoder back to its initial state - /// - public void Reset() - { - _decoder.Reset(); - _characters.Clear(); + /// + /// Set the decoder back to its initial state + /// + public void Reset() + { + _decoder.Reset(); + _characters.Clear(); + } } -} \ No newline at end of file +} diff --git a/LLama/runtimes/build/LLamaSharp.Backend.Cuda11.nuspec b/LLama/runtimes/build/LLamaSharp.Backend.Cuda11.nuspec index dc296ab87..d8876f4f3 100644 --- a/LLama/runtimes/build/LLamaSharp.Backend.Cuda11.nuspec +++ b/LLama/runtimes/build/LLamaSharp.Backend.Cuda11.nuspec @@ -17,8 +17,8 @@ - - + + diff --git a/LLama/runtimes/build/LLamaSharp.Backend.Cuda12.nuspec b/LLama/runtimes/build/LLamaSharp.Backend.Cuda12.nuspec index cde9aaa63..5ffd8ef12 100644 --- a/LLama/runtimes/build/LLamaSharp.Backend.Cuda12.nuspec +++ b/LLama/runtimes/build/LLamaSharp.Backend.Cuda12.nuspec @@ -17,8 +17,8 @@ - - + + diff --git a/LLama/runtimes/build/LLamaSharpBackend.props b/LLama/runtimes/build/LLamaSharpBackend.props index 786e89056..6858ca516 100644 --- a/LLama/runtimes/build/LLamaSharpBackend.props +++ b/LLama/runtimes/build/LLamaSharpBackend.props @@ 
-7,33 +7,7 @@
- - PreserveNewest false %(Filename)%(Extension) - - PreserveNewest false %(Filename)%(Extension) - - PreserveNewest false %(Filename)%(Extension) - - - PreserveNewest false %(Filename)%(Extension) - + PreserveNewest false %(Filename)%(Extension)
diff --git a/README.md b/README.md
index 216db1249..2dde0a4a5 100644
--- a/README.md
+++ b/README.md
@@ -43,7 +43,7 @@ Then, search and install one of the following backends. (Please don't install tw
LLamaSharp.Backend.Cpu # cpu for windows, linux and mac (mac metal is also supported)
LLamaSharp.Backend.Cuda11 # cuda11 for windows and linux
LLamaSharp.Backend.Cuda12 # cuda12 for windows and linux
-LLamaSharp.Backend.MacMetal # special for using mac metal
+LLamaSharp.Backend.MacMetal # Removed after v0.8.0; metal support is now included in the cpu backend
```
We publish these backends because they are the most popular ones. If none of them matches, please compile the [llama.cpp](https://github.com/ggerganov/llama.cpp) yourself. In this case, please **DO NOT** install the backend packages, instead, add your DLL to your project and ensure it will be copied to the output directory when compiling your project. For more informations please refer to ([this guide](https://scisharp.github.io/LLamaSharp/0.5/ContributingGuide/)).
@@ -88,8 +88,8 @@ Many hands make light work. If you have found any other model resource that coul
## FAQ

1. GPU out of memory: Please try setting `n_gpu_layers` to a smaller number.
-2. Unsupported model: `llama.cpp` is under quick development and often has break changes. Please check the release date of the model and find a suitable version of LLamaSharp to install, or use the model we provide [on huggingface](https://huggingface.co/AsakusaRinne/LLamaSharpSamples).
-3. Cannot find backend package: 1) ensure you installed one of them. 2) check if there's a `libllama.dll` under your output path. 3) check if your system supports avx2, which is the default settings of official runtimes now. If not, please compile llama.cpp yourself.
+2. Unsupported model: `llama.cpp` is under rapid development and often has breaking changes. Please check the release date of the model and find a suitable version of LLamaSharp to install, or generate `gguf` format weights from the original weights yourself.
+3. Cannot load native library: 1) ensure you installed one of the backend packages. 2) run `NativeLibraryConfig.WithLogs()` at the very beginning of your code to print more information. 3) check if your system supports avx2, which is the default setting of official runtimes now. If not, please compile llama.cpp yourself and specify it with `NativeLibraryConfig.WithLibrary`.
@@ -168,13 +168,13 @@ Since we are in short of hands, if you're familiar with ASP.NET core, we'll appr

![demo-console](Assets/console_demo.gif)

-## How to Get a Model
+## How to Find a Model

-Model in format `gguf` is valid for LLamaSharp (and `ggml` before v0.5.1). One option is to search `LLama` and `gguf` in [huggingface](https://huggingface.co/) to find a model.
+Models in format `gguf` are valid for LLamaSharp (and `ggml` before v0.5.1). If you're new to LLM/LLaMA, it's a good choice to search `LLama` and `gguf` on [huggingface](https://huggingface.co/) to find a model.
-Another choice is generate gguf format file yourself with a pytorch weight (or any other), pleae refer to [convert.py](https://github.com/ggerganov/llama.cpp/blob/master/convert.py) and [convert-llama-ggml-to-gguf.py](https://github.com/ggerganov/llama.cpp/blob/master/convert-llama-ggml-to-gguf.py) to get gguf file though a ggml transform.
+Another choice is to generate a gguf format file yourself from a pytorch weight (or any other format); please refer to [convert.py](https://github.com/ggerganov/llama.cpp/blob/master/convert.py) and [convert-llama-ggml-to-gguf.py](https://github.com/ggerganov/llama.cpp/blob/master/convert-llama-ggml-to-gguf.py) to get a gguf file through a ggml transformation.

-## Roadmap
+## Features

---
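A minimal usage sketch of the new `NativeLibraryConfig` loader API introduced in this patch (it only takes effect on net6.0 or newer, where the runtime-selection code is compiled in). The library path in the commented line is a placeholder for illustration, not a file shipped by the backend packages:

```csharp
using LLama.Native;

// Configure the native loader once, before any other LLamaSharp call (net6.0 or newer).
// Prefer a CUDA build when a compatible CUDA runtime is detected, otherwise fall back
// to the AVX2 CPU build, and print the loading log to the console.
NativeLibraryConfig.Default
    .WithCuda()
    .WithAvx(NativeLibraryConfig.AvxLevel.Avx2)
    .WithAutoFallback()
    .WithLogs();

// Alternatively, point LLamaSharp at a self-compiled llama.cpp binary.
// The path below is a placeholder for illustration only.
// NativeLibraryConfig.Default.WithLibrary("path/to/libllama.so");

// The first native call (or model load) triggers library selection with the rules above.
NativeApi.llama_empty_call();
```

Note that, as implemented here, `SkipCheck(true)` cannot be combined with fallback: `CheckAndGatherDescription` throws when both `AllowFallback` and `SkipCheck` are set, so call `WithAutoFallback(false)` before skipping the CUDA check.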