diff --git a/.github/prepare_release.sh b/.github/prepare_release.sh
index 3fc2e3937..e44099978 100755
--- a/.github/prepare_release.sh
+++ b/.github/prepare_release.sh
@@ -22,13 +22,19 @@ fi
 mkdir ./temp;
 mkdir ./temp/runtimes;
-cp ./LLama/runtimes/*.* ./temp/runtimes/;
+# For sure it could be done better but cp -R did not work on osx
+mkdir ./temp/runtimes/osx-arm64
+mkdir ./temp/runtimes/osx-x64
+cp ./LLama/runtimes/*.* ./temp/runtimes/;
+cp ./LLama/runtimes/osx-arm64/*.* ./temp/runtimes/osx-arm64/;
+cp ./LLama/runtimes/osx-x64/*.* ./temp/runtimes/osx-x64;
 cp ./LLama/runtimes/build/*.* ./temp/;
 
 # get the current version
 cd temp;
 dotnet add package LLamaSharp;
 version=$(dotnet list temp.csproj package | grep LLamaSharp);
+# TODO: This didn't work on osx... we need a solution
 read -ra arr <<< "$version"
 version="${arr[-1]}"
 echo "The latest version: $version";
@@ -71,7 +77,7 @@ cd temp
 nuget pack LLamaSharp.Backend.Cpu.nuspec -version $updated_version
 nuget pack LLamaSharp.Backend.Cuda11.nuspec -version $updated_version
 nuget pack LLamaSharp.Backend.Cuda12.nuspec -version $updated_version
-nuget pack LLamaSharp.Backend.MacMetal.nuspec -version $updated_version
+
 cd ..
 
 exit 0
diff --git a/.github/workflows/compile.yml b/.github/workflows/compile.yml
index 26de28b54..5dda197cd 100644
--- a/.github/workflows/compile.yml
+++ b/.github/workflows/compile.yml
@@ -6,9 +6,9 @@ on:
       cublas:
         type: boolean
         description: Build CUBLAS binaries
-      macos:
+      osx:
         type: boolean
-        description: Build MacOS binaries
+        description: Build OSX binaries
   push:
     branches: [cron_job]
   #schedule:
@@ -145,8 +145,10 @@ jobs:
       fail-fast: true
       matrix:
         include:
-          - build: 'metal'
+          - build: 'arm64'
            defines: '-DCMAKE_OSX_ARCHITECTURES=arm64'
+          - build: 'x64'
+            defines: '-DLLAMA_METAL=OFF -DCMAKE_OSX_ARCHITECTURES=x86_64'
     runs-on: macos-latest
     steps:
       - uses: actions/checkout@v3
@@ -167,7 +169,7 @@ jobs:
         uses: actions/upload-artifact@v3
         with:
           path: ./build/libllama.dylib
-          name: llama-bin-macos-${{ matrix.build }}.dylib
+          name: llama-bin-osx-${{ matrix.build }}.dylib
       - name: Upload Metal
         uses: actions/upload-artifact@v3
         with:
@@ -210,9 +212,13 @@ jobs:
       - name: Rearrange MacOS files
         if: ${{ github.event.inputs.macos }}
         run: |
-          mkdir deps/macos-metal
-          cp artifacts/llama-bin-macos-metal.dylib/libllama.dylib deps/macos-metal/libllama.dylib
-          cp artifacts/ggml-metal.metal/ggml-metal.metal deps/macos-metal/ggml-metal.metal
+          mkdir deps/osx-arm64
+          mkdir deps/osx-x64
+
+          cp artifacts/llama-bin-osx-arm64.dylib/libllama.dylib deps/osx-arm64/libllama.dylib
+          cp artifacts/ggml-metal.metal/ggml-metal.metal deps/osx-arm64/ggml-metal.metal
+          cp artifacts/llama-bin-osx-x64.dylib/libllama.dylib deps/osx-x64/libllama.dylib
+
 
       - name: Rearrange CUDA files
         if: ${{ github.event.inputs.cublas }}
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 1c08e6e57..eb0e936f1 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -12,14 +12,14 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        build: [linux-release, windows-release]
+        build: [linux-release, windows-release, osx-release]
         include:
           - build: linux-release
             os: ubuntu-latest
             config: release
-          # - build: macos-release
-          #   os: macos-latest
-          #   config: release
+          - build: osx-release
+            os: macos-latest
+            config: release
           - build: windows-release
             os: windows-2019
             config: release
diff --git a/LLama.Examples/NewVersion/GetEmbeddings.cs b/LLama.Examples/NewVersion/GetEmbeddings.cs
index 1e5b19be3..fe9e3ea80 100644
--- a/LLama.Examples/NewVersion/GetEmbeddings.cs
+++ b/LLama.Examples/NewVersion/GetEmbeddings.cs
@@ -4,7 +4,7 @@ namespace LLama.Examples.NewVersion
 {
     public class GetEmbeddings
     {
-        public static Task Run()
+        public static void Run()
         {
             Console.Write("Please input your model path: ");
             var modelPath = Console.ReadLine();
@@ -23,7 +23,6 @@ public static Task Run()
                 Console.WriteLine(string.Join(", ", embedder.GetEmbeddings(text)));
                 Console.WriteLine();
             }
-            return Task.CompletedTask;
         }
     }
 }
diff --git a/LLama.Examples/NewVersion/QuantizeModel.cs b/LLama.Examples/NewVersion/QuantizeModel.cs
index 456d89290..71966af8f 100644
--- a/LLama.Examples/NewVersion/QuantizeModel.cs
+++ b/LLama.Examples/NewVersion/QuantizeModel.cs
@@ -2,7 +2,7 @@
 {
     public class QuantizeModel
     {
-        public static Task Run()
+        public static void Run()
         {
             Console.Write("Please input your original model path: ");
             var inputPath = Console.ReadLine();
@@ -21,8 +21,6 @@ public static Task Run()
             {
                 Console.WriteLine("Quantization failed!");
             }
-
-            return Task.CompletedTask;
         }
     }
 }
diff --git a/LLama.Examples/NewVersion/TestRunner.cs b/LLama.Examples/NewVersion/TestRunner.cs
index c89cba305..a21a2eed4 100644
--- a/LLama.Examples/NewVersion/TestRunner.cs
+++ b/LLama.Examples/NewVersion/TestRunner.cs
@@ -1,54 +1,109 @@
-using System.Linq.Expressions;
-using Spectre.Console;
-
-namespace LLama.Examples.NewVersion
+namespace LLama.Examples.NewVersion
 {
     public class NewVersionTestRunner
     {
-        static Dictionary<string, Func<Task>> Examples = new Dictionary<string, Func<Task>>
-        {
-            {"Run a chat session without stripping the role names.", () => ChatSessionWithRoleName.Run()},
-            {"Run a chat session with the role names stripped.",()=> ChatSessionStripRoleName.Run()},
-            {"Interactive mode chat by using executor.",()=> InteractiveModeExecute.Run()},
-            {"Instruct mode chat by using executor.",()=> InstructModeExecute.Run()},
-            {"Stateless mode chat by using executor.",()=> StatelessModeExecute.Run()},
-            {"Load and save chat session.",()=> SaveAndLoadSession.Run()},
-            {"Load and save state of model and executor.",()=> LoadAndSaveState.Run()},
-            {"Get embeddings from LLama model.",()=> GetEmbeddings.Run()},
-            {"Quantize the model.",()=> QuantizeModel.Run()},
-            {"Automatic conversation.",()=> TalkToYourself.Run()},
-            {"Constrain response to json format using grammar.",()=> GrammarJsonResponse.Run()},
-            {"Semantic Kernel Prompt.",()=> SemanticKernelPrompt.Run()},
-            {"Semantic Kernel Chat.",()=> SemanticKernelChat.Run()},
-            {"Semantic Kernel Memory.",()=> SemanticKernelMemory.Run()},
-            {"Coding Assistant.",()=> CodingAssistant.Run()},
-            {"Batch Decoding.",()=> BatchedDecoding.Run()},
-            {"SK Kernel Memory.",()=> KernelMemory.Run()},
-            {"Exit", ()=> Task.CompletedTask}
-        };
         public static async Task Run()
         {
-            AnsiConsole.Write(new Rule("LLamaSharp Examples"));
+            Console.WriteLine("================LLamaSharp Examples (New Version)==================\n");
+
+            Console.WriteLine("Please input a number to choose an example to run:");
+            Console.WriteLine("0: Run a chat session without stripping the role names.");
+            Console.WriteLine("1: Run a chat session with the role names stripped.");
+            Console.WriteLine("2: Interactive mode chat by using executor.");
+            Console.WriteLine("3: Instruct mode chat by using executor.");
+            Console.WriteLine("4: Stateless mode chat by using executor.");
+            Console.WriteLine("5: Load and save chat session.");
+            Console.WriteLine("6: Load and save state of model and executor.");
+            Console.WriteLine("7: Get embeddings from LLama model.");
+            Console.WriteLine("8: Quantize the model.");
+            Console.WriteLine("9: Automatic conversation.");
conversation."); + Console.WriteLine("10: Constrain response to json format using grammar."); + Console.WriteLine("11: Semantic Kernel Prompt."); + Console.WriteLine("12: Semantic Kernel Chat."); + Console.WriteLine("13: Semantic Kernel Memory."); + Console.WriteLine("14: Coding Assistant."); + Console.WriteLine("15: Batch Decoding."); + Console.WriteLine("16: SK Kernel Memory."); while (true) { - var choice = AnsiConsole.Prompt( - new SelectionPrompt() - .Title("Please choose[green] an example[/] to run: ") - .AddChoices(Examples.Keys)); + Console.Write("\nYour choice: "); + int choice = int.Parse(Console.ReadLine()); - - if (Examples.TryGetValue(choice, out var example)) + if (choice == 0) { - if (choice == "Exit") - { - break; - } - AnsiConsole.Write(new Rule(choice)); - await example(); + await ChatSessionWithRoleName.Run(); } - - AnsiConsole.Clear(); + else if (choice == 1) + { + await ChatSessionStripRoleName.Run(); + } + else if (choice == 2) + { + await InteractiveModeExecute.Run(); + } + else if (choice == 3) + { + await InstructModeExecute.Run(); + } + else if (choice == 4) + { + await StatelessModeExecute.Run(); + } + else if (choice == 5) + { + await SaveAndLoadSession.Run(); + } + else if (choice == 6) + { + await LoadAndSaveState.Run(); + } + else if (choice == 7) + { + GetEmbeddings.Run(); + } + else if (choice == 8) + { + QuantizeModel.Run(); + } + else if (choice == 9) + { + await TalkToYourself.Run(); + } + else if (choice == 10) + { + await GrammarJsonResponse.Run(); + } + else if (choice == 11) + { + await SemanticKernelPrompt.Run(); + } + else if (choice == 12) + { + await SemanticKernelChat.Run(); + } + else if (choice == 13) + { + await SemanticKernelMemory.Run(); + } + else if (choice == 14) + { + await CodingAssistant.Run(); + } + else if (choice == 15) + { + await BatchedDecoding.Run(); + } + else if (choice == 16) + { + await KernelMemory.Run(); + } + else + { + Console.WriteLine("Cannot parse your choice. Please select again."); + continue; + } + break; } } } diff --git a/LLama.Web/Common/ModelOptions.cs b/LLama.Web/Common/ModelOptions.cs index 2f9caffdc..182ace002 100644 --- a/LLama.Web/Common/ModelOptions.cs +++ b/LLama.Web/Common/ModelOptions.cs @@ -18,9 +18,9 @@ public class ModelOptions public int MaxInstances { get; set; } /// - /// Model context size (n_ctx). Null to use value from model. + /// Model context size (n_ctx) /// - public uint? ContextSize { get; set; } + public uint ContextSize { get; set; } = 512; /// /// the GPU that is used for scratch and small tensors diff --git a/LLama/Abstractions/IContextParams.cs b/LLama/Abstractions/IContextParams.cs index d9811cdce..0f129217f 100644 --- a/LLama/Abstractions/IContextParams.cs +++ b/LLama/Abstractions/IContextParams.cs @@ -9,9 +9,9 @@ namespace LLama.Abstractions; public interface IContextParams { /// - /// Model context size (n_ctx). Null to use value from model file. + /// Model context size (n_ctx) /// - uint? ContextSize { get; set; } + uint ContextSize { get; set; } /// /// batch size for prompt processing (must be >=32 to use BLAS) (n_batch) diff --git a/LLama/Common/FixedSizeQueue.cs b/LLama/Common/FixedSizeQueue.cs index 37fb1cf51..d4577a475 100644 --- a/LLama/Common/FixedSizeQueue.cs +++ b/LLama/Common/FixedSizeQueue.cs @@ -43,7 +43,7 @@ public FixedSizeQueue(int size) /// public FixedSizeQueue(int size, IEnumerable data) { -#if NET6_0_OR_GREATER +#if !NETSTANDARD2_0 // Try to check the size without enumerating the entire IEnumerable. 
             // in which case we'll have to check later
             if (data.TryGetNonEnumeratedCount(out var dataCount) && dataCount > size)
diff --git a/LLama/Common/ModelParams.cs b/LLama/Common/ModelParams.cs
index a736ccbde..8bf59fa53 100644
--- a/LLama/Common/ModelParams.cs
+++ b/LLama/Common/ModelParams.cs
@@ -13,60 +13,92 @@ namespace LLama.Common
     public record ModelParams
         : ILLamaParams
     {
-        /// <inheritdoc />
-        public uint? ContextSize { get; set; }
-
-        /// <inheritdoc />
+        /// <summary>
+        /// Model context size (n_ctx)
+        /// </summary>
+        public uint ContextSize { get; set; } = 512;
+        /// <summary>
+        /// the GPU that is used for scratch and small tensors
+        /// </summary>
         public int MainGpu { get; set; } = 0;
-        /// <inheritdoc />
+        /// <summary>
+        /// Number of layers to run in VRAM / GPU memory (n_gpu_layers)
+        /// </summary>
         public int GpuLayerCount { get; set; } = 20;
-
-        /// <inheritdoc />
+        /// <summary>
+        /// Seed for the random number generator (seed)
+        /// </summary>
         public uint Seed { get; set; } = 0xFFFFFFFF;
-
-        /// <inheritdoc />
+        /// <summary>
+        /// Use f16 instead of f32 for memory kv (memory_f16)
+        /// </summary>
         public bool UseFp16Memory { get; set; } = true;
-
-        /// <inheritdoc />
+        /// <summary>
+        /// Use mmap for faster loads (use_mmap)
+        /// </summary>
         public bool UseMemorymap { get; set; } = true;
-
-        /// <inheritdoc />
+        /// <summary>
+        /// Use mlock to keep model in memory (use_mlock)
+        /// </summary>
         public bool UseMemoryLock { get; set; }
-
-        /// <inheritdoc />
+        /// <summary>
+        /// Compute perplexity over the prompt (perplexity)
+        /// </summary>
         public bool Perplexity { get; set; }
-
-        /// <inheritdoc />
+        /// <summary>
+        /// Model path (model)
+        /// </summary>
         public string ModelPath { get; set; }
 
-        /// <inheritdoc />
+        /// <summary>
+        /// List of LoRAs to apply
+        /// </summary>
         public AdapterCollection LoraAdapters { get; set; } = new();
 
-        /// <inheritdoc />
+        /// <summary>
+        /// base model path for the lora adapter (lora_base)
+        /// </summary>
         public string LoraBase { get; set; } = string.Empty;
 
-        /// <inheritdoc />
+        /// <summary>
+        /// Number of threads (null = autodetect) (n_threads)
+        /// </summary>
         public uint? Threads { get; set; }
 
-        /// <inheritdoc />
+        /// <summary>
+        /// Number of threads to use for batch processing (null = autodetect) (n_threads)
+        /// </summary>
         public uint? BatchThreads { get; set; }
 
-        /// <inheritdoc />
+        /// <summary>
+        /// batch size for prompt processing (must be >=32 to use BLAS) (n_batch)
+        /// </summary>
         public uint BatchSize { get; set; } = 512;
 
-        /// <inheritdoc />
+        /// <summary>
+        /// Whether to use embedding mode. (embedding) Note that if this is set to true,
+        /// The LLamaModel won't produce text response anymore.
+        /// </summary>
         public bool EmbeddingMode { get; set; }
 
-        /// <inheritdoc />
+        /// <summary>
+        /// how split tensors should be distributed across GPUs.
+        /// </summary>
+        /// "[ 3, 2 ]" will assign 60% of the data to GPU 0 and 40% to GPU 1.
         [JsonConverter(typeof(TensorSplitsCollectionConverter))]
         public TensorSplitsCollection TensorSplits { get; set; } = new();
 
-        /// <inheritdoc />
-        public float? RopeFrequencyBase { get; set; }
+        /// <summary>
+        /// RoPE base frequency
+        /// </summary>
+        public float? RopeFrequencyBase { get; set; }
+
+        /// <summary>
+        /// RoPE frequency scaling factor
+        /// </summary>
+        public float? RopeFrequencyScale { get; set; }
-
-        /// <inheritdoc />
-        public float? RopeFrequencyScale { get; set; }
         /// <inheritdoc />
         public float? YarnExtrapolationFactor { get; set; }
@@ -91,10 +123,15 @@ public record ModelParams
 
         /// <inheritdoc />
         public bool MulMatQ { get; set; }
 
-        /// <inheritdoc />
+
+        /// <summary>
+        /// Load vocab only (no weights)
+        /// </summary>
         public bool VocabOnly { get; set; }
 
-        /// <inheritdoc />
+        /// <summary>
+        /// The encoding to use to convert text for the model
+        /// </summary>
         [JsonConverter(typeof(EncodingConverter))]
         public Encoding Encoding { get; set; } = Encoding.UTF8;
diff --git a/LLama/Extensions/DictionaryExtensions.cs b/LLama/Extensions/DictionaryExtensions.cs
index 1af0e9e1f..a39ed7e8b 100644
--- a/LLama/Extensions/DictionaryExtensions.cs
+++ b/LLama/Extensions/DictionaryExtensions.cs
@@ -9,8 +9,6 @@ public static TValue GetValueOrDefault<TKey, TValue>(this IReadOnlyDictionary<TKey, TValue> dictionary, TKey key, TValue defaultValue)
         {
             return GetValueOrDefaultImpl(dictionary, key, defaultValue);
         }
-#elif !NET6_0_OR_GREATER && !NETSTANDARD2_1_OR_GREATER
-#error Target framework not supported!
 #endif
 
         internal static TValue GetValueOrDefaultImpl<TKey, TValue>(IReadOnlyDictionary<TKey, TValue> dictionary, TKey key, TValue defaultValue)
diff --git a/LLama/Extensions/EncodingExtensions.cs b/LLama/Extensions/EncodingExtensions.cs
index 5005b16c1..e88d83a70 100644
--- a/LLama/Extensions/EncodingExtensions.cs
+++ b/LLama/Extensions/EncodingExtensions.cs
@@ -15,8 +15,6 @@ public static int GetCharCount(this Encoding encoding, ReadOnlySpan<byte> bytes)
     {
         return GetCharCountImpl(encoding, bytes);
     }
-#elif !NET6_0_OR_GREATER && !NETSTANDARD2_1_OR_GREATER
-#error Target framework not supported!
 #endif
 
     internal static int GetCharsImpl(Encoding encoding, ReadOnlySpan<byte> bytes, Span<char> output)
diff --git a/LLama/Extensions/IContextParamsExtensions.cs b/LLama/Extensions/IContextParamsExtensions.cs
index bb029c162..16716b531 100644
--- a/LLama/Extensions/IContextParamsExtensions.cs
+++ b/LLama/Extensions/IContextParamsExtensions.cs
@@ -21,7 +21,7 @@ public static class IContextParamsExtensions
         public static void ToLlamaContextParams(this IContextParams @params, out LLamaContextParams result)
         {
             result = NativeApi.llama_context_default_params();
-            result.n_ctx = @params.ContextSize ?? 0;
+            result.n_ctx = @params.ContextSize;
             result.n_batch = @params.BatchSize;
             result.seed = @params.Seed;
             result.f16_kv = @params.UseFp16Memory;
diff --git a/LLama/Extensions/IEnumerableExtensions.cs b/LLama/Extensions/IEnumerableExtensions.cs
index 17428d297..9e01feb85 100644
--- a/LLama/Extensions/IEnumerableExtensions.cs
+++ b/LLama/Extensions/IEnumerableExtensions.cs
@@ -10,8 +10,6 @@ public static IEnumerable<T> TakeLast<T>(this IEnumerable<T> source, int count)
     {
         return TakeLastImpl(source, count);
     }
-#elif !NET6_0_OR_GREATER && !NETSTANDARD2_1_OR_GREATER
-#error Target framework not supported!
 #endif
 
     internal static IEnumerable<T> TakeLastImpl<T>(IEnumerable<T> source, int count)
diff --git a/LLama/Extensions/KeyValuePairExtensions.cs b/LLama/Extensions/KeyValuePairExtensions.cs
index 233195ed0..6e12654de 100644
--- a/LLama/Extensions/KeyValuePairExtensions.cs
+++ b/LLama/Extensions/KeyValuePairExtensions.cs
@@ -19,7 +19,5 @@ public static void Deconstruct<TKey, TValue>(this System.Collections.Generic.KeyValuePair<TKey, TValue> pair, out TKey first, out TValue second)
         first = pair.Key;
         second = pair.Value;
     }
-#elif !NET6_0_OR_GREATER && !NETSTANDARD2_1_OR_GREATER
-#error Target framework not supported!
 #endif
 }
\ No newline at end of file
diff --git a/LLama/Extensions/ListExtensions.cs b/LLama/Extensions/ListExtensions.cs
index eb30a07a0..11a1d4f00 100644
--- a/LLama/Extensions/ListExtensions.cs
+++ b/LLama/Extensions/ListExtensions.cs
@@ -5,7 +5,7 @@ namespace LLama.Extensions
 {
     internal static class ListExtensions
     {
-#if !NET6_0_OR_GREATER
+#if NETSTANDARD2_0
         public static void EnsureCapacity<T>(this List<T> list, int capacity)
         {
             if (list.Capacity < capacity)
diff --git a/LLama/LLamaSharp.Runtime.targets b/LLama/LLamaSharp.Runtime.targets
index 8910f1551..c14f0ffab 100644
--- a/LLama/LLamaSharp.Runtime.targets
+++ b/LLama/LLamaSharp.Runtime.targets
@@ -27,13 +27,17 @@
       PreserveNewest
       libllama-cuda12.so
-
+
       PreserveNewest
-      libllama.dylib
+      runtimes/osx-arm64/libllama.dylib
-
+
       PreserveNewest
-      ggml-metal.metal
-
+      runtimes/osx-arm64/ggml-metal.metal
+
+
+      PreserveNewest
+      runtimes/osx-x64/libllama.dylib
+
\ No newline at end of file
diff --git a/LLama/Native/LLamaContextParams.cs b/LLama/Native/LLamaContextParams.cs
index c0f2afa29..f1ba569d1 100644
--- a/LLama/Native/LLamaContextParams.cs
+++ b/LLama/Native/LLamaContextParams.cs
@@ -22,7 +22,7 @@ public struct LLamaContextParams
         public uint seed;
 
         /// <summary>
-        /// text context, 0 = from model
+        /// text context
         /// </summary>
         public uint n_ctx;
 
diff --git a/LLama/Native/NativeApi.cs b/LLama/Native/NativeApi.cs
index e3b182bd4..fc4086783 100644
--- a/LLama/Native/NativeApi.cs
+++ b/LLama/Native/NativeApi.cs
@@ -79,7 +79,9 @@ private static IntPtr TryLoadLibrary()
 
             if (RuntimeInformation.IsOSPlatform(OSPlatform.OSX))
             {
-                return IntPtr.Zero;
+                return TryLoad("runtimes/osx-arm64/libllama.dylib", System.Runtime.Intrinsics.Arm.ArmBase.Arm64.IsSupported)
+                    ?? TryLoad("runtimes/osx-x64/libllama.dylib")
+                    ?? IntPtr.Zero;
             }
 #endif
 
diff --git a/LLama/runtimes/build/LLamaSharp.Backend.Cpu.nuspec b/LLama/runtimes/build/LLamaSharp.Backend.Cpu.nuspec
index 739eb9086..29466a1fe 100644
--- a/LLama/runtimes/build/LLamaSharp.Backend.Cpu.nuspec
+++ b/LLama/runtimes/build/LLamaSharp.Backend.Cpu.nuspec
@@ -19,10 +19,9 @@
-
-
-
-
+
+
+
diff --git a/LLama/runtimes/build/LLamaSharp.Backend.MacMetal.nuspec b/LLama/runtimes/build/LLamaSharp.Backend.MacMetal.nuspec
deleted file mode 100644
index 0ba9c508a..000000000
--- a/LLama/runtimes/build/LLamaSharp.Backend.MacMetal.nuspec
+++ /dev/null
@@ -1,26 +0,0 @@
-
-
-
-    LLamaSharp.Backend.MacMetal
-    $version$
-    LLamaSharp.Backend.MacMetal, the backend for LLamaSharp on MACOS.
-    llama.cpp Authors
-    false
-    MIT
-    icon512.png
-    https://github.com/SciSharp/LLamaSharp
-    LLamaSharp.Backend.MacMetal is a backend for LLamaSharp to use MAC with GPU support.
-
-    Copyright 2023 The llama.cpp Authors. All rights reserved.
-    LLamaSharp LLama LLM GPT AI ChatBot SciSharp
-
-
-
-
-
-
-
-
-
-
-
diff --git a/LLama/runtimes/build/LLamaSharpBackend.props b/LLama/runtimes/build/LLamaSharpBackend.props
index 7e3db26e5..786e89056 100644
--- a/LLama/runtimes/build/LLamaSharpBackend.props
+++ b/LLama/runtimes/build/LLamaSharpBackend.props
@@ -25,12 +25,6 @@
       false
       %(Filename)%(Extension)
-
-
-      PreserveNewest
-      false
-      %(Filename)%(Extension)
-
diff --git a/LLama/runtimes/ggml-metal.metal b/LLama/runtimes/osx-arm64/ggml-metal.metal
similarity index 100%
rename from LLama/runtimes/ggml-metal.metal
rename to LLama/runtimes/osx-arm64/ggml-metal.metal
diff --git a/LLama/runtimes/libllama.dylib b/LLama/runtimes/osx-arm64/libllama.dylib
similarity index 100%
rename from LLama/runtimes/libllama.dylib
rename to LLama/runtimes/osx-arm64/libllama.dylib
diff --git a/LLama/runtimes/osx-x64/libllama.dylib b/LLama/runtimes/osx-x64/libllama.dylib
new file mode 100644
index 000000000..37eb3cd43
Binary files /dev/null and b/LLama/runtimes/osx-x64/libllama.dylib differ
diff --git a/README.md b/README.md
index 74d5aee67..216db1249 100644
--- a/README.md
+++ b/README.md
@@ -11,7 +11,7 @@
 
 **The C#/.NET binding of [llama.cpp](https://github.com/ggerganov/llama.cpp). It provides higher-level APIs to inference the LLaMA Models and deploy it on
 local device with C#/.NET. It works on
-both Windows, Linux and MAC without requirment for compiling llama.cpp yourself. Even without GPU or not enough GPU memory, you can still apply LLaMA models well with this repo. 🤗**
+both Windows, Linux and MAC without requirment for compiling llama.cpp yourself. Even without GPU or not enought GPU memory, you can still apply LLaMA models well with this repo. 🤗**
 
 **Furthermore, it provides integrations with other projects such as [semantic-kernel](https://github.com/microsoft/semantic-kernel), [kernel-memory](https://github.com/microsoft/kernel-memory) and [BotSharp](https://github.com/SciSharp/BotSharp) to provide higher-level applications.**