Merge pull request #258 from SignalRT/RuntimeDetection
Runtime detection MacOS
AsakusaRinne authored Nov 12, 2023
2 parents c2be012 + 0a2b0ab commit ed479d1
Showing 26 changed files with 213 additions and 147 deletions.
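
The substance of this merge is per-architecture native binaries for macOS: an osx-arm64 build (Metal-enabled, shipped with ggml-metal.metal) and an osx-x64 build (Metal off), laid out under `runtimes/<rid>/` so the right `libllama.dylib` can be chosen at load time. As a rough sketch of the idea — this is an assumption for illustration, not LLamaSharp's actual loader code — runtime selection might look like:

```csharp
using System;
using System.Runtime.InteropServices;

// Illustrative sketch only: pick the runtimes/<rid> folder that matches the
// current macOS architecture. The folder names mirror the ones this commit
// creates; the selection logic is an assumption, not LLamaSharp's loader.
static string GetOsxRuntimeFolder() =>
    RuntimeInformation.OSArchitecture == Architecture.Arm64
        ? "runtimes/osx-arm64"   // Apple Silicon: Metal build plus ggml-metal.metal
        : "runtimes/osx-x64";    // Intel: built with -DLLAMA_METAL=OFF

if (RuntimeInformation.IsOSPlatform(OSPlatform.OSX))
    Console.WriteLine($"Native library folder: {GetOsxRuntimeFolder()}");
```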
10 changes: 8 additions & 2 deletions .github/prepare_release.sh
@@ -22,13 +22,19 @@ fi
 
 mkdir ./temp;
 mkdir ./temp/runtimes;
-cp ./LLama/runtimes/*.* ./temp/runtimes/;
+# For sure it could be done better but cp -R did not work on osx
+mkdir ./temp/runtimes/osx-arm64
+mkdir ./temp/runtimes/osx-x64
+cp ./LLama/runtimes/*.* ./temp/runtimes/;
+cp ./LLama/runtimes/osx-arm64/*.* ./temp/runtimes/osx-arm64/;
+cp ./LLama/runtimes/osx-x64/*.* ./temp/runtimes/osx-x64;
 cp ./LLama/runtimes/build/*.* ./temp/;
 
 # get the current version
 cd temp;
 dotnet add package LLamaSharp;
 version=$(dotnet list temp.csproj package | grep LLamaSharp);
+# TODO: This didn´t work on osx...we need a solution
 read -ra arr <<< "$version"
 version="${arr[-1]}"
 echo "The latest version: $version";
@@ -71,7 +77,7 @@ cd temp
 nuget pack LLamaSharp.Backend.Cpu.nuspec -version $updated_version
 nuget pack LLamaSharp.Backend.Cuda11.nuspec -version $updated_version
 nuget pack LLamaSharp.Backend.Cuda12.nuspec -version $updated_version
-nuget pack LLamaSharp.Backend.MacMetal.nuspec -version $updated_version
+
 
 cd ..
 exit 0
20 changes: 13 additions & 7 deletions .github/workflows/compile.yml
@@ -6,9 +6,9 @@ on:
     cublas:
       type: boolean
       description: Build CUBLAS binaries
-    macos:
+    osx:
       type: boolean
-      description: Build MacOS binaries
+      description: Build OSX binaries
   push:
     branches: [cron_job]
   #schedule:
@@ -145,8 +145,10 @@ jobs:
       fail-fast: true
       matrix:
         include:
-          - build: 'metal'
+          - build: 'arm64'
            defines: '-DCMAKE_OSX_ARCHITECTURES=arm64'
+          - build: 'x64'
+            defines: '-DLLAMA_METAL=OFF -DCMAKE_OSX_ARCHITECTURES=x86_64'
     runs-on: macos-latest
     steps:
       - uses: actions/checkout@v3
@@ -167,7 +169,7 @@
         uses: actions/upload-artifact@v3
         with:
           path: ./build/libllama.dylib
-          name: llama-bin-macos-${{ matrix.build }}.dylib
+          name: llama-bin-osx-${{ matrix.build }}.dylib
       - name: Upload Metal
         uses: actions/upload-artifact@v3
         with:
@@ -210,9 +212,13 @@ jobs:
       - name: Rearrange MacOS files
         if: ${{ github.event.inputs.macos }}
         run: |
-          mkdir deps/macos-metal
-          cp artifacts/llama-bin-macos-metal.dylib/libllama.dylib deps/macos-metal/libllama.dylib
-          cp artifacts/ggml-metal.metal/ggml-metal.metal deps/macos-metal/ggml-metal.metal
+          mkdir deps/osx-arm64
+          mkdir deps/osx-x64
+          cp artifacts/llama-bin-osx-arm64.dylib/libllama.dylib deps/osx-arm64/libllama.dylib
+          cp artifacts/ggml-metal.metal/ggml-metal.metal deps/osx-arm64/ggml-metal.metal
+          cp artifacts/llama-bin-osx-x64.dylib/libllama.dylib deps/osx-x64/libllama.dylib
       - name: Rearrange CUDA files
         if: ${{ github.event.inputs.cublas }}
8 changes: 4 additions & 4 deletions .github/workflows/main.yml
@@ -12,14 +12,14 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        build: [linux-release, windows-release]
+        build: [linux-release, windows-release, osx-release]
         include:
           - build: linux-release
             os: ubuntu-latest
             config: release
-          # - build: macos-release
-          #   os: macos-latest
-          #   config: release
+          - build: osx-release
+            os: macos-latest
+            config: release
           - build: windows-release
             os: windows-2019
             config: release
3 changes: 1 addition & 2 deletions LLama.Examples/NewVersion/GetEmbeddings.cs
@@ -4,7 +4,7 @@ namespace LLama.Examples.NewVersion
 {
     public class GetEmbeddings
     {
-        public static Task Run()
+        public static void Run()
         {
             Console.Write("Please input your model path: ");
             var modelPath = Console.ReadLine();
@@ -23,7 +23,6 @@ public static Task Run()
                 Console.WriteLine(string.Join(", ", embedder.GetEmbeddings(text)));
                 Console.WriteLine();
             }
-            return Task.CompletedTask;
         }
     }
 }
4 changes: 1 addition & 3 deletions LLama.Examples/NewVersion/QuantizeModel.cs
@@ -2,7 +2,7 @@
 {
     public class QuantizeModel
     {
-        public static Task Run()
+        public static void Run()
         {
             Console.Write("Please input your original model path: ");
             var inputPath = Console.ReadLine();
@@ -21,8 +21,6 @@ public static Task Run()
             {
                 Console.WriteLine("Quantization failed!");
             }
-
-            return Task.CompletedTask;
         }
     }
 }
135 changes: 95 additions & 40 deletions LLama.Examples/NewVersion/TestRunner.cs
@@ -1,54 +1,109 @@
 using System.Linq.Expressions;
-using Spectre.Console;
 
-namespace LLama.Examples.NewVersion
+namespace LLama.Examples.NewVersion
 {
     public class NewVersionTestRunner
     {
-        static Dictionary<string, Func<Task>> Examples = new Dictionary<string, Func<Task>>
-        {
-            {"Run a chat session without stripping the role names.", () => ChatSessionWithRoleName.Run()},
-            {"Run a chat session with the role names stripped.",()=> ChatSessionStripRoleName.Run()},
-            {"Interactive mode chat by using executor.",()=> InteractiveModeExecute.Run()},
-            {"Instruct mode chat by using executor.",()=> InstructModeExecute.Run()},
-            {"Stateless mode chat by using executor.",()=> StatelessModeExecute.Run()},
-            {"Load and save chat session.",()=> SaveAndLoadSession.Run()},
-            {"Load and save state of model and executor.",()=> LoadAndSaveState.Run()},
-            {"Get embeddings from LLama model.",()=> GetEmbeddings.Run()},
-            {"Quantize the model.",()=> QuantizeModel.Run()},
-            {"Automatic conversation.",()=> TalkToYourself.Run()},
-            {"Constrain response to json format using grammar.",()=> GrammarJsonResponse.Run()},
-            {"Semantic Kernel Prompt.",()=> SemanticKernelPrompt.Run()},
-            {"Semantic Kernel Chat.",()=> SemanticKernelChat.Run()},
-            {"Semantic Kernel Memory.",()=> SemanticKernelMemory.Run()},
-            {"Coding Assistant.",()=> CodingAssistant.Run()},
-            {"Batch Decoding.",()=> BatchedDecoding.Run()},
-            {"SK Kernel Memory.",()=> KernelMemory.Run()},
-            {"Exit", ()=> Task.CompletedTask}
-        };
         public static async Task Run()
         {
-            AnsiConsole.Write(new Rule("LLamaSharp Examples"));
+            Console.WriteLine("================LLamaSharp Examples (New Version)==================\n");
 
+            Console.WriteLine("Please input a number to choose an example to run:");
+            Console.WriteLine("0: Run a chat session without stripping the role names.");
+            Console.WriteLine("1: Run a chat session with the role names stripped.");
+            Console.WriteLine("2: Interactive mode chat by using executor.");
+            Console.WriteLine("3: Instruct mode chat by using executor.");
+            Console.WriteLine("4: Stateless mode chat by using executor.");
+            Console.WriteLine("5: Load and save chat session.");
+            Console.WriteLine("6: Load and save state of model and executor.");
+            Console.WriteLine("7: Get embeddings from LLama model.");
+            Console.WriteLine("8: Quantize the model.");
+            Console.WriteLine("9: Automatic conversation.");
+            Console.WriteLine("10: Constrain response to json format using grammar.");
+            Console.WriteLine("11: Semantic Kernel Prompt.");
+            Console.WriteLine("12: Semantic Kernel Chat.");
+            Console.WriteLine("13: Semantic Kernel Memory.");
+            Console.WriteLine("14: Coding Assistant.");
+            Console.WriteLine("15: Batch Decoding.");
+            Console.WriteLine("16: SK Kernel Memory.");
 
             while (true)
             {
-                var choice = AnsiConsole.Prompt(
-                    new SelectionPrompt<string>()
-                        .Title("Please choose[green] an example[/] to run: ")
-                        .AddChoices(Examples.Keys));
+                Console.Write("\nYour choice: ");
+                int choice = int.Parse(Console.ReadLine());
 
+
-                if (Examples.TryGetValue(choice, out var example))
+                if (choice == 0)
                 {
-                    if (choice == "Exit")
-                    {
-                        break;
-                    }
-                    AnsiConsole.Write(new Rule(choice));
-                    await example();
+                    await ChatSessionWithRoleName.Run();
                 }
-
-                AnsiConsole.Clear();
+                else if (choice == 1)
+                {
+                    await ChatSessionStripRoleName.Run();
+                }
+                else if (choice == 2)
+                {
+                    await InteractiveModeExecute.Run();
+                }
+                else if (choice == 3)
+                {
+                    await InstructModeExecute.Run();
+                }
+                else if (choice == 4)
+                {
+                    await StatelessModeExecute.Run();
+                }
+                else if (choice == 5)
+                {
+                    await SaveAndLoadSession.Run();
+                }
+                else if (choice == 6)
+                {
+                    await LoadAndSaveState.Run();
+                }
+                else if (choice == 7)
+                {
+                    GetEmbeddings.Run();
+                }
+                else if (choice == 8)
+                {
+                    QuantizeModel.Run();
+                }
+                else if (choice == 9)
+                {
+                    await TalkToYourself.Run();
+                }
+                else if (choice == 10)
+                {
+                    await GrammarJsonResponse.Run();
+                }
+                else if (choice == 11)
+                {
+                    await SemanticKernelPrompt.Run();
+                }
+                else if (choice == 12)
+                {
+                    await SemanticKernelChat.Run();
+                }
+                else if (choice == 13)
+                {
+                    await SemanticKernelMemory.Run();
+                }
+                else if (choice == 14)
+                {
+                    await CodingAssistant.Run();
+                }
+                else if (choice == 15)
+                {
+                    await BatchedDecoding.Run();
+                }
+                else if (choice == 16)
+                {
+                    await KernelMemory.Run();
+                }
+                else
+                {
+                    Console.WriteLine("Cannot parse your choice. Please select again.");
+                    continue;
+                }
+                break;
             }
         }
     }
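
The two variants differ in dispatch style: the removed one is table-driven (a `Dictionary<string, Func<Task>>` fed to Spectre.Console's `SelectionPrompt`), while the added one parses an integer and walks an `if`/`else` chain. As a minimal sketch — with hypothetical stub methods standing in for the real example classes — the now-synchronous examples (`GetEmbeddings.Run`, `QuantizeModel.Run`) can still fit the table-driven style by wrapping them:

```csharp
using System;
using System.Collections.Generic;
using System.Threading.Tasks;

// Stub methods standing in for the real classes under LLama.Examples.NewVersion.
static Task ChatSessionRun() => Task.CompletedTask;        // an async example
static void GetEmbeddingsRun() { /* synchronous work */ }  // a now-void example

// Table-driven dispatch: wrapping the synchronous examples gives everything
// the same Func<Task> shape, so one loop (or menu) can run them all.
var examples = new Dictionary<string, Func<Task>>
{
    ["Chat session"] = ChatSessionRun,
    ["Get embeddings"] = () => { GetEmbeddingsRun(); return Task.CompletedTask; },
};

foreach (var (name, run) in examples)
{
    Console.WriteLine($"Running: {name}");
    await run();
}
```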
4 changes: 2 additions & 2 deletions LLama.Web/Common/ModelOptions.cs
@@ -18,9 +18,9 @@ public class ModelOptions
     public int MaxInstances { get; set; }
 
     /// <summary>
-    /// Model context size (n_ctx). Null to use value from model.
+    /// Model context size (n_ctx)
    /// </summary>
-    public uint? ContextSize { get; set; }
+    public uint ContextSize { get; set; } = 512;
 
     /// <summary>
     /// the GPU that is used for scratch and small tensors
4 changes: 2 additions & 2 deletions LLama/Abstractions/IContextParams.cs
@@ -9,9 +9,9 @@ namespace LLama.Abstractions;
 public interface IContextParams
 {
     /// <summary>
-    /// Model context size (n_ctx). Null to use value from model file.
+    /// Model context size (n_ctx)
     /// </summary>
-    uint? ContextSize { get; set; }
+    uint ContextSize { get; set; }
 
     /// <summary>
     /// batch size for prompt processing (must be >=32 to use BLAS) (n_batch)
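
Interface and implementation move in lockstep here: the nullable `ContextSize` (where `null` meant "use the model file's own `n_ctx`") is replaced by a plain `uint` defaulting to 512. A sketch of the fallback the nullable form allows, where `nCtxFromModel` is a hypothetical stand-in for the value read from the model file:

```csharp
using System;

// With uint?, null defers to the model's own context length; the
// non-nullable version instead bakes in a default of 512.
// `nCtxFromModel` is hypothetical, for illustration only.
static uint ResolveContextSize(uint? configured, uint nCtxFromModel)
    => configured ?? nCtxFromModel;

Console.WriteLine(ResolveContextSize(null, 4096));  // 4096: falls back to the model value
Console.WriteLine(ResolveContextSize(2048, 4096));  // 2048: an explicit setting wins
```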
2 changes: 1 addition & 1 deletion LLama/Common/FixedSizeQueue.cs
@@ -43,7 +43,7 @@ public FixedSizeQueue(int size)
     /// <param name="data"></param>
     public FixedSizeQueue(int size, IEnumerable<T> data)
     {
-#if NET6_0_OR_GREATER
+#if !NETSTANDARD2_0
         // Try to check the size without enumerating the entire IEnumerable. This may not be able to get the count,
         // in which case we'll have to check later
         if (data.TryGetNonEnumeratedCount(out var dataCount) && dataCount > size)
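
`Enumerable.TryGetNonEnumeratedCount` shipped in .NET 6, so assuming the project targets only `netstandard2.0` plus modern .NET, the two `#if` guards compile the same code; the point of the call is to reject oversized inputs without consuming the sequence. A small sketch of the same pattern:

```csharp
using System;
using System.Collections.Generic;
using System.Linq;

const int capacity = 10;
IEnumerable<int> data = Enumerable.Range(0, 100);

// Cheap pre-check: materialised collections (and some LINQ sources) report
// their count without being enumerated; purely lazy sequences return false
// here and have to be size-checked during enumeration instead.
if (data.TryGetNonEnumeratedCount(out var count) && count > capacity)
    Console.WriteLine($"Rejecting input up front: {count} items > capacity {capacity}");
```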