Multiple context layers per model #76

Closed · wants to merge 39 commits

Commits (39)
- e7c1295  Move response generation from Hub to Service (saddam213, Jul 28, 2023)
- ecc8499  Merge branch 'master' into Multi_Context (saddam213, Jul 28, 2023)
- faacd26  Rename LLamaModel to LLamaModelContext (saddam213, Jul 28, 2023)
- fba0527  Separate Model and Context (saddam213, Jul 28, 2023)
- db94e4e  Add simple model cache to allow simple, multiple context (saddam213, Jul 28, 2023)
- 695b17a  Apply Lora to cached model (saddam213, Jul 28, 2023)
- a95e69b  Remove model cache, should be at a higher level (saddam213, Jul 28, 2023)
- 6cc0e3d  Move model cache to higher level, typo (saddam213, Jul 28, 2023)
- 20c7b27  Added GH action for some simple CI (martindevans, Jul 29, 2023)
- 4ddb37f  fixed capitalisation (martindevans, Jul 29, 2023)
- de0551e  Added queue fix, so that CI can pass (martindevans, Jul 29, 2023)
- c37dc85  - moved dotnet version into matrix (martindevans, Jul 29, 2023)
- 85c7537  Always installing both 6 and 7 (martindevans, Jul 29, 2023)
- 68fa27e  Cleaned up installing multiple dotnets (martindevans, Jul 29, 2023)
- 0591897  Fixed yaml syntax (martindevans, Jul 29, 2023)
- c62141b  Improved formatting of response (saddam213, Jul 30, 2023)
- 13c015b  Merge branch 'master' of https://github.com/SciSharp/LLamaSharp (saddam213, Jul 30, 2023)
- 8b42d06  - Added a folder with a 7B Llama2 model, automatically downloaded fro… (martindevans, Jul 30, 2023)
- 5bab47b  Update ModelParams (saddam213, Aug 2, 2023)
- 286bd82  Merge branch 'master' into Web_Interface_Settings (saddam213, Aug 3, 2023)
- 85b7e60  Merge branch 'master' into SplitModelContext (saddam213, Aug 3, 2023)
- 22cdd0f  Merge branch 'github_actions' of https://github.com/martindevans/LLam… (saddam213, Aug 3, 2023)
- 47f33fc  Merge branch 'master' into SplitModelContext (saddam213, Aug 3, 2023)
- 05ea664  Merge branch 'SplitModelContext' into Web_Interface_Settings (saddam213, Aug 3, 2023)
- 60f5ca5  Inject ModelCacheService (saddam213, Aug 3, 2023)
- b8bb5ae  LLamaModel to track LLamaModelContext instances (saddam213, Aug 3, 2023)
- 362812e  Async ModelCacheService (saddam213, Aug 3, 2023)
- 87fdd45  Fix incorrect docs (saddam213, Aug 3, 2023)
- c310400  Merge branch 'SplitModelContext' into Web_Interface_Settings (saddam213, Aug 3, 2023)
- fc5bd43  Create sessions with live parameters (saddam213, Aug 3, 2023)
- b94d6a6  ModelParams & InferenceParams abstractions (saddam213, Aug 3, 2023)
- ed47e37  Simplify Web interface (saddam213, Aug 4, 2023)
- 0f8c541  Update README, quick tidy up (saddam213, Aug 4, 2023)
- b3168b8  Add Web UI to README (saddam213, Aug 4, 2023)
- c3a25fc  Update CPU and CUDA_12 binaries (saddam213, Aug 4, 2023)
- 864f2f7  Merge branch 'master' of https://github.com/SciSharp/LLamaSharp (saddam213, Aug 5, 2023)
- c0235eb  Update README example to new implementation (saddam213, Aug 5, 2023)
- bca736f  Latest linux binary (saddam213, Aug 5, 2023)
- 2b5efa0  Manually marshal tensor_split for cross platform support (saddam213, Aug 5, 2023)
55 changes: 55 additions & 0 deletions .github/workflows/main.yml
@@ -0,0 +1,55 @@
name: CI
on:
  push:
    branches: [master]
  pull_request:
    branches: [master]

jobs:
  build:
    name: Test
    runs-on: ${{ matrix.os }}
    strategy:
      fail-fast: false
      matrix:
        build: [linux-debug, linux-release, macos-debug, macos-release, windows-debug, windows-release]
        include:
          - build: linux-debug
            os: ubuntu-latest
            config: debug
          - build: linux-release
            os: ubuntu-latest
            config: release
          - build: macos-debug
            os: macos-latest
            config: debug
          - build: macos-release
            os: macos-latest
            config: release
          - build: windows-debug
            os: windows-2019
            config: debug
          - build: windows-release
            os: windows-2019
            config: release
    steps:
      - uses: actions/checkout@v2
      - uses: actions/setup-dotnet@v1
        with:
          dotnet-version: |
            6.0.x
            7.0.x
      - name: Cache Gradle packages
        uses: actions/cache@v3
        with:
          key: "unit_test_models"
          path: LLama.Unittest/Models
      # workaround for actions/setup-dotnet#155
      - name: Clear package cache
        run: dotnet clean LLamaSharp.sln && dotnet nuget locals all --clear
      - name: Restore packages
        run: dotnet restore LLamaSharp.sln
      - name: Build
        run: dotnet build LLamaSharp.sln -c ${{ matrix.config }} --no-restore
      - name: Test
        run: dotnet test LLamaSharp.sln -c ${{ matrix.config }}
3 changes: 2 additions & 1 deletion .gitignore
@@ -341,4 +341,5 @@ test/TensorFlowNET.Examples/mnist
 *.xsd
 
 # docs
-site/
+site/
+/LLama.Unittest/Models/*.bin
3 changes: 2 additions & 1 deletion LLama.Examples/NewVersion/ChatSessionStripRoleName.cs
@@ -14,7 +14,8 @@ public static void Run()
             Console.Write("Please input your model path: ");
             string modelPath = Console.ReadLine();
             var prompt = File.ReadAllText("Assets/chat-with-bob.txt").Trim();
-            InteractiveExecutor ex = new(new LLamaModel(new ModelParams(modelPath, contextSize: 1024, seed: 1337, gpuLayerCount: 5)));
+            LLamaModel model = new LLamaModel(new ModelParams(modelPath, contextSize: 1024, seed: 1337, gpuLayerCount: 5));
+            InteractiveExecutor ex = new(new LLamaModelContext(model));
             ChatSession session = new ChatSession(ex).WithOutputTransform(new LLamaTransforms.KeywordTextOutputStreamTransform(new string[] { "User:", "Bob:" }, redundancyLength: 8));
 
             Console.ForegroundColor = ConsoleColor.Yellow;
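The same one-line substitution repeats through the examples that follow: a LLamaModel now owns only the weights, while a LLamaModelContext carries the per-conversation state handed to an executor. A rough sketch of what the split is meant to enable — the constructor shapes are taken from these diffs, but the multi-context usage itself is an assumption based on the PR title and the ModelCacheService commits:

using LLama;
using LLama.Common;

// Load the weights once.
using var model = new LLamaModel(new ModelParams("path/to/model.bin", contextSize: 1024, gpuLayerCount: 5));

// Create several independent contexts over the same weights,
// each backing its own executor and conversation.
var chatContext = new LLamaModelContext(model);
var qaContext = new LLamaModelContext(model);

var chatExecutor = new InteractiveExecutor(chatContext);
var qaExecutor = new InteractiveExecutor(qaContext);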
3 changes: 2 additions & 1 deletion LLama.Examples/NewVersion/ChatSessionWithRoleName.cs
@@ -14,7 +14,8 @@ public static void Run()
             Console.Write("Please input your model path: ");
             string modelPath = Console.ReadLine();
             var prompt = File.ReadAllText("Assets/chat-with-bob.txt").Trim();
-            InteractiveExecutor ex = new(new LLamaModel(new ModelParams(modelPath, contextSize: 1024, seed: 1337, gpuLayerCount: 5)));
+            LLamaModel model = new LLamaModel(new ModelParams(modelPath, contextSize: 1024, seed: 1337, gpuLayerCount: 5));
+            InteractiveExecutor ex = new(new LLamaModelContext(model));
             ChatSession session = new ChatSession(ex); // The only change is to remove the transform for the output text stream.
 
             Console.ForegroundColor = ConsoleColor.Yellow;
3 changes: 2 additions & 1 deletion LLama.Examples/NewVersion/InstructModeExecute.cs
@@ -15,7 +15,8 @@ public static void Run()
             string modelPath = Console.ReadLine();
             var prompt = File.ReadAllText("Assets/dan.txt").Trim();
 
-            InstructExecutor ex = new(new LLamaModel(new ModelParams(modelPath, contextSize: 1024)));
+            LLamaModel model = new LLamaModel(new ModelParams(modelPath, contextSize: 1024));
+            InstructExecutor ex = new(new LLamaModelContext(model));
 
             Console.ForegroundColor = ConsoleColor.Yellow;
             Console.WriteLine("The executor has been enabled. In this example, the LLM will follow your instructions. For example, you can input \"Write a story about a fox who want to " +
3 changes: 2 additions & 1 deletion LLama.Examples/NewVersion/InteractiveModeExecute.cs
@@ -15,7 +15,8 @@ public async static Task Run()
             string modelPath = Console.ReadLine();
             var prompt = File.ReadAllText("Assets/chat-with-bob.txt").Trim();
 
-            InteractiveExecutor ex = new(new LLamaModel(new ModelParams(modelPath, contextSize: 256)));
+            LLamaModel model = new LLamaModel(new ModelParams(modelPath, contextSize: 256));
+            InteractiveExecutor ex = new(new LLamaModelContext(model));
 
             Console.ForegroundColor = ConsoleColor.Yellow;
             Console.WriteLine("The executor has been enabled. In this example, the prompt is printed, the maximum tokens is set to 128 and the context size is 256. (an example for small scale usage)");
9 changes: 6 additions & 3 deletions LLama.Examples/NewVersion/LoadAndSaveSession.cs
@@ -15,7 +15,8 @@ public static void Run()
             Console.Write("Please input your model path: ");
             string modelPath = Console.ReadLine();
             var prompt = File.ReadAllText("Assets/chat-with-bob.txt").Trim();
-            InteractiveExecutor ex = new(new LLamaModel(new ModelParams(modelPath, contextSize: 1024, seed: 1337, gpuLayerCount: 5)));
+            LLamaModel model = new LLamaModel(new ModelParams(modelPath, contextSize: 1024, seed: 1337, gpuLayerCount: 5));
+            InteractiveExecutor ex = new(new LLamaModelContext(model));
             ChatSession session = new ChatSession(ex); // The only change is to remove the transform for the output text stream.
 
             Console.ForegroundColor = ConsoleColor.Yellow;
@@ -45,8 +46,10 @@ public static void Run()
             Console.WriteLine("Saved session!");
             Console.ForegroundColor = ConsoleColor.White;
 
-            ex.Model.Dispose();
-            ex = new(new LLamaModel(new ModelParams(modelPath, contextSize: 1024, seed: 1337, gpuLayerCount: 5)));
+            ex.Context.Dispose();
+
+            //LLamaModel model = new LLamaModel(new ModelParams(modelPath, contextSize: 1024, seed: 1337, gpuLayerCount: 5));
+            ex = new(new LLamaModelContext(model));
             session = new ChatSession(ex);
             session.LoadSession(statePath);
 
11 changes: 6 additions & 5 deletions LLama.Examples/NewVersion/LoadAndSaveState.cs
@@ -15,7 +15,8 @@ public static void Run()
             string modelPath = Console.ReadLine();
             var prompt = File.ReadAllText("Assets/chat-with-bob.txt").Trim();
 
-            InteractiveExecutor ex = new(new LLamaModel(new ModelParams(modelPath, contextSize: 256)));
+            LLamaModel model = new LLamaModel(new ModelParams(modelPath, contextSize: 256));
+            InteractiveExecutor ex = new(new LLamaModelContext(model));
 
             Console.ForegroundColor = ConsoleColor.Yellow;
             Console.WriteLine("The executor has been enabled. In this example, the prompt is printed, the maximum tokens is set to 64 and the context size is 256. (an example for small scale usage)");
@@ -37,7 +38,7 @@ public static void Run()
                 {
                     Console.Write("Your path to save model state: ");
                     string modelStatePath = Console.ReadLine();
-                    ex.Model.SaveState(modelStatePath);
+                    ex.Context.SaveState(modelStatePath);
 
                     Console.Write("Your path to save executor state: ");
                     string executorStatePath = Console.ReadLine();
@@ -47,9 +48,9 @@ public static void Run()
                     Console.WriteLine("All states saved!");
                     Console.ForegroundColor = ConsoleColor.White;
 
-                    var model = ex.Model;
-                    model.LoadState(modelStatePath);
-                    ex = new InteractiveExecutor(model);
+                    var context = ex.Context;
+                    context.LoadState(modelStatePath);
+                    ex = new InteractiveExecutor(context);
                     ex.LoadState(executorStatePath);
                     Console.ForegroundColor = ConsoleColor.Yellow;
                     Console.WriteLine("Loaded state!");
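With the split, saved state belongs to the context rather than the model, so the weights stay loaded while conversation state is swapped. A rough sketch of restoring into a new context, assuming a freshly created LLamaModelContext can load state saved by another context over the same model (the file paths are illustrative):

// Persist the current context state.
ex.Context.SaveState("model-state.bin");

// Later: build a fresh context over the still-loaded model and restore into it.
var restored = new LLamaModelContext(model);
restored.LoadState("model-state.bin");
ex = new InteractiveExecutor(restored);
ex.LoadState("executor-state.bin");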
3 changes: 2 additions & 1 deletion LLama.Examples/NewVersion/StatelessModeExecute.cs
@@ -14,7 +14,8 @@ public static void Run()
             Console.Write("Please input your model path: ");
             string modelPath = Console.ReadLine();
 
-            StatelessExecutor ex = new(new LLamaModel(new ModelParams(modelPath, contextSize: 256)));
+            LLamaModel model = new LLamaModel(new ModelParams(modelPath, contextSize: 256));
+            StatelessExecutor ex = new(new LLamaModelContext(model));
 
             Console.ForegroundColor = ConsoleColor.Yellow;
             Console.WriteLine("The executor has been enabled. In this example, the inference is an one-time job. That says, the previous input and response has " +
7 changes: 5 additions & 2 deletions LLama.Unittest/BasicTest.cs
@@ -1,11 +1,14 @@
+using LLama.Common;
+
 namespace LLama.Unittest
 {
     public class BasicTest
     {
         [Fact]
-        public void SimpleQA()
+        public void LoadModel()
         {
-
+            var model = new LLamaModel(new ModelParams("Models/llama-2-7b-chat.ggmlv3.q3_K_S.bin", contextSize: 256));
+            model.Dispose();
         }
     }
 }
15 changes: 15 additions & 0 deletions LLama.Unittest/LLama.Unittest.csproj
@@ -23,8 +23,23 @@
     </PackageReference>
   </ItemGroup>
 
+  <Target Name="DownloadContentFiles" BeforeTargets="Build">
+    <DownloadFile SourceUrl="https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML/resolve/main/llama-2-7b-chat.ggmlv3.q3_K_S.bin" DestinationFolder="Models" DestinationFileName="llama-2-7b-chat.ggmlv3.q3_K_S.bin" SkipUnchangedFiles="true">
+    </DownloadFile>
+  </Target>
+
   <ItemGroup>
     <ProjectReference Include="..\LLama\LLamaSharp.csproj" />
   </ItemGroup>
 
+  <ItemGroup>
+    <Folder Include="Models\" />
+  </ItemGroup>
+
+  <ItemGroup>
+    <None Update="Models\llama-2-7b-chat.ggmlv3.q3_K_S.bin">
+      <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
+    </None>
+  </ItemGroup>
+
 </Project>
9 changes: 0 additions & 9 deletions LLama.Web/Common/LLamaOptions.cs
@@ -3,18 +3,9 @@
     public class LLamaOptions
     {
         public List<ModelOptions> Models { get; set; }
-        public List<PromptOptions> Prompts { get; set; } = new List<PromptOptions>();
-        public List<ParameterOptions> Parameters { get; set; } = new List<ParameterOptions>();
 
         public void Initialize()
         {
-            foreach (var prompt in Prompts)
-            {
-                if (File.Exists(prompt.Path))
-                {
-                    prompt.Prompt = File.ReadAllText(prompt.Path).Trim();
-                }
-            }
         }
     }
 }
31 changes: 24 additions & 7 deletions LLama.Web/Common/ModelOptions.cs
@@ -1,15 +1,32 @@
-using LLama.Common;
+using LLama.Abstractions;
 
 namespace LLama.Web.Common
 {
-    public class ModelOptions : ModelParams
+    public class ModelOptions : IModelParams
     {
-        public ModelOptions() : base("", 512, 20, 1337, true, true, false, false, "", "", -1, 512, false, false)
-        {
-        }
-
-        public string Name { get; set; }
         public int MaxInstances { get; set; }
 
+        public string Name { get; set; } = "unknown";
+        public int ContextSize { get; set; } = 512;
+        public int MainGpu { get; set; } = 0;
+        public bool LowVram { get; set; } = false;
+        public int GpuLayerCount { get; set; } = 20;
+        public int Seed { get; set; } = 1686349486;
+        public bool UseFp16Memory { get; set; } = true;
+        public bool UseMemorymap { get; set; } = true;
+        public bool UseMemoryLock { get; set; } = false;
+        public bool Perplexity { get; set; } = false;
+        public string ModelPath { get; set; }
+        public string LoraAdapter { get; set; } = string.Empty;
+        public string LoraBase { get; set; } = string.Empty;
+        public int Threads { get; set; } = Math.Max(Environment.ProcessorCount / 2, 1);
+        public int BatchSize { get; set; } = 512;
+        public bool ConvertEosToNewLine { get; set; } = false;
+        public bool EmbeddingMode { get; set; } = false;
+        public float[] TensorSplits { get; set; } = new float[] { 0 };
+        public int GroupedQueryAttention { get; set; } = 1;
+        public float RmsNormEpsilon { get; set; } = 5e-6f;
+        public float RopeFrequencyBase { get; set; } = 10000.0f;
+        public float RopeFrequencyScale { get; set; } = 1.0f;
     }
 }
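Because ModelOptions now implements IModelParams directly instead of inheriting ModelParams, a configured entry can be handed to the model without conversion. A rough sketch under the assumption (based on the b94d6a6 abstractions commit) that LLamaModel accepts any IModelParams; the option values and names are illustrative only:

var options = new ModelOptions
{
    Name = "llama2-7b-chat",          // display name used by the web app (assumed)
    ModelPath = "Models/llama-2-7b-chat.ggmlv3.q3_K_S.bin",
    ContextSize = 1024,
    GpuLayerCount = 20,
    MaxInstances = 4                  // hypothetical cap on contexts served per model
};

var model = new LLamaModel(options);  // assumes an IModelParams-based constructor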
9 changes: 0 additions & 9 deletions LLama.Web/Common/ParameterOptions.cs

This file was deleted.

41 changes: 7 additions & 34 deletions LLama.Web/Hubs/SessionConnectionHub.cs
@@ -2,7 +2,6 @@
 using LLama.Web.Models;
 using LLama.Web.Services;
 using Microsoft.AspNetCore.SignalR;
-using System.Diagnostics;
 
 namespace LLama.Web.Hubs
 {
@@ -38,15 +37,13 @@ public override async Task OnDisconnectedAsync(Exception? exception)
 
 
         [HubMethodName("LoadModel")]
-        public async Task OnLoadModel(LLamaExecutorType executorType, string modelName, string promptName, string parameterName)
+        public async Task OnLoadModel(CreateSessionModel sessionModel)
         {
-            _logger.Log(LogLevel.Information, "[OnLoadModel] - Load new model, Connection: {0}, Model: {1}, Prompt: {2}, Parameter: {3}", Context.ConnectionId, modelName, promptName, parameterName);
-
-            // Remove existing connections session
-            await _modelSessionService.RemoveAsync(Context.ConnectionId);
+            _logger.Log(LogLevel.Information, "[OnLoadModel] - Load new model, Connection: {0}", Context.ConnectionId);
+
 
             // Create model session
-            var modelSessionResult = await _modelSessionService.CreateAsync(executorType, Context.ConnectionId, modelName, promptName, parameterName);
+            var modelSessionResult = await _modelSessionService.CreateAsync(Context.ConnectionId, sessionModel);
             if (modelSessionResult.HasError)
             {
                 await Clients.Caller.OnError(modelSessionResult.Error);
@@ -63,35 +60,11 @@ public async Task OnSendPrompt(string prompt)
         {
             _logger.Log(LogLevel.Information, "[OnSendPrompt] - New prompt received, Connection: {0}", Context.ConnectionId);
 
-            // Get connections session
-            var modelSession = await _modelSessionService.GetAsync(Context.ConnectionId);
-            if (modelSession is null)
-            {
-                await Clients.Caller.OnError("No model has been loaded");
-                return;
-            }
-
-
-            // Create unique response id
-            var responseId = Guid.NewGuid().ToString();
-
-            // Send begin of response
-            await Clients.Caller.OnResponse(new ResponseFragment(responseId, isFirst: true));
-
-            // Send content of response
-            var stopwatch = Stopwatch.GetTimestamp();
-            await foreach (var fragment in modelSession.InferAsync(prompt, CancellationTokenSource.CreateLinkedTokenSource(Context.ConnectionAborted)))
+            // Send Infer response
+            await foreach (var responseFragment in _modelSessionService.InferAsync(Context.ConnectionId, prompt, CancellationTokenSource.CreateLinkedTokenSource(Context.ConnectionAborted)))
             {
-                await Clients.Caller.OnResponse(new ResponseFragment(responseId, fragment));
+                await Clients.Caller.OnResponse(responseFragment);
             }
-
-            // Send end of response
-            var elapsedTime = Stopwatch.GetElapsedTime(stopwatch);
-            var signature = modelSession.IsInferCanceled()
-                ? $"Inference cancelled after {elapsedTime.TotalSeconds:F0} seconds"
-                : $"Inference completed in {elapsedTime.TotalSeconds:F0} seconds";
-            await Clients.Caller.OnResponse(new ResponseFragment(responseId, signature, isLast: true));
-            _logger.Log(LogLevel.Information, "[OnSendPrompt] - Inference complete, Connection: {0}, Elapsed: {1}, Canceled: {2}", Context.ConnectionId, elapsedTime, modelSession.IsInferCanceled());
         }
 
     }
39 changes: 39 additions & 0 deletions LLama.Web/Models/CreateSessionModel.cs
@@ -0,0 +1,39 @@
using LLama.Abstractions;
using LLama.Common;
using LLama.Web.Common;

namespace LLama.Web.Models
{
    public class CreateSessionModel : IInferenceParams
    {
        public string Model { get; set; }
        public string Prompt { get; set; }
        public LLamaExecutorType ExecutorType { get; set; } = LLamaExecutorType.Interactive;
        public string AntiPrompt { get; set; } = string.Empty;
        public string OutputFilter { get; set; } = string.Empty;

        public int TokensKeep { get; set; } = 0;
        public int MaxTokens { get; set; } = -1;
        public IEnumerable<string> AntiPrompts { get; set; } = Array.Empty<string>();
        public string InputSuffix { get; set; } = string.Empty;
        public string InputPrefix { get; set; } = string.Empty;
        public int TopK { get; set; } = 40;
        public float TopP { get; set; } = 0.95f;
        public float TfsZ { get; set; } = 1.0f;
        public float TypicalP { get; set; } = 1.0f;
        public float Temperature { get; set; } = 0.8f;
        public float RepeatPenalty { get; set; } = 1.1f;
        public int RepeatLastTokensCount { get; set; } = 64;
        public float FrequencyPenalty { get; set; } = .0f;
        public float PresencePenalty { get; set; } = .0f;
        public MirostatType Mirostat { get; set; } = MirostatType.Disable;
        public float MirostatTau { get; set; } = 5.0f;
        public float MirostatEta { get; set; } = 0.1f;
        public bool PenalizeNL { get; set; } = true;


        // TODO: Ensure overpost protected
        public Dictionary<int, float> LogitBias { get; set; }
        public string PathSession { get; set; } = string.Empty;
    }
}
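For context, a client now sends one CreateSessionModel when loading a model instead of separate model, prompt, and parameter names. A rough sketch using the ASP.NET Core SignalR client; the hub URL, the property values, and the "OnSendPrompt" method name are assumptions (only LoadModel carries an explicit HubMethodName in this diff):

using Microsoft.AspNetCore.SignalR.Client;
using LLama.Web.Models;

var connection = new HubConnectionBuilder()
    .WithUrl("https://localhost:5001/SessionConnectionHub")   // assumed hub route
    .Build();

// Render streamed fragments and errors pushed back by the hub.
connection.On<ResponseFragment>("OnResponse", fragment => { /* append fragment to the UI */ });
connection.On<string>("OnError", error => Console.Error.WriteLine(error));

await connection.StartAsync();

await connection.InvokeAsync("LoadModel", new CreateSessionModel
{
    Model = "llama2-7b-chat",     // model name from LLamaOptions (illustrative)
    Prompt = "chat-with-bob",     // prompt name (illustrative)
    Temperature = 0.8f,
    MaxTokens = 256
});

await connection.InvokeAsync("OnSendPrompt", "Hello, who are you?");  // method name assumed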