Multiple context layers per model #76

Closed · wants to merge 39 commits

Commits (39)
- e7c1295  Move response generation from Hub to Service (saddam213, Jul 28, 2023)
- ecc8499  Merge branch 'master' into Multi_Context (saddam213, Jul 28, 2023)
- faacd26  Rename LLamaModel to LLamaModelContext (saddam213, Jul 28, 2023)
- fba0527  Separate Model and Context (saddam213, Jul 28, 2023)
- db94e4e  Add simple model cache to allow simple, multiple context (saddam213, Jul 28, 2023)
- 695b17a  Apply Lora to cached model (saddam213, Jul 28, 2023)
- a95e69b  Remove model cache, should be at a higher level (saddam213, Jul 28, 2023)
- 6cc0e3d  Move model cache to higher level, typo (saddam213, Jul 28, 2023)
- 20c7b27  Added GH action for some simple CI (martindevans, Jul 29, 2023)
- 4ddb37f  fixed capitalisation (martindevans, Jul 29, 2023)
- de0551e  Added queue fix, so that CI can pass (martindevans, Jul 29, 2023)
- c37dc85  - moved dotnet version into matrix (martindevans, Jul 29, 2023)
- 85c7537  Always installing both 6 and 7 (martindevans, Jul 29, 2023)
- 68fa27e  Cleaned up installing multiple dotnets (martindevans, Jul 29, 2023)
- 0591897  Fixed yaml syntax (martindevans, Jul 29, 2023)
- c62141b  Improved formatting of response (saddam213, Jul 30, 2023)
- 13c015b  Merge branch 'master' of https://github.com/SciSharp/LLamaSharp (saddam213, Jul 30, 2023)
- 8b42d06  - Added a folder with a 7B Llama2 model, automatically downloaded fro… (martindevans, Jul 30, 2023)
- 5bab47b  Update ModelParams (saddam213, Aug 2, 2023)
- 286bd82  Merge branch 'master' into Web_Interface_Settings (saddam213, Aug 3, 2023)
- 85b7e60  Merge branch 'master' into SplitModelContext (saddam213, Aug 3, 2023)
- 22cdd0f  Merge branch 'github_actions' of https://github.com/martindevans/LLam… (saddam213, Aug 3, 2023)
- 47f33fc  Merge branch 'master' into SplitModelContext (saddam213, Aug 3, 2023)
- 05ea664  Merge branch 'SplitModelContext' into Web_Interface_Settings (saddam213, Aug 3, 2023)
- 60f5ca5  Inject ModelCacheService (saddam213, Aug 3, 2023)
- b8bb5ae  LLamaModel to track LLamaModelContext instances (saddam213, Aug 3, 2023)
- 362812e  Async ModelCacheService (saddam213, Aug 3, 2023)
- 87fdd45  Fix incorrect docs (saddam213, Aug 3, 2023)
- c310400  Merge branch 'SplitModelContext' into Web_Interface_Settings (saddam213, Aug 3, 2023)
- fc5bd43  Create sessions with live parameters (saddam213, Aug 3, 2023)
- b94d6a6  ModelParams & InferenceParams abstractions (saddam213, Aug 3, 2023)
- ed47e37  Simplify Web interface (saddam213, Aug 4, 2023)
- 0f8c541  Update README, quick tidy up (saddam213, Aug 4, 2023)
- b3168b8  Add Web UI to README (saddam213, Aug 4, 2023)
- c3a25fc  Update CPU and CUDA_12 binaries (saddam213, Aug 4, 2023)
- 864f2f7  Merge branch 'master' of https://github.com/SciSharp/LLamaSharp (saddam213, Aug 5, 2023)
- c0235eb  Update README example to new implementation (saddam213, Aug 5, 2023)
- bca736f  Latest linux binary (saddam213, Aug 5, 2023)
- 2b5efa0  Manually marshal tensor_split for cross platform support (saddam213, Aug 5, 2023)
55 changes: 55 additions & 0 deletions .github/workflows/main.yml
@@ -0,0 +1,55 @@
name: CI
on:
  push:
    branches: [master]
  pull_request:
    branches: [master]

jobs:
  build:
    name: Test
    runs-on: ${{ matrix.os }}
    strategy:
      fail-fast: false
      matrix:
        build: [linux-debug, linux-release, macos-debug, macos-release, windows-debug, windows-release]
        include:
          - build: linux-debug
            os: ubuntu-latest
            config: debug
          - build: linux-release
            os: ubuntu-latest
            config: release
          - build: macos-debug
            os: macos-latest
            config: debug
          - build: macos-release
            os: macos-latest
            config: release
          - build: windows-debug
            os: windows-2019
            config: debug
          - build: windows-release
            os: windows-2019
            config: release
    steps:
      - uses: actions/checkout@v2
      - uses: actions/setup-dotnet@v1
        with:
          dotnet-version: |
            6.0.x
            7.0.x
      - name: Cache Gradle packages
        uses: actions/cache@v3
        with:
          key: "unit_test_models"
          path: LLama.Unittest/Models
      # workaround for actions/setup-dotnet#155
      - name: Clear package cache
        run: dotnet clean LLamaSharp.sln && dotnet nuget locals all --clear
      - name: Restore packages
        run: dotnet restore LLamaSharp.sln
      - name: Build
        run: dotnet build LLamaSharp.sln -c ${{ matrix.config }} --no-restore
      - name: Test
        run: dotnet test LLamaSharp.sln -c ${{ matrix.config }}
3 changes: 2 additions & 1 deletion .gitignore
@@ -341,4 +341,5 @@ test/TensorFlowNET.Examples/mnist
 *.xsd
 
 # docs
-site/
+site/
+/LLama.Unittest/Models/*.bin
3 changes: 2 additions & 1 deletion LLama.Examples/NewVersion/ChatSessionStripRoleName.cs
@@ -14,7 +14,8 @@ public static void Run()
             Console.Write("Please input your model path: ");
             string modelPath = Console.ReadLine();
             var prompt = File.ReadAllText("Assets/chat-with-bob.txt").Trim();
-            InteractiveExecutor ex = new(new LLamaModel(new ModelParams(modelPath, contextSize: 1024, seed: 1337, gpuLayerCount: 5)));
+            LLamaModel model = new LLamaModel(new ModelParams(modelPath, contextSize: 1024, seed: 1337, gpuLayerCount: 5));
+            InteractiveExecutor ex = new(new LLamaModelContext(model));
             ChatSession session = new ChatSession(ex).WithOutputTransform(new LLamaTransforms.KeywordTextOutputStreamTransform(new string[] { "User:", "Bob:" }, redundancyLength: 8));
 
             Console.ForegroundColor = ConsoleColor.Yellow;
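The same one-line substitution repeats through the examples that follow: a LLamaModel now owns only the weights, while a LLamaModelContext carries the per-conversation state handed to an executor. A rough sketch of what the split is meant to enable — the constructor shapes are taken from these diffs, but the multi-context usage itself is an assumption based on the PR title and the ModelCacheService commits:

using LLama;
using LLama.Common;

// Load the weights once.
using var model = new LLamaModel(new ModelParams("path/to/model.bin", contextSize: 1024, gpuLayerCount: 5));

// Create several independent contexts over the same weights,
// each backing its own executor and conversation.
var chatContext = new LLamaModelContext(model);
var qaContext = new LLamaModelContext(model);

var chatExecutor = new InteractiveExecutor(chatContext);
var qaExecutor = new InteractiveExecutor(qaContext);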
3 changes: 2 additions & 1 deletion LLama.Examples/NewVersion/ChatSessionWithRoleName.cs
@@ -14,7 +14,8 @@ public static void Run()
             Console.Write("Please input your model path: ");
             string modelPath = Console.ReadLine();
             var prompt = File.ReadAllText("Assets/chat-with-bob.txt").Trim();
-            InteractiveExecutor ex = new(new LLamaModel(new ModelParams(modelPath, contextSize: 1024, seed: 1337, gpuLayerCount: 5)));
+            LLamaModel model = new LLamaModel(new ModelParams(modelPath, contextSize: 1024, seed: 1337, gpuLayerCount: 5));
+            InteractiveExecutor ex = new(new LLamaModelContext(model));
             ChatSession session = new ChatSession(ex); // The only change is to remove the transform for the output text stream.
 
             Console.ForegroundColor = ConsoleColor.Yellow;
3 changes: 2 additions & 1 deletion LLama.Examples/NewVersion/InstructModeExecute.cs
@@ -15,7 +15,8 @@ public static void Run()
             string modelPath = Console.ReadLine();
             var prompt = File.ReadAllText("Assets/dan.txt").Trim();
 
-            InstructExecutor ex = new(new LLamaModel(new ModelParams(modelPath, contextSize: 1024)));
+            LLamaModel model = new LLamaModel(new ModelParams(modelPath, contextSize: 1024));
+            InstructExecutor ex = new(new LLamaModelContext(model));
 
             Console.ForegroundColor = ConsoleColor.Yellow;
             Console.WriteLine("The executor has been enabled. In this example, the LLM will follow your instructions. For example, you can input \"Write a story about a fox who want to " +
3 changes: 2 additions & 1 deletion LLama.Examples/NewVersion/InteractiveModeExecute.cs
@@ -15,7 +15,8 @@ public async static Task Run()
             string modelPath = Console.ReadLine();
             var prompt = File.ReadAllText("Assets/chat-with-bob.txt").Trim();
 
-            InteractiveExecutor ex = new(new LLamaModel(new ModelParams(modelPath, contextSize: 256)));
+            LLamaModel model = new LLamaModel(new ModelParams(modelPath, contextSize: 256));
+            InteractiveExecutor ex = new(new LLamaModelContext(model));
 
             Console.ForegroundColor = ConsoleColor.Yellow;
             Console.WriteLine("The executor has been enabled. In this example, the prompt is printed, the maximum tokens is set to 128 and the context size is 256. (an example for small scale usage)");
9 changes: 6 additions & 3 deletions LLama.Examples/NewVersion/LoadAndSaveSession.cs
@@ -15,7 +15,8 @@ public static void Run()
             Console.Write("Please input your model path: ");
             string modelPath = Console.ReadLine();
             var prompt = File.ReadAllText("Assets/chat-with-bob.txt").Trim();
-            InteractiveExecutor ex = new(new LLamaModel(new ModelParams(modelPath, contextSize: 1024, seed: 1337, gpuLayerCount: 5)));
+            LLamaModel model = new LLamaModel(new ModelParams(modelPath, contextSize: 1024, seed: 1337, gpuLayerCount: 5));
+            InteractiveExecutor ex = new(new LLamaModelContext(model));
             ChatSession session = new ChatSession(ex); // The only change is to remove the transform for the output text stream.
 
             Console.ForegroundColor = ConsoleColor.Yellow;
@@ -45,8 +46,10 @@ public static void Run()
             Console.WriteLine("Saved session!");
             Console.ForegroundColor = ConsoleColor.White;
 
-            ex.Model.Dispose();
-            ex = new(new LLamaModel(new ModelParams(modelPath, contextSize: 1024, seed: 1337, gpuLayerCount: 5)));
+            ex.Context.Dispose();
+
+            //LLamaModel model = new LLamaModel(new ModelParams(modelPath, contextSize: 1024, seed: 1337, gpuLayerCount: 5));
+            ex = new(new LLamaModelContext(model));
             session = new ChatSession(ex);
             session.LoadSession(statePath);
 
11 changes: 6 additions & 5 deletions LLama.Examples/NewVersion/LoadAndSaveState.cs
@@ -15,7 +15,8 @@ public static void Run()
             string modelPath = Console.ReadLine();
             var prompt = File.ReadAllText("Assets/chat-with-bob.txt").Trim();
 
-            InteractiveExecutor ex = new(new LLamaModel(new ModelParams(modelPath, contextSize: 256)));
+            LLamaModel model = new LLamaModel(new ModelParams(modelPath, contextSize: 256));
+            InteractiveExecutor ex = new(new LLamaModelContext(model));
 
             Console.ForegroundColor = ConsoleColor.Yellow;
             Console.WriteLine("The executor has been enabled. In this example, the prompt is printed, the maximum tokens is set to 64 and the context size is 256. (an example for small scale usage)");
@@ -37,7 +38,7 @@ public static void Run()
                 {
                     Console.Write("Your path to save model state: ");
                     string modelStatePath = Console.ReadLine();
-                    ex.Model.SaveState(modelStatePath);
+                    ex.Context.SaveState(modelStatePath);
 
                     Console.Write("Your path to save executor state: ");
                     string executorStatePath = Console.ReadLine();
@@ -47,9 +48,9 @@ public static void Run()
                     Console.WriteLine("All states saved!");
                     Console.ForegroundColor = ConsoleColor.White;
 
-                    var model = ex.Model;
-                    model.LoadState(modelStatePath);
-                    ex = new InteractiveExecutor(model);
+                    var context = ex.Context;
+                    context.LoadState(modelStatePath);
+                    ex = new InteractiveExecutor(context);
                     ex.LoadState(executorStatePath);
                     Console.ForegroundColor = ConsoleColor.Yellow;
                     Console.WriteLine("Loaded state!");
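With the split, saved state belongs to the context rather than the model, so the weights stay loaded while conversation state is swapped. A rough sketch of restoring into a new context, assuming a freshly created LLamaModelContext can load state saved by another context over the same model (the file paths are illustrative):

// Persist the current context state.
ex.Context.SaveState("model-state.bin");

// Later: build a fresh context over the still-loaded model and restore into it.
var restored = new LLamaModelContext(model);
restored.LoadState("model-state.bin");
ex = new InteractiveExecutor(restored);
ex.LoadState("executor-state.bin");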
3 changes: 2 additions & 1 deletion LLama.Examples/NewVersion/StatelessModeExecute.cs
@@ -14,7 +14,8 @@ public static void Run()
             Console.Write("Please input your model path: ");
             string modelPath = Console.ReadLine();
 
-            StatelessExecutor ex = new(new LLamaModel(new ModelParams(modelPath, contextSize: 256)));
+            LLamaModel model = new LLamaModel(new ModelParams(modelPath, contextSize: 256));
+            StatelessExecutor ex = new(new LLamaModelContext(model));
 
             Console.ForegroundColor = ConsoleColor.Yellow;
             Console.WriteLine("The executor has been enabled. In this example, the inference is an one-time job. That says, the previous input and response has " +
7 changes: 5 additions & 2 deletions LLama.Unittest/BasicTest.cs
@@ -1,11 +1,14 @@
+using LLama.Common;
+
 namespace LLama.Unittest
 {
     public class BasicTest
     {
         [Fact]
-        public void SimpleQA()
+        public void LoadModel()
         {
-
+            var model = new LLamaModel(new ModelParams("Models/llama-2-7b-chat.ggmlv3.q3_K_S.bin", contextSize: 256));
+            model.Dispose();
         }
     }
 }
15 changes: 15 additions & 0 deletions LLama.Unittest/LLama.Unittest.csproj
@@ -23,8 +23,23 @@
     </PackageReference>
   </ItemGroup>
 
+  <Target Name="DownloadContentFiles" BeforeTargets="Build">
+    <DownloadFile SourceUrl="https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML/resolve/main/llama-2-7b-chat.ggmlv3.q3_K_S.bin" DestinationFolder="Models" DestinationFileName="llama-2-7b-chat.ggmlv3.q3_K_S.bin" SkipUnchangedFiles="true">
+    </DownloadFile>
+  </Target>
+
   <ItemGroup>
     <ProjectReference Include="..\LLama\LLamaSharp.csproj" />
   </ItemGroup>
 
+  <ItemGroup>
+    <Folder Include="Models\" />
+  </ItemGroup>
+
+  <ItemGroup>
+    <None Update="Models\llama-2-7b-chat.ggmlv3.q3_K_S.bin">
+      <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
+    </None>
+  </ItemGroup>
+
 </Project>
9 changes: 0 additions & 9 deletions LLama.Web/Common/LLamaOptions.cs
@@ -3,18 +3,9 @@
     public class LLamaOptions
     {
         public List<ModelOptions> Models { get; set; }
-        public List<PromptOptions> Prompts { get; set; } = new List<PromptOptions>();
-        public List<ParameterOptions> Parameters { get; set; } = new List<ParameterOptions>();
 
         public void Initialize()
         {
-            foreach (var prompt in Prompts)
-            {
-                if (File.Exists(prompt.Path))
-                {
-                    prompt.Prompt = File.ReadAllText(prompt.Path).Trim();
-                }
-            }
         }
     }
 }
31 changes: 24 additions & 7 deletions LLama.Web/Common/ModelOptions.cs
@@ -1,15 +1,32 @@
-using LLama.Common;
+using LLama.Abstractions;
 
 namespace LLama.Web.Common
 {
-    public class ModelOptions : ModelParams
+    public class ModelOptions : IModelParams
     {
-        public ModelOptions() : base("", 512, 20, 1337, true, true, false, false, "", "", -1, 512, false, false)
-        {
-        }
-
-        public string Name { get; set; }
         public int MaxInstances { get; set; }
 
+        public string Name { get; set; } = "unknown";
+        public int ContextSize { get; set; } = 512;
+        public int MainGpu { get; set; } = 0;
+        public bool LowVram { get; set; } = false;
+        public int GpuLayerCount { get; set; } = 20;
+        public int Seed { get; set; } = 1686349486;
+        public bool UseFp16Memory { get; set; } = true;
+        public bool UseMemorymap { get; set; } = true;
+        public bool UseMemoryLock { get; set; } = false;
+        public bool Perplexity { get; set; } = false;
+        public string ModelPath { get; set; }
+        public string LoraAdapter { get; set; } = string.Empty;
+        public string LoraBase { get; set; } = string.Empty;
+        public int Threads { get; set; } = Math.Max(Environment.ProcessorCount / 2, 1);
+        public int BatchSize { get; set; } = 512;
+        public bool ConvertEosToNewLine { get; set; } = false;
+        public bool EmbeddingMode { get; set; } = false;
+        public float[] TensorSplits { get; set; } = new float[] { 0 };
+        public int GroupedQueryAttention { get; set; } = 1;
+        public float RmsNormEpsilon { get; set; } = 5e-6f;
+        public float RopeFrequencyBase { get; set; } = 10000.0f;
+        public float RopeFrequencyScale { get; set; } = 1.0f;
     }
 }
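Because ModelOptions now implements IModelParams directly instead of inheriting ModelParams, a configured entry can be handed to the model without conversion. A rough sketch under the assumption (based on the b94d6a6 abstractions commit) that LLamaModel accepts any IModelParams; the option values and names are illustrative only:

var options = new ModelOptions
{
    Name = "llama2-7b-chat",          // display name used by the web app (assumed)
    ModelPath = "Models/llama-2-7b-chat.ggmlv3.q3_K_S.bin",
    ContextSize = 1024,
    GpuLayerCount = 20,
    MaxInstances = 4                  // hypothetical cap on contexts served per model
};

var model = new LLamaModel(options);  // assumes an IModelParams-based constructor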
9 changes: 0 additions & 9 deletions LLama.Web/Common/ParameterOptions.cs

This file was deleted.

41 changes: 7 additions & 34 deletions LLama.Web/Hubs/SessionConnectionHub.cs
@@ -2,7 +2,6 @@
 using LLama.Web.Models;
 using LLama.Web.Services;
 using Microsoft.AspNetCore.SignalR;
-using System.Diagnostics;
 
 namespace LLama.Web.Hubs
 {
@@ -38,15 +37,13 @@ public override async Task OnDisconnectedAsync(Exception? exception)
 
 
         [HubMethodName("LoadModel")]
-        public async Task OnLoadModel(LLamaExecutorType executorType, string modelName, string promptName, string parameterName)
+        public async Task OnLoadModel(CreateSessionModel sessionModel)
         {
-            _logger.Log(LogLevel.Information, "[OnLoadModel] - Load new model, Connection: {0}, Model: {1}, Prompt: {2}, Parameter: {3}", Context.ConnectionId, modelName, promptName, parameterName);
-
-            // Remove existing connections session
-            await _modelSessionService.RemoveAsync(Context.ConnectionId);
+            _logger.Log(LogLevel.Information, "[OnLoadModel] - Load new model, Connection: {0}", Context.ConnectionId);
+
 
             // Create model session
-            var modelSessionResult = await _modelSessionService.CreateAsync(executorType, Context.ConnectionId, modelName, promptName, parameterName);
+            var modelSessionResult = await _modelSessionService.CreateAsync(Context.ConnectionId, sessionModel);
             if (modelSessionResult.HasError)
             {
                 await Clients.Caller.OnError(modelSessionResult.Error);
@@ -63,35 +60,11 @@ public async Task OnSendPrompt(string prompt)
         {
             _logger.Log(LogLevel.Information, "[OnSendPrompt] - New prompt received, Connection: {0}", Context.ConnectionId);
 
-            // Get connections session
-            var modelSession = await _modelSessionService.GetAsync(Context.ConnectionId);
-            if (modelSession is null)
-            {
-                await Clients.Caller.OnError("No model has been loaded");
-                return;
-            }
-
-
-            // Create unique response id
-            var responseId = Guid.NewGuid().ToString();
-
-            // Send begin of response
-            await Clients.Caller.OnResponse(new ResponseFragment(responseId, isFirst: true));
-
-            // Send content of response
-            var stopwatch = Stopwatch.GetTimestamp();
-            await foreach (var fragment in modelSession.InferAsync(prompt, CancellationTokenSource.CreateLinkedTokenSource(Context.ConnectionAborted)))
+            // Send Infer response
+            await foreach (var responseFragment in _modelSessionService.InferAsync(Context.ConnectionId, prompt, CancellationTokenSource.CreateLinkedTokenSource(Context.ConnectionAborted)))
             {
-                await Clients.Caller.OnResponse(new ResponseFragment(responseId, fragment));
+                await Clients.Caller.OnResponse(responseFragment);
             }
-
-            // Send end of response
-            var elapsedTime = Stopwatch.GetElapsedTime(stopwatch);
-            var signature = modelSession.IsInferCanceled()
-                ? $"Inference cancelled after {elapsedTime.TotalSeconds:F0} seconds"
-                : $"Inference completed in {elapsedTime.TotalSeconds:F0} seconds";
-            await Clients.Caller.OnResponse(new ResponseFragment(responseId, signature, isLast: true));
-            _logger.Log(LogLevel.Information, "[OnSendPrompt] - Inference complete, Connection: {0}, Elapsed: {1}, Canceled: {2}", Context.ConnectionId, elapsedTime, modelSession.IsInferCanceled());
         }
 
     }
39 changes: 39 additions & 0 deletions LLama.Web/Models/CreateSessionModel.cs
@@ -0,0 +1,39 @@
using LLama.Abstractions;
using LLama.Common;
using LLama.Web.Common;

namespace LLama.Web.Models
{
    public class CreateSessionModel : IInferenceParams
    {
        public string Model { get; set; }
        public string Prompt { get; set; }
        public LLamaExecutorType ExecutorType { get; set; } = LLamaExecutorType.Interactive;
        public string AntiPrompt { get; set; } = string.Empty;
        public string OutputFilter { get; set; } = string.Empty;

        public int TokensKeep { get; set; } = 0;
        public int MaxTokens { get; set; } = -1;
        public IEnumerable<string> AntiPrompts { get; set; } = Array.Empty<string>();
        public string InputSuffix { get; set; } = string.Empty;
        public string InputPrefix { get; set; } = string.Empty;
        public int TopK { get; set; } = 40;
        public float TopP { get; set; } = 0.95f;
        public float TfsZ { get; set; } = 1.0f;
        public float TypicalP { get; set; } = 1.0f;
        public float Temperature { get; set; } = 0.8f;
        public float RepeatPenalty { get; set; } = 1.1f;
        public int RepeatLastTokensCount { get; set; } = 64;
        public float FrequencyPenalty { get; set; } = .0f;
        public float PresencePenalty { get; set; } = .0f;
        public MirostatType Mirostat { get; set; } = MirostatType.Disable;
        public float MirostatTau { get; set; } = 5.0f;
        public float MirostatEta { get; set; } = 0.1f;
        public bool PenalizeNL { get; set; } = true;


        // TODO: Ensure overpost protected
        public Dictionary<int, float> LogitBias { get; set; }
        public string PathSession { get; set; } = string.Empty;
    }
}
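For context, a client now sends one CreateSessionModel when loading a model instead of separate model, prompt, and parameter names. A rough sketch using the ASP.NET Core SignalR client; the hub URL, the property values, and the "OnSendPrompt" method name are assumptions (only LoadModel carries an explicit HubMethodName in this diff):

using Microsoft.AspNetCore.SignalR.Client;
using LLama.Web.Models;

var connection = new HubConnectionBuilder()
    .WithUrl("https://localhost:5001/SessionConnectionHub")   // assumed hub route
    .Build();

// Render streamed fragments and errors pushed back by the hub.
connection.On<ResponseFragment>("OnResponse", fragment => { /* append fragment to the UI */ });
connection.On<string>("OnError", error => Console.Error.WriteLine(error));

await connection.StartAsync();

await connection.InvokeAsync("LoadModel", new CreateSessionModel
{
    Model = "llama2-7b-chat",     // model name from LLamaOptions (illustrative)
    Prompt = "chat-with-bob",     // prompt name (illustrative)
    Temperature = 0.8f,
    MaxTokens = 256
});

await connection.InvokeAsync("OnSendPrompt", "Hello, who are you?");  // method name assumed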