Implement context shifting in executor base #714

Merged · 3 commits merged on May 4, 2024
13 changes: 12 additions & 1 deletion LLama/LLamaContext.cs
@@ -1,4 +1,4 @@
-using LLama.Exceptions;
+using LLama.Exceptions;
using LLama.Native;
using System;
using System.Collections.Generic;
@@ -521,6 +521,17 @@ public LLamaTokenDataArray ApplyPenalty(int logits_i, IEnumerable<LLamaToken> la
return candidates_p;
}

+/// <summary>
+/// Gets whether or not the Bos token should be added.
+/// From common.cpp https://github.com/ggerganov/llama.cpp/blob/60325fa56f61c228464c9f065db3aa6a61f2156e/common/common.cpp#L2417
+/// </summary>
+/// <returns></returns>
+public bool ShouldAddBosToken()
+{
+    var addBos = NativeApi.llama_add_bos_token(NativeHandle.ModelHandle);
+    return addBos != -1 ? Convert.ToBoolean(addBos) : NativeHandle.LLamaVocabType == LLamaVocabType.SentencePiece;
+}

#region eval overloads
/// <summary>
/// </summary>
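The tri-state returned by the native llama_add_bos_token call drives ShouldAddBosToken(): 1 means the model metadata asks for a BOS token, 0 means it forbids one, and -1 means the metadata is silent. A minimal standalone sketch of that mapping (the helper name and parameters below are illustrative, not part of the PR):

static bool InterpretAddBos(int addBos, bool isSentencePieceVocab)
{
    // 1 -> true, 0 -> false; -1 falls back to the vocab-type default:
    // SentencePiece vocabularies (classic LLaMA) add BOS, BPE ones generally do not.
    return addBos != -1 ? Convert.ToBoolean(addBos) : isSentencePieceVocab;
}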
13 changes: 7 additions & 6 deletions LLama/LLamaExecutorBase.cs
@@ -1,4 +1,4 @@
-using LLama.Abstractions;
+using LLama.Abstractions;
using LLama.Common;
using LLama.Exceptions;
using LLama.Native;
@@ -195,13 +195,14 @@
// if we run out of context:
// - take the tokensToKeep first tokens from the original prompt (via n_past)
// - take half of the last (n_ctx - tokensToKeep) tokens and recompute the logits in batches
-int n_left = _pastTokensCount - tokensToKeep;
+var n_left = _pastTokensCount - tokensToKeep;
+var n_discard = n_left / 2;

-_pastTokensCount = Math.Max(1, tokensToKeep);

-// insert n_left/2 tokens at the start of embed from last_n_tokens
-_embeds.InsertRange(0, _last_n_tokens.Take(_last_n_tokens.Count - _embeds.Count).Skip((int)Context.ContextSize - n_left / 2 - _embeds.Count));
+NativeApi.llama_kv_cache_seq_rm(Context.NativeHandle, (LLamaSeqId)0, tokensToKeep, tokensToKeep + n_discard);
+NativeApi.llama_kv_cache_seq_add(Context.NativeHandle, (LLamaSeqId)0, tokensToKeep + n_discard, _pastTokensCount, -n_discard);
+
+_pastTokensCount -= n_discard;

// stop saving session if we run out of context
_pathSession = string.Empty;
}
@@ -419,13 +420,13 @@
public string? SessionFilePath { get; set; }

[JsonPropertyName("embd")]
public LLamaToken[] Embeds { get; set; }

[JsonPropertyName("embd_inps")]
public LLamaToken[] EmbedInps { get; set; }

[JsonPropertyName("session_tokens")]
public LLamaToken[] SessionTokens { get; set; }

[JsonPropertyName("last_n_tokens")]
public LLamaToken[] LastTokens { get; set; }
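To make the shift arithmetic concrete: with a full cache of 4096 past tokens and tokensToKeep = 4, n_left is 4092 and n_discard is 2046, so the cache keeps positions 0..3, frees 4..2049, and slides 2050..4095 down by 2046. A standalone sketch with those hypothetical numbers (the real code drives the llama.cpp KV cache through NativeApi):

int pastTokensCount = 4096;                       // cache is full
int tokensToKeep    = 4;                          // tokens preserved from the prompt start
int n_left    = pastTokensCount - tokensToKeep;   // 4092
int n_discard = n_left / 2;                       // 2046

// llama_kv_cache_seq_rm removes positions [tokensToKeep, tokensToKeep + n_discard),
// freeing cells 4..2049. llama_kv_cache_seq_add then shifts positions
// [tokensToKeep + n_discard, pastTokensCount) by -n_discard, moving cells
// 2050..4095 down to 4..2049 so their position ids stay consistent.
pastTokensCount -= n_discard;                     // 2050 valid positions remain
Console.WriteLine($"kept={tokensToKeep}, discarded={n_discard}, past={pastTokensCount}");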
7 changes: 5 additions & 2 deletions LLama/LLamaInstructExecutor.cs
@@ -1,4 +1,4 @@
-using LLama.Abstractions;
+using LLama.Abstractions;
using LLama.Common;
using LLama.Native;
using System;
@@ -106,7 +106,7 @@
using (var fs = new FileStream(filename, FileMode.Open, FileAccess.Read))
{
var state = await JsonSerializer.DeserializeAsync<InstructExecutorState>(fs);
await LoadState(state);
}
}

@@ -147,11 +147,11 @@
}

/// <inheritdoc />
protected override async Task<(bool, IReadOnlyList<string>)> PostProcess(IInferenceParams inferenceParams, InferStateArgs args)
{
if (_embed_inps.Count <= _consumedTokensCount)
{
if (_last_n_tokens.TokensEndsWithAnyString(args.Antiprompts, Context.NativeHandle.ModelHandle, Context.Encoding))
{
args.WaitForInput = true;
return (true, Array.Empty<string>());
@@ -186,7 +186,10 @@
_is_prompt_run = false;
if (_pastTokensCount + _embeds.Count > Context.ContextSize)
{
-HandleRunOutOfContext(inferenceParams.TokensKeep);
+// Ported from https://github.com/ggerganov/llama.cpp/blob/60325fa56f61c228464c9f065db3aa6a61f2156e/examples/main/main.cpp#L334
+// Instruct always uses input token size.
+var tokensToKeep = _embed_inps.Count;
+HandleRunOutOfContext(tokensToKeep);
}

TryReuseMatchingPrefix();
16 changes: 14 additions & 2 deletions LLama/LLamaInteractExecutor.cs
@@ -1,4 +1,4 @@
-using LLama.Common;
+using LLama.Common;
using LLama.Native;
using LLama.Abstractions;
using System;
@@ -98,7 +98,7 @@
using (var fs = new FileStream(filename, FileMode.Open, FileAccess.Read))
{
var state = await JsonSerializer.DeserializeAsync<InteractiveExecutorState>(fs);
await LoadState(state);
}
}

@@ -159,7 +159,7 @@
{
foreach (var image in Images)
{
_imageEmbedHandles.Add(SafeLlavaImageEmbedHandle.CreateFromMemory(ClipModel.NativeHandle, Context, image));
}

int imageIndex = text.IndexOf("<image>");
@@ -196,11 +196,11 @@
/// <param name="inferenceParams"></param>
/// <param name="args"></param>
/// <returns></returns>
protected override async Task<(bool, IReadOnlyList<string>)> PostProcess(IInferenceParams inferenceParams, InferStateArgs args)
{
if (_embed_inps.Count <= _consumedTokensCount)
{
if (_last_n_tokens.TokensEndsWithAnyString(args.Antiprompts, Context.NativeHandle.ModelHandle, Context.Encoding))
args.WaitForInput = true;

if (_pastTokensCount > 0 && args.WaitForInput)
@@ -231,7 +231,19 @@
_is_prompt_run = false;
if (_pastTokensCount + _embeds.Count > Context.ContextSize)
{
-HandleRunOutOfContext(inferenceParams.TokensKeep);
+// number of tokens to keep when resetting context
+// Ported from https://github.com/ggerganov/llama.cpp/blob/60325fa56f61c228464c9f065db3aa6a61f2156e/examples/main/main.cpp#L334
+var tokensToKeep = inferenceParams.TokensKeep;
+if (tokensToKeep < 0 || tokensToKeep > _embed_inps.Count)
+{
+    tokensToKeep = _embed_inps.Count;
+}
+else
+{
+    tokensToKeep += Convert.ToInt32(Context.ShouldAddBosToken()); // always keep the BOS token
+}
+
+HandleRunOutOfContext(tokensToKeep);
}

TryReuseMatchingPrefix();
@@ -247,7 +259,7 @@

// Images
foreach( var image in _imageEmbedHandles )
ClipModel.EvalImageEmbed(Context, image, ref _pastTokensCount);

// Post-image Tokens
end = Context.NativeHandle.Decode(_embeds.GetRange(_EmbedImagePosition, _embeds.Count - _EmbedImagePosition), LLamaSeqId.Zero, batch, ref _pastTokensCount);
@@ -280,7 +292,7 @@
if (!string.IsNullOrEmpty(_pathSession) && args.NeedToSaveSession)
{
args.NeedToSaveSession = false;
SaveSessionFile(_pathSession);
}

LLamaToken id;
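Where the instruct executor always keeps the whole input prompt, the interactive executor clamps the caller's TokensKeep and reserves a slot for BOS. A sketch of that rule (helper name and parameters are illustrative only):

static int ComputeTokensToKeep(int requestedKeep, int promptTokenCount, bool addsBos)
{
    // A negative or oversized request falls back to keeping the entire prompt.
    if (requestedKeep < 0 || requestedKeep > promptTokenCount)
        return promptTokenCount;

    // Otherwise reserve one extra slot so the BOS token survives the shift.
    return requestedKeep + (addsBos ? 1 : 0);
}

So TokensKeep = -1 keeps the whole prompt, while TokensKeep = 8 on a BOS-adding model keeps 9 positions.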
22 changes: 18 additions & 4 deletions LLama/LLamaStatelessExecutor.cs
@@ -1,4 +1,4 @@
-using LLama.Abstractions;
+using LLama.Abstractions;
using LLama.Common;
using System;
using System.Collections.Generic;
@@ -144,11 +144,25 @@ public async IAsyncEnumerable<string> InferAsync(string prompt, IInferenceParams
// based on this logic: https://github.com/ggerganov/llama.cpp/blob/master/examples/main/main.cpp#L497
if (n_past + tokens.Count >= Context.ContextSize)
{
-var n_left = n_past - inferenceParams.TokensKeep - 1;
+var canAddBos = Context.ShouldAddBosToken();
+var tokensKeep = inferenceParams.TokensKeep;
+
+// number of tokens to keep when resetting context
+// Ported from https://github.com/ggerganov/llama.cpp/blob/60325fa56f61c228464c9f065db3aa6a61f2156e/examples/main/main.cpp#L334
+if (tokensKeep < 0 || tokensKeep > tokens.Count)
+{
+    tokensKeep = tokens.Count;
+}
+else
+{
+    tokensKeep += Convert.ToInt32(canAddBos);
+}
+
+var n_left = n_past - tokensKeep;
var n_discard = n_left / 2;

-NativeApi.llama_kv_cache_seq_rm(Context.NativeHandle, (LLamaSeqId)0, inferenceParams.TokensKeep + 1, inferenceParams.TokensKeep + n_discard + 1);
-NativeApi.llama_kv_cache_seq_add(Context.NativeHandle, (LLamaSeqId)0, inferenceParams.TokensKeep + 1 + n_discard, n_past, -n_discard);
+NativeApi.llama_kv_cache_seq_rm(Context.NativeHandle, (LLamaSeqId)0, tokensKeep, tokensKeep + n_discard);
+NativeApi.llama_kv_cache_seq_add(Context.NativeHandle, (LLamaSeqId)0, tokensKeep + n_discard, n_past, -n_discard);

n_past -= n_discard;
}
4 changes: 3 additions & 1 deletion LLama/Native/SafeLLamaContextHandle.cs
@@ -1,4 +1,4 @@
-using System;
+using System;
using System.Collections.Generic;
using System.Runtime.InteropServices;
using System.Text;
@@ -19,6 +19,8 @@ public sealed class SafeLLamaContextHandle
/// </summary>
public int VocabCount => ThrowIfDisposed().VocabCount;

+public LLamaVocabType LLamaVocabType => ThrowIfDisposed().VocabType;

/// <summary>
/// Total number of tokens in the context
/// </summary>
Expand Down
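Taken together, these changes let a long-running generation survive context overflow instead of failing once the KV cache fills. A hypothetical end-to-end sketch (model path, context size, and prompt are placeholders; the API shapes are assumed to match LLamaSharp at the time of this PR):

using LLama;
using LLama.Common;

var parameters = new ModelParams("models/llama-2-7b.Q4_K_M.gguf") { ContextSize = 2048 };
using var model = LLamaWeights.LoadFromFile(parameters);
using var context = model.CreateContext(parameters);
var executor = new InteractiveExecutor(context);

// TokensKeep bounds how much of the original prompt survives each context shift;
// out-of-range values (e.g. -1) fall back to keeping the whole prompt.
var inferenceParams = new InferenceParams { TokensKeep = 64, AntiPrompts = new[] { "User:" } };

var longPrompt = "User: summarise everything we have discussed so far.";  // imagine many prior turns
await foreach (var text in executor.InferAsync(longPrompt, inferenceParams))
    Console.Write(text);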