// Copyright (c) Microsoft. All rights reserved.

using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Diagnostics.CodeAnalysis;
using System.Linq;
using System.Text;
using Microsoft.KernelMemory.AI.OpenAI;

namespace Microsoft.KernelMemory.DataFormats.Text;

/// <summary>
/// Split text in chunks, attempting to leave meaning intact.
/// For plain text, split looking at new lines first, then periods, and so on.
/// For markdown, split looking at punctuation first, and so on.
/// Unlike <see cref="TextChunker"/>, every chunk carries an opaque caller-defined
/// tag (e.g. a page number) that is propagated to the resulting splits.
/// </summary>
[Experimental("KMEXP00")]
public static class TextChunker2
{
    /// <summary>
    /// Content to be split, together with an opaque tag chosen by the caller.
    /// Content that cannot be divided in pages can be passed as a single
    /// <see cref="ChunkInfo"/> containing all the text.
    /// </summary>
    /// <param name="Content">Text of the chunk.</param>
    /// <param name="Tag">
    /// Caller-defined data associated with the chunk, e.g. a page number;
    /// it is an <see cref="object"/> because the caller can use whatever data it needs.
    /// </param>
    public record ChunkInfo(string Content, object? Tag)
    {
        /// <summary>
        /// Returns only the content: this makes it simpler to build TextChunker2 on top of TextChunker.
        /// </summary>
        /// <returns>The chunk content.</returns>
        public override string ToString()
        {
            return this.Content;
        }
    }

    private static readonly char[] s_spaceChar = { ' ' };
    private static readonly string?[] s_plaintextSplitOptions = { "\n\r", ".", "?!", ";", ":", ",", ")]}", " ", "-", null };
    private static readonly string?[] s_markdownSplitOptions = { ".", "?!", ";", ":", ",", ")]}", " ", "-", "\n\r", null };

    /// <summary>
    /// Split plain text into lines.
    /// </summary>
    /// <param name="text">Text to split.</param>
    /// <param name="tag">Tag to associate to each split (e.g. page number).</param>
    /// <param name="maxTokensPerLine">Maximum number of tokens per line.</param>
    /// <param name="tokenCounter">Function to count tokens in a string. If not supplied, the default counter will be used.</param>
    /// <returns>List of lines, each carrying the given tag.</returns>
    public static List<ChunkInfo> SplitPlainTextLines(
        string text,
        object? tag,
        int maxTokensPerLine,
        TextChunker.TokenCounter? tokenCounter = null) =>
        InternalSplitLines(
            new ChunkInfo(text, tag),
            maxTokensPerLine,
            trim: true,
            s_plaintextSplitOptions, tokenCounter);

    /// <summary>
    /// Split markdown text into lines.
    /// </summary>
    /// <param name="text">Text to split.</param>
    /// <param name="tag">Tag to associate to each split (e.g. page number). Nullable for consistency with <see cref="SplitPlainTextLines"/>.</param>
    /// <param name="maxTokensPerLine">Maximum number of tokens per line.</param>
    /// <param name="tokenCounter">Function to count tokens in a string. If not supplied, the default counter will be used.</param>
    /// <returns>List of lines, each carrying the given tag.</returns>
    public static List<ChunkInfo> SplitMarkDownLines(
        string text,
        object? tag,
        int maxTokensPerLine,
        TextChunker.TokenCounter? tokenCounter = null) =>
        InternalSplitLines(
            new ChunkInfo(text, tag),
            maxTokensPerLine,
            trim: true,
            s_markdownSplitOptions, tokenCounter);

    /// <summary>
    /// Split plain text into paragraphs.
    /// Note: in the default KM implementation, one paragraph == one partition.
    /// </summary>
    /// <param name="lines">Lines of text.</param>
    /// <param name="maxTokensPerParagraph">Maximum number of tokens per paragraph.</param>
    /// <param name="overlapTokens">Number of tokens to overlap between paragraphs.</param>
    /// <param name="chunkHeader">Text to be prepended to each individual chunk.</param>
    /// <param name="tokenCounter">Function to count tokens in a string. If not supplied, the default counter will be used.</param>
    /// <returns>List of paragraphs; each paragraph carries the tag of the first line that contributed text to it.</returns>
    public static IReadOnlyCollection<ChunkInfo> SplitPlainTextParagraphs(
        List<ChunkInfo> lines,
        int maxTokensPerParagraph,
        int overlapTokens = 0,
        string? chunkHeader = null,
        TextChunker.TokenCounter? tokenCounter = null) =>
        InternalSplitTextParagraphs(
            lines,
            maxTokensPerParagraph,
            overlapTokens,
            chunkHeader,
            static (text, maxTokens, tokenCounter) => InternalSplitLines(
                text,
                maxTokens,
                trim: false,
                s_plaintextSplitOptions,
                tokenCounter),
            tokenCounter);

    /// <summary>
    /// Split markdown text into paragraphs.
    /// </summary>
    /// <param name="lines">Lines of text.</param>
    /// <param name="maxTokensPerParagraph">Maximum number of tokens per paragraph.</param>
    /// <param name="overlapTokens">Number of tokens to overlap between paragraphs.</param>
    /// <param name="chunkHeader">Text to be prepended to each individual chunk.</param>
    /// <param name="tokenCounter">Function to count tokens in a string. If not supplied, the default counter will be used.</param>
    /// <returns>List of paragraphs; each paragraph carries the tag of the first line that contributed text to it.</returns>
    public static IReadOnlyCollection<ChunkInfo> SplitMarkdownParagraphs(
        List<ChunkInfo> lines,
        int maxTokensPerParagraph,
        int overlapTokens = 0,
        string? chunkHeader = null,
        TextChunker.TokenCounter? tokenCounter = null) =>
        InternalSplitTextParagraphs(
            lines,
            maxTokensPerParagraph,
            overlapTokens,
            chunkHeader,
            static (text, maxTokens, tokenCounter) => InternalSplitLines(
                text,
                maxTokens,
                trim: false,
                s_markdownSplitOptions,
                tokenCounter),
            tokenCounter);

    private static IReadOnlyCollection<ChunkInfo> InternalSplitTextParagraphs(
        List<ChunkInfo> lines,
        int maxTokensPerParagraph,
        int overlapTokens,
        string? chunkHeader,
        Func<ChunkInfo, int, TextChunker.TokenCounter?, List<ChunkInfo>> longLinesSplitter,
        TextChunker.TokenCounter? tokenCounter)
    {
        if (maxTokensPerParagraph <= 0)
        {
            throw new ArgumentException("maxTokensPerParagraph should be a positive number", nameof(maxTokensPerParagraph));
        }

        if (maxTokensPerParagraph <= overlapTokens)
        {
            throw new ArgumentException("overlapTokens cannot be larger than maxTokensPerParagraph", nameof(maxTokensPerParagraph));
        }

        if (lines.Count == 0)
        {
            return Array.Empty<ChunkInfo>();
        }

        var chunkHeaderTokens = chunkHeader is { Length: > 0 } ? GetTokenCount(chunkHeader, tokenCounter) : 0;

        // Reserve room for the header and the overlap carried over from the next paragraph
        var adjustedMaxTokensPerParagraph = maxTokensPerParagraph - overlapTokens - chunkHeaderTokens;

        // Split long lines first
        var truncatedLines = lines
            .SelectMany(line => longLinesSplitter(line, adjustedMaxTokensPerParagraph, tokenCounter))
            .ToArray();

        var paragraphs = BuildParagraph(truncatedLines, adjustedMaxTokensPerParagraph, tokenCounter);

        var processedParagraphs = ProcessParagraphs(
            paragraphs, adjustedMaxTokensPerParagraph, overlapTokens, chunkHeader, longLinesSplitter, tokenCounter);

        return processedParagraphs;
    }

    private static List<ChunkInfo> BuildParagraph(
        ChunkInfo[] truncatedLines,
        int maxTokensPerParagraph,
        TextChunker.TokenCounter? tokenCounter)
    {
        StringBuilder paragraphBuilder = new();
        List<ChunkInfo> paragraphs = new();

        if (truncatedLines == null || truncatedLines.Length == 0)
        {
            return paragraphs;
        }

        // paragraphTag is the tag of the line that first contributed text to the current paragraph builder.
        object? paragraphTag = truncatedLines[0].Tag;
        foreach (ChunkInfo line in truncatedLines)
        {
            if (paragraphBuilder.Length > 0)
            {
                string? paragraph = null;

                int currentCount = GetTokenCount(line, tokenCounter) + 1;
                if (currentCount < maxTokensPerParagraph)
                {
                    currentCount += GetTokenCount(paragraphBuilder.ToString(), tokenCounter);
                }

                if (currentCount >= maxTokensPerParagraph)
                {
                    // Complete the paragraph and prepare for the next
                    paragraph = paragraphBuilder.ToString();

                    paragraphs.Add(new ChunkInfo(paragraph.Trim(), paragraphTag));
                    paragraphBuilder.Clear();
                    paragraphTag = line.Tag;
                }
            }

            paragraphBuilder.AppendLine(line.Content);
        }

        if (paragraphBuilder.Length > 0)
        {
            // Add the final paragraph if there's anything remaining; its tag is the tag
            // of the first line that contributed text to it.
            paragraphs.Add(new ChunkInfo(paragraphBuilder.ToString().Trim(), paragraphTag));
        }

        return paragraphs;
    }

    private static List<ChunkInfo> ProcessParagraphs(
        List<ChunkInfo> paragraphs,
        int adjustedMaxTokensPerParagraph,
        int overlapTokens,
        string? chunkHeader,
        Func<ChunkInfo, int, TextChunker.TokenCounter?, List<ChunkInfo>> longLinesSplitter,
        TextChunker.TokenCounter? tokenCounter)
    {
        // Distribute text more evenly in the last paragraphs when the last paragraph is too short.
        if (paragraphs.Count > 1)
        {
            var lastParagraph = paragraphs[paragraphs.Count - 1];
            var secondLastParagraph = paragraphs[paragraphs.Count - 2];

            if (GetTokenCount(lastParagraph, tokenCounter) < adjustedMaxTokensPerParagraph / 4)
            {
                var lastParagraphTokens = lastParagraph.Content.Split(s_spaceChar, StringSplitOptions.RemoveEmptyEntries);
                var secondLastParagraphTokens = secondLastParagraph.Content.Split(s_spaceChar, StringSplitOptions.RemoveEmptyEntries);

                var lastParagraphTokensCount = lastParagraphTokens.Length;
                var secondLastParagraphTokensCount = secondLastParagraphTokens.Length;

                if (lastParagraphTokensCount + secondLastParagraphTokensCount <= adjustedMaxTokensPerParagraph)
                {
                    // Merge the two paragraphs; the merged chunk keeps the tag of the earlier one.
                    var newSecondLastParagraph = string.Join(" ", secondLastParagraphTokens);
                    var newLastParagraph = string.Join(" ", lastParagraphTokens);

                    paragraphs[paragraphs.Count - 2] = new ChunkInfo($"{newSecondLastParagraph} {newLastParagraph}", secondLastParagraph.Tag);
                    paragraphs.RemoveAt(paragraphs.Count - 1);
                }
            }
        }

        var processedParagraphs = new List<ChunkInfo>();
        var paragraphStringBuilder = new StringBuilder();

        for (int i = 0; i < paragraphs.Count; i++)
        {
            paragraphStringBuilder.Clear();

            if (chunkHeader is not null)
            {
                paragraphStringBuilder.Append(chunkHeader);
            }

            var paragraph = paragraphs[i];

            if (overlapTokens > 0 && i < paragraphs.Count - 1)
            {
                // Carry the beginning of the next paragraph into this one as overlap
                var nextParagraph = paragraphs[i + 1];
                var split = longLinesSplitter(nextParagraph, overlapTokens, tokenCounter);

                paragraphStringBuilder.Append(paragraph.Content);

                if (split.Count != 0)
                {
                    paragraphStringBuilder.Append(' ').Append(split[0]);
                }
            }
            else
            {
                paragraphStringBuilder.Append(paragraph.Content);
            }

            processedParagraphs.Add(new ChunkInfo(paragraphStringBuilder.ToString(), paragraph.Tag));
        }

        return processedParagraphs;
    }

    private static List<ChunkInfo> InternalSplitLines(
        ChunkInfo chunkInput,
        int maxTokensPerLine,
        bool trim,
        string?[] splitOptions,
        TextChunker.TokenCounter? tokenCounter)
    {
        var result = new List<ChunkInfo>();

        // Normalize line endings; Ordinal is the correct comparison for literal control characters
        var text = chunkInput.Content.Replace("\r\n", "\n", StringComparison.Ordinal);
        result.Add(new ChunkInfo(text, chunkInput.Tag));
        for (int i = 0; i < splitOptions.Length; i++)
        {
            int count = result.Count; // track where the original input left off
            var (splits2, inputWasSplit2) = Split(result, maxTokensPerLine, splitOptions[i].AsSpan(), trim, tokenCounter);
            result.AddRange(splits2);
            result.RemoveRange(0, count); // remove the original input
            if (!inputWasSplit2)
            {
                break;
            }
        }

        return result;
    }

    private static (List<ChunkInfo>, bool) Split(
        List<ChunkInfo> input,
        int maxTokens,
        ReadOnlySpan<char> separators,
        bool trim,
        TextChunker.TokenCounter? tokenCounter)
    {
        bool inputWasSplit = false;
        List<ChunkInfo> result = new();
        int count = input.Count;
        for (int i = 0; i < count; i++)
        {
            var currentInput = input[i];
            var (splits, split) = Split(currentInput.Content.AsSpan(), currentInput.Content, maxTokens, separators, trim, tokenCounter);
            // Each split inherits the tag of the chunk it was cut from
            result.AddRange(splits.Select(s => new ChunkInfo(s, currentInput.Tag)));
            inputWasSplit |= split;
        }

        return (result, inputWasSplit);
    }

    private static (List<string>, bool) Split(
        ReadOnlySpan<char> input,
        string? inputString,
        int maxTokens,
        ReadOnlySpan<char> separators,
        bool trim,
        TextChunker.TokenCounter? tokenCounter)
    {
        Debug.Assert(inputString is null || input.SequenceEqual(inputString.AsSpan()));
        List<string> result = new();
        var inputWasSplit = false;

        int inputTokenCount = GetTokenCount(inputString ??= input.ToString(), tokenCounter);

        if (inputTokenCount > maxTokens)
        {
            inputWasSplit = true;

            int half = input.Length / 2;
            int cutPoint = -1;

            if (separators.IsEmpty)
            {
                cutPoint = half;
            }
            else if (input.Length > 2)
            {
                // Pick the separator occurrence closest to the middle of the input
                int pos = 0;
                while (true)
                {
                    int index = input.Slice(pos, input.Length - 1 - pos).IndexOfAny(separators);
                    if (index < 0)
                    {
                        break;
                    }

                    index += pos;

                    if (Math.Abs(half - index) < Math.Abs(half - cutPoint))
                    {
                        cutPoint = index + 1;
                    }

                    pos = index + 1;
                }
            }

            if (cutPoint > 0)
            {
                var firstHalf = input.Slice(0, cutPoint);
                var secondHalf = input.Slice(cutPoint);
                if (trim)
                {
                    firstHalf = firstHalf.Trim();
                    secondHalf = secondHalf.Trim();
                }

                // Recursion
                var (splits1, split1) = Split(firstHalf, null, maxTokens, separators, trim, tokenCounter);
                result.AddRange(splits1);
                var (splits2, split2) = Split(secondHalf, null, maxTokens, separators, trim, tokenCounter);
                result.AddRange(splits2);

                inputWasSplit = split1 || split2;
                return (result, inputWasSplit);
            }
        }

        result.Add((inputString is not null, trim) switch
        {
            (true, true) => inputString!.Trim(),
            (true, false) => inputString!,
            (false, true) => input.Trim().ToString(),
            (false, false) => input.ToString(),
        });

        return (result, inputWasSplit);
    }

    private static int GetTokenCount(ChunkInfo input, TextChunker.TokenCounter? tokenCounter) => GetTokenCount(input.Content, tokenCounter);

    private static int GetTokenCount(string input, TextChunker.TokenCounter? tokenCounter)
    {
        // Fall back to GPT tokenizer if none configured
        return tokenCounter?.Invoke(input) ?? DefaultGPTTokenizer.StaticCountTokens(input);
    }
}
file = null; + + // we prefer extracting from structured data because we can leave page number + var extractedContent = uploadedFile.GeneratedFiles.FirstOrDefault(uploadedFile => uploadedFile.Value.ArtifactType == DataPipeline.ArtifactTypes.ExtractedContent); + if (extractedContent.Value != null) { - var file = generatedFile.Value; - if (file.AlreadyProcessedBy(this)) - { - this._log.LogTrace("File {0} already processed by this handler", file.Name); - continue; - } + BinaryData dataExtractedContent = await this._orchestrator.ReadFileAsync(pipeline, extractedContent.Value.Name, cancellationToken).ConfigureAwait(false); - // Partition only the original text - if (file.ArtifactType != DataPipeline.ArtifactTypes.ExtractedText) + var fileContent = dataExtractedContent.ToObjectFromJson(); + + if (fileContent != null) { - this._log.LogTrace("Skipping file {0} (not original text)", file.Name); - continue; - } + //ok lets try to deserialize the contentS + this._log.LogTrace("File {0} was processed with ExtractedContent {1}", uploadedFile.Name, extractedContent.Value.Name); + + //now we should split with a splitter that keeps track of page number. + file = extractedContent.Value; - // Use a different partitioning strategy depending on the file type - List partitions; - List sentences; - BinaryData partitionContent = await this._orchestrator.ReadFileAsync(pipeline, file.Name, cancellationToken).ConfigureAwait(false); - string partitionsMimeType = MimeTypes.PlainText; + List chunks = new(); + foreach (var content in fileContent.Sections) + { + var stringContent = content.Content; - // Skip empty partitions. Also: partitionContent.ToString() throws an exception if there are no bytes. 
- if (partitionContent.ToArray().Length == 0) { continue; } + var lines = TextChunker.SplitPlainTextLines(stringContent, maxTokensPerLine: this._options.MaxTokensPerLine, tokenCounter: this._tokenCounter); + chunks.AddRange(lines.Select(l => new TextChunker2.ChunkInfo(l, content.Number))); + } - switch (file.MimeType) + var stringPartitions = TextChunker2.SplitPlainTextParagraphs(chunks, maxTokensPerParagraph: maxTokensPerParagraph, overlapTokens: overlappingTokens, chunkHeader: chunkHeader, tokenCounter: this._tokenCounter); + partitions = stringPartitions.Select(c => new PartitionInfo(c.Content, (int?)c.Tag)).ToList(); + } + } + + if (partitions == null) + { + //old logic where we have no extracted content + foreach (KeyValuePair generatedFile in uploadedFile.GeneratedFiles) { - case MimeTypes.PlainText: + file = generatedFile.Value; + if (file.AlreadyProcessedBy(this)) { - this._log.LogDebug("Partitioning text file {0}", file.Name); - string content = partitionContent.ToString(); - sentences = TextChunker.SplitPlainTextLines(content, maxTokensPerLine: this._options.MaxTokensPerLine, tokenCounter: this._tokenCounter); - partitions = TextChunker.SplitPlainTextParagraphs( - sentences, maxTokensPerParagraph: maxTokensPerParagraph, overlapTokens: overlappingTokens, tokenCounter: this._tokenCounter, chunkHeader: chunkHeader); - break; + this._log.LogTrace("File {0} already processed by this handler", file.Name); + continue; } - case MimeTypes.MarkDown: + // Partition only the original text + if (file.ArtifactType != DataPipeline.ArtifactTypes.ExtractedText) { - this._log.LogDebug("Partitioning MarkDown file {0}", file.Name); - string content = partitionContent.ToString(); - partitionsMimeType = MimeTypes.MarkDown; - sentences = TextChunker.SplitMarkDownLines(content, maxTokensPerLine: this._options.MaxTokensPerLine, tokenCounter: this._tokenCounter); - partitions = TextChunker.SplitMarkdownParagraphs( - sentences, maxTokensPerParagraph: maxTokensPerParagraph, 
overlapTokens: overlappingTokens, tokenCounter: this._tokenCounter); - break; + this._log.LogTrace("Skipping file {0} (not original text)", file.Name); + continue; } - // TODO: add virtual/injectable logic - // TODO: see https://learn.microsoft.com/en-us/windows/win32/search/-search-ifilter-about + // Use a different partitioning strategy depending on the file type + BinaryData partitionContent = await this._orchestrator.ReadFileAsync(pipeline, file.Name, cancellationToken).ConfigureAwait(false); - default: - this._log.LogWarning("File {0} cannot be partitioned, type '{1}' not supported", file.Name, file.MimeType); - // Don't partition other files - continue; + // Skip empty partitions. Also: partitionContent.ToString() throws an exception if there are no bytes. + if (partitionContent.ToArray().Length == 0) { continue; } + + switch (file.MimeType) + { + case MimeTypes.PlainText: + { + this._log.LogDebug("Partitioning text file {0}", file.Name); + string content = partitionContent.ToString(); + sentences = TextChunker.SplitPlainTextLines(content, maxTokensPerLine: this._options.MaxTokensPerLine, tokenCounter: this._tokenCounter); + var stringPartitions = TextChunker.SplitPlainTextParagraphs( + sentences, maxTokensPerParagraph: maxTokensPerParagraph, overlapTokens: overlappingTokens, chunkHeader: chunkHeader, tokenCounter: this._tokenCounter); + + partitions = stringPartitions.Select(c => new PartitionInfo(c, null)).ToList(); + break; + } + + case MimeTypes.MarkDown: + { + this._log.LogDebug("Partitioning MarkDown file {0}", file.Name); + string content = partitionContent.ToString(); + partitionsMimeType = MimeTypes.MarkDown; + sentences = TextChunker.SplitMarkDownLines(content, maxTokensPerLine: this._options.MaxTokensPerLine, tokenCounter: this._tokenCounter); + var stringPartitions = TextChunker.SplitMarkdownParagraphs( + sentences, maxTokensPerParagraph: maxTokensPerParagraph, overlapTokens: overlappingTokens, tokenCounter: this._tokenCounter); + + partitions = 
stringPartitions.Select(c => new PartitionInfo(c, null)).ToList(); + break; + } + + // TODO: add virtual/injectable logic + // TODO: see https://learn.microsoft.com/en-us/windows/win32/search/-search-ifilter-about + + default: + this._log.LogWarning("File {0} cannot be partitioned, type '{1}' not supported", file.Name, file.MimeType); + // Don't partition other files + continue; + } } + } - if (partitions.Count == 0) { continue; } + if (partitions == null || partitions.Count == 0 || file == null) { continue; } - this._log.LogDebug("Saving {0} file partitions", partitions.Count); - for (int partitionNumber = 0; partitionNumber < partitions.Count; partitionNumber++) - { - // TODO: turn partitions in objects with more details, e.g. page number - string text = partitions[partitionNumber]; - int sectionNumber = 0; // TODO: use this to store the page number (if any) - BinaryData textData = new(text); - - int tokenCount = this._tokenCounter(text); - this._log.LogDebug("Partition size: {0} tokens", tokenCount); + this._log.LogDebug("Saving {0} file partitions", partitions.Count); + for (int partitionNumber = 0; partitionNumber < partitions.Count; partitionNumber++) + { + // TODO: turn partitions in objects with more details, e.g. page number + var partition = partitions[partitionNumber]; + string text = partition.Content; + int sectionNumber = partition.PageNumber ?? 
0; + BinaryData textData = new(text); - var destFile = uploadedFile.GetPartitionFileName(partitionNumber); - await this._orchestrator.WriteFileAsync(pipeline, destFile, textData, cancellationToken).ConfigureAwait(false); + int tokenCount = this._tokenCounter(text); + this._log.LogDebug("Partition size: {0} tokens", tokenCount); - var destFileDetails = new DataPipeline.GeneratedFileDetails - { - Id = Guid.NewGuid().ToString("N"), - ParentId = uploadedFile.Id, - Name = destFile, - Size = text.Length, - MimeType = partitionsMimeType, - ArtifactType = DataPipeline.ArtifactTypes.TextPartition, - PartitionNumber = partitionNumber, - SectionNumber = sectionNumber, - Tags = pipeline.Tags, - ContentSHA256 = textData.CalculateSHA256(), - }; - newFiles.Add(destFile, destFileDetails); - destFileDetails.MarkProcessedBy(this); - } + var destFile = uploadedFile.GetPartitionFileName(partitionNumber); + await this._orchestrator.WriteFileAsync(pipeline, destFile, textData, cancellationToken).ConfigureAwait(false); - file.MarkProcessedBy(this); + var destFileDetails = new DataPipeline.GeneratedFileDetails + { + Id = Guid.NewGuid().ToString("N"), + ParentId = uploadedFile.Id, + Name = destFile, + Size = text.Length, + MimeType = partitionsMimeType, + ArtifactType = DataPipeline.ArtifactTypes.TextPartition, + PartitionNumber = partitionNumber, + SectionNumber = sectionNumber, + Tags = pipeline.Tags, + ContentSHA256 = textData.CalculateSHA256(), + }; + newFiles.Add(destFile, destFileDetails); + destFileDetails.MarkProcessedBy(this); } + file.MarkProcessedBy(this); + // Add new files to pipeline status - foreach (var file in newFiles) + foreach (var newFile in newFiles) { - uploadedFile.GeneratedFiles.Add(file.Key, file.Value); + uploadedFile.GeneratedFiles.Add(newFile.Key, newFile.Value); } } diff --git a/service/tests/Core.UnitTests/DataFormats/Text/TextChunker2Tests.cs b/service/tests/Core.UnitTests/DataFormats/Text/TextChunker2Tests.cs new file mode 100644 index 000000000..7bff2becc 
--- /dev/null +++ b/service/tests/Core.UnitTests/DataFormats/Text/TextChunker2Tests.cs @@ -0,0 +1,882 @@ +// Copyright (c) Microsoft. All rights reserved. + +using System.Text; +using Microsoft.KernelMemory.DataFormats.Text; + +namespace Microsoft.KM.Core.UnitTests.DataFormats.Text; + +public sealed class TextChunker2Tests +{ + // Use this as the default chunker, to decouple the test from GPT3 tokenizer + private static readonly TextChunker.TokenCounter s_tokenCounter = s => (s.Length >> 2); + + [Fact] + [Trait("Category", "UnitTest")] + public void CanSplitPlainTextLines() + { + const string Input = "This is a test of the emergency broadcast system. This is only a test."; + var expected = new[] + { + "This is a test of the emergency broadcast system.", + "This is only a test." + }; + + var result = TextChunker2.SplitPlainTextLines(Input, tag: null, 15, tokenCounter: s_tokenCounter); + + Assert.Equal(expected, result.Select(o => o.Content).ToArray()); + } + + [Fact] + [Trait("Category", "UnitTest")] + public void CanSplitMarkdownParagraphs() + { + List input = new() + { + new TextChunker2.ChunkInfo("This is a test of the emergency broadcast system. This is only a test.", 1), + new TextChunker2.ChunkInfo("We repeat, this is only a test. A unit test.", 2) + }; + var expected = new[] + { + "This is a test of the emergency broadcast system.", + "This is only a test.", + "We repeat, this is only a test. A unit test." + }; + + var expectedTag = new[] + { + 1, + 1, + 2 + }; + + var result = TextChunker2.SplitMarkdownParagraphs(input, 13, tokenCounter: s_tokenCounter); + + Assert.Equal(expected, result.Select(o => o.Content).ToArray()); + Assert.Equal(expectedTag, result.Select(o => o.Tag).Cast().ToArray()); + } + + [Fact] + [Trait("Category", "UnitTest")] + public void CanSplitMarkdownParagraphsWithOverlap() + { + List input = new() + { + new TextChunker2.ChunkInfo("This is a test of the emergency broadcast system. 
This is only a test.", 1), + new TextChunker2.ChunkInfo("We repeat, this is only a test. A unit test.", 2) + }; + + var expected = new[] + { + "This is a test of the emergency broadcast system.", + "emergency broadcast system. This is only a test.", + "This is only a test. We repeat, this is only a test.", + "We repeat, this is only a test. A unit test.", + "A unit test." + }; + + var expectedTag = new[] + { + 1, + 1, + 1, + 2, + 2 + }; + + var result = TextChunker2.SplitMarkdownParagraphs(input, 15, 8, tokenCounter: s_tokenCounter); + + Assert.Equal(expected, result.Select(o => o.Content).ToArray()); + Assert.Equal(expectedTag, result.Select(o => o.Tag).Cast().ToArray()); + } + + [Fact] + [Trait("Category", "UnitTest")] + public void CanSplitTextParagraphs() + { + List input = new() + { + new TextChunker2.ChunkInfo("This is a test of the emergency broadcast system. This is only a test.", 1), + new TextChunker2.ChunkInfo("We repeat, this is only a test. A unit test.", 2) + }; + + var expected = new[] + { + "This is a test of the emergency broadcast system.", + "This is only a test.", + "We repeat, this is only a test. A unit test." + }; + + var expectedTag = new[] + { + 1, + 1, + 2 + }; + + var result = TextChunker2.SplitPlainTextParagraphs(input, 13, tokenCounter: s_tokenCounter); + + Assert.Equal(expected, result.Select(o => o.Content).ToArray()); + Assert.Equal(expectedTag, result.Select(o => o.Tag).Cast().ToArray()); + } + + [Fact] + [Trait("Category", "UnitTest")] + public void CanSplitTextParagraphsWithOverlap() + { + List input = new() + { + new TextChunker2.ChunkInfo("This is a test of the emergency broadcast system. This is only a test.", 1), + new TextChunker2.ChunkInfo("We repeat, this is only a test. A unit test.", 2) + }; + + var expected = new[] + { + "This is a test of the emergency broadcast system.", + "emergency broadcast system. This is only a test.", + "This is only a test. We repeat, this is only a test.", + "We repeat, this is only a test. 
A unit test.", + "A unit test." + }; + + var expectedTag = new[] + { + 1, + 1, + 1, + 2, + 2 + }; + + var result = TextChunker2.SplitPlainTextParagraphs(input, 15, 8, tokenCounter: s_tokenCounter); + + Assert.Equal(expected, result.Select(o => o.Content).ToArray()); + Assert.Equal(expectedTag, result.Select(o => o.Tag).Cast().ToArray()); + } + + [Fact] + [Trait("Category", "UnitTest")] + public void CanSplitMarkDownLines() + { + const string Input = "This is a test of the emergency broadcast system. This is only a test."; + var expected = new[] + { + "This is a test of the emergency broadcast system.", + "This is only a test." + }; + + var result = TextChunker2.SplitMarkDownLines(Input, tag: 42, maxTokensPerLine: 15, tokenCounter: s_tokenCounter); + + Assert.Equal(expected, result.Select(o => o.Content).ToArray()); + Assert.All(result, c => c.Tag?.Equals(42)); + } + + [Fact] + [Trait("Category", "UnitTest")] + public void CanSplitTextParagraphsWithEmptyInput() + { + List input = new(); + + var result = TextChunker2.SplitPlainTextParagraphs(input, 13, tokenCounter: s_tokenCounter); + + Assert.Empty(result); + } + + [Fact] + [Trait("Category", "UnitTest")] + public void CanSplitMarkdownParagraphsWithEmptyInput() + { + List input = new(); + + var result = TextChunker2.SplitMarkdownParagraphs(input, 13, tokenCounter: s_tokenCounter); + + Assert.Empty(result); + } + + private List ConvertToChunkInput(List input) + { + var result = new List(); + for (int i = 0; i < input.Count; i++) + { + result.Add(new TextChunker2.ChunkInfo(input[i], i + 1)); + } + return result; + } + + [Fact] + [Trait("Category", "UnitTest")] + public void CanSplitTextParagraphsEvenly() + { + List input = new() + { + "This is a test of the emergency broadcast system. This is only a test.", + "We repeat, this is only a test. A unit test.", + "A small note. And another. And once again. Seriously, this is the end. We're finished. All set. Bye.", + "Done." 
+ }; + + var expected = new[] + { + "This is a test of the emergency broadcast system.", + "This is only a test.", + "We repeat, this is only a test. A unit test.", + "A small note. And another. And once again.", + "Seriously, this is the end. We're finished. All set. Bye. Done." + }; + + var result = TextChunker2.SplitPlainTextParagraphs(this.ConvertToChunkInput(input), 15, tokenCounter: s_tokenCounter); + + Assert.Equal(expected, result.Select(o => o.Content).ToArray()); + Assert.Equal([1, 1, 2, 3, 3], result.Select(o => o.Tag).Cast()); + } + + // a plaintext example that splits on \r or \n + [Fact] + [Trait("Category", "UnitTest")] + public void CanSplitTextParagraphsOnNewlines() + { + List input = new() + { + "This is a test of the emergency broadcast system\r\nThis is only a test", + "We repeat this is only a test\nA unit test", + "A small note\nAnd another\r\nAnd once again\rSeriously this is the end\nWe're finished\nAll set\nBye\n", + "Done" + }; + + var expected = new[] + { + "This is a test of the emergency broadcast system", + "This is only a test", + "We repeat this is only a test\nA unit test", + "A small note\nAnd another\nAnd once again", + "Seriously this is the end\nWe're finished\nAll set\nBye Done", + }; + + var result = TextChunker2.SplitPlainTextParagraphs(this.ConvertToChunkInput(input), 15, tokenCounter: s_tokenCounter); + + Assert.Equal(expected, result.Select(o => o.Content).ToArray()); + Assert.Equal([1, 1, 2, 3, 3], result.Select(o => o.Tag).Cast()); + } + + // a plaintext example that splits on ? or ! + [Fact] + [Trait("Category", "UnitTest")] + public void CanSplitTextParagraphsOnPunctuation() + { + List input = new() + { + "This is a test of the emergency broadcast system. This is only a test", + "We repeat, this is only a test? A unit test", + "A small note! And another? And once again! Seriously, this is the end. We're finished. All set. Bye.", + "Done." 
+ }; + + var expected = new[] + { + "This is a test of the emergency broadcast system.", + "This is only a test", + "We repeat, this is only a test? A unit test", + "A small note! And another? And once again!", + "Seriously, this is the end.", + $"We're finished. All set. Bye.{Environment.NewLine}Done.", + }; + + var result = TextChunker2.SplitPlainTextParagraphs(this.ConvertToChunkInput(input), 15, tokenCounter: s_tokenCounter); + + Assert.Equal(expected, result.Select(o => o.Content).ToArray()); + Assert.Equal([1, 1, 2, 3, 3, 3], result.Select(o => o.Tag).Cast()); + } + + // a plaintext example that splits on ; + [Fact] + [Trait("Category", "UnitTest")] + public void CanSplitTextParagraphsOnSemicolons() + { + List input = new() + { + "This is a test of the emergency broadcast system; This is only a test", + "We repeat; this is only a test; A unit test", + "A small note; And another; And once again; Seriously, this is the end; We're finished; All set; Bye.", + "Done." + }; + + var expected = new[] + { + "This is a test of the emergency broadcast system;", + "This is only a test", + "We repeat; this is only a test; A unit test", + "A small note; And another; And once again;", + "Seriously, this is the end; We're finished; All set; Bye. Done.", + }; + + var result = TextChunker2.SplitPlainTextParagraphs(this.ConvertToChunkInput(input), 15, tokenCounter: s_tokenCounter); + + Assert.Equal(expected, result.Select(o => o.Content).ToArray()); + Assert.Equal([1, 1, 2, 3, 3], result.Select(o => o.Tag).Cast()); + } + + // a plaintext example that splits on : + [Fact] + [Trait("Category", "UnitTest")] + public void CanSplitTextParagraphsOnColons() + { + List input = new() + { + "This is a test of the emergency broadcast system: This is only a test", + "We repeat: this is only a test: A unit test", + "A small note: And another: And once again: Seriously, this is the end: We're finished: All set: Bye.", + "Done." 
+ }; + + var expected = new[] + { + "This is a test of the emergency broadcast system:", + "This is only a test", + "We repeat: this is only a test: A unit test", + "A small note: And another: And once again:", + "Seriously, this is the end: We're finished: All set: Bye. Done.", + }; + + var result = TextChunker2.SplitPlainTextParagraphs(this.ConvertToChunkInput(input), 15, tokenCounter: s_tokenCounter); + + Assert.Equal(expected, result.Select(o => o.Content).ToArray()); + Assert.Equal([1, 1, 2, 3, 3], result.Select(o => o.Tag).Cast()); + } + + // a plaintext example that splits on , + [Fact] + [Trait("Category", "UnitTest")] + public void CanSplitTextParagraphsOnCommas() + { + List input = new() + { + "This is a test of the emergency broadcast system, This is only a test", + "We repeat, this is only a test, A unit test", + "A small note, And another, And once again, Seriously, this is the end, We're finished, All set, Bye.", + "Done." + }; + + var expected = new[] + { + "This is a test of the emergency broadcast system,", + "This is only a test", + "We repeat, this is only a test, A unit test", + "A small note, And another, And once again, Seriously,", + $"this is the end, We're finished, All set, Bye.{Environment.NewLine}Done.", + }; + + var result = TextChunker2.SplitPlainTextParagraphs(this.ConvertToChunkInput(input), 15, tokenCounter: s_tokenCounter); + + Assert.Equal(expected, result.Select(o => o.Content).ToArray()); + Assert.Equal([1, 1, 2, 3, 3], result.Select(o => o.Tag).Cast()); + } + + // a plaintext example that splits on ) or ] or } + [Fact] + [Trait("Category", "UnitTest")] + public void CanSplitTextParagraphsOnClosingBrackets() + { + List input = new() + { + "This is a test of the emergency broadcast system) This is only a test", + "We repeat) this is only a test) A unit test", + "A small note] And another) And once again] Seriously this is the end} We're finished} All set} Bye.", + "Done." 
+ }; + + var expected = new[] + { + "This is a test of the emergency broadcast system)", + "This is only a test", + "We repeat) this is only a test) A unit test", + "A small note] And another) And once again]", + "Seriously this is the end} We're finished} All set} Bye. Done.", + }; + + var result = TextChunker2.SplitPlainTextParagraphs(this.ConvertToChunkInput(input), 15, tokenCounter: s_tokenCounter); + + Assert.Equal(expected, result.Select(o => o.Content).ToArray()); + Assert.Equal([1, 1, 2, 3, 3], result.Select(o => o.Tag).Cast()); + } + + // a plaintext example that splits on ' ' + [Fact] + [Trait("Category", "UnitTest")] + public void CanSplitTextParagraphsOnSpaces() + { + List input = new() + { + "This is a test of the emergency broadcast system This is only a test", + "We repeat this is only a test A unit test", + "A small note And another And once again Seriously this is the end We're finished All set Bye.", + "Done." + }; + + var expected = new[] + { + "This is a test of the emergency", + "broadcast system This is only a test", + "We repeat this is only a test A unit test", + "A small note And another And once again Seriously", + $"this is the end We're finished All set Bye.{Environment.NewLine}Done.", + }; + + var result = TextChunker2.SplitPlainTextParagraphs(this.ConvertToChunkInput(input), 15, tokenCounter: s_tokenCounter); + + Assert.Equal(expected, result.Select(o => o.Content).ToArray()); + Assert.Equal([1, 1, 2, 3, 3], result.Select(o => o.Tag).Cast()); + } + + // a plaintext example that splits on '-' + [Fact] + [Trait("Category", "UnitTest")] + public void CanSplitTextParagraphsOnHyphens() + { + List input = new() + { + "This is a test of the emergency broadcast system-This is only a test", + "We repeat-this is only a test-A unit test", + "A small note-And another-And once again-Seriously, this is the end-We're finished-All set-Bye.", + "Done." 
+ }; + + var expected = new[] + { + "This is a test of the emergency", + "broadcast system-This is only a test", + "We repeat-this is only a test-A unit test", + "A small note-And another-And once again-Seriously,", + $"this is the end-We're finished-All set-Bye.{Environment.NewLine}Done.", + }; + + var result = TextChunker2.SplitPlainTextParagraphs(this.ConvertToChunkInput(input), 15, tokenCounter: s_tokenCounter); + + Assert.Equal(expected, result.Select(o => o.Content).ToArray()); + Assert.Equal([1, 1, 2, 3, 3], result.Select(o => o.Tag).Cast()); + } + + // a plaintext example that does not have any of the above characters + [Fact] + [Trait("Category", "UnitTest")] + public void CanSplitTextParagraphsWithNoDelimiters() + { + List input = new() + { + "Thisisatestoftheemergencybroadcastsystem", + "Thisisonlyatest", + "WerepeatthisisonlyatestAunittest", + "AsmallnoteAndanotherAndonceagain", + "SeriouslythisistheendWe'refinishedAllsetByeDoneThisOneWillBeSplitToMeetTheLimit", + }; + + var expected = new[] + { + $"Thisisatestoftheemergencybroadcastsystem{Environment.NewLine}Thisisonlyatest", + "WerepeatthisisonlyatestAunittest", + "AsmallnoteAndanotherAndonceagain", + "SeriouslythisistheendWe'refinishedAllse", + "tByeDoneThisOneWillBeSplitToMeetTheLimit", + }; + + var result = TextChunker2.SplitPlainTextParagraphs(this.ConvertToChunkInput(input), 15, tokenCounter: s_tokenCounter); + + Assert.Equal(expected, result.Select(o => o.Content).ToArray()); + Assert.Equal([1, 3, 4, 5, 5], result.Select(o => o.Tag).Cast()); + } + + // a markdown example that splits on . + + // a markdown example that splits on ? or ! 
+ + // a markdown example that splits on ; + + // a markdown example that splits on : + + // a markdown example that splits on , + + // a markdown example that splits on ) or ] or } + + // a markdown example that splits on ' ' + + // a markdown example that splits on '-' + + // a markdown example that splits on '\r' or '\n' + [Fact] + [Trait("Category", "UnitTest")] + public void CanSplitMarkdownParagraphsOnNewlines() + { + List input = new() + { + "This_is_a_test_of_the_emergency_broadcast_system\r\nThis_is_only_a_test", + "We_repeat_this_is_only_a_test\nA_unit_test", + "A_small_note\nAnd_another\r\nAnd_once_again\rSeriously_this_is_the_end\nWe're_finished\nAll_set\nBye\n", + "Done" + }; + + var expected = new[] + { + "This_is_a_test_of_the_emergency_broadcast_system", + "This_is_only_a_test", + "We_repeat_this_is_only_a_test\nA_unit_test", + "A_small_note\nAnd_another\nAnd_once_again", + "Seriously_this_is_the_end\nWe're_finished\nAll_set\nBye Done", + }; + + var result = TextChunker2.SplitMarkdownParagraphs(this.ConvertToChunkInput(input), 15, tokenCounter: s_tokenCounter); + + Assert.Equal(expected, result.Select(o => o.Content).ToArray()); + Assert.Equal([1, 1, 2, 3, 3], result.Select(o => o.Tag).Cast()); + } + + // a markdown example that does not have any of the above characters + [Fact] + [Trait("Category", "UnitTest")] + public void CanSplitVeryLargeDocumentsWithoutStackOverflowing() + { +#pragma warning disable CA5394 // this test relies on repeatable pseudo-random numbers + var rand = new Random(42); + var sb = new StringBuilder(100_000 * 11); + for (int wordNum = 0; wordNum < 100_000; wordNum++) + { + int wordLength = rand.Next(1, 10); + for (int charNum = 0; charNum < wordLength; charNum++) + { + sb.Append((char)('a' + rand.Next(0, 26))); + } + + sb.Append(' '); + } + + string text = sb.ToString(); + List lines = TextChunker2.SplitPlainTextLines(text, tag: 42, 20, tokenCounter: s_tokenCounter); + var paragraphs = 
TextChunker2.SplitPlainTextParagraphs(lines, 200, tokenCounter: s_tokenCounter); + Assert.NotEmpty(paragraphs); +#pragma warning restore CA5394 + } + + [Fact] + [Trait("Category", "UnitTest")] + public void CanSplitPlainTextLinesWithCustomTokenCounter() + { + const string input = "This is a test of the emergency broadcast system. This is only a test."; + var expected = new[] + { + "This is a test of the emergency broadcast system.", + "This is only a test." + }; + + var result = TextChunker2.SplitPlainTextLines(input, tag: 42, 60, s => s.Length); + + Assert.Equal(expected, result.Select(o => o.Content).ToArray()); + Assert.Equal([42, 42], result.Select(o => o.Tag).Cast()); + } + + [Fact] + [Trait("Category", "UnitTest")] + public void CanSplitMarkdownParagraphsWithCustomTokenCounter() + { + List input = new() + { + "This is a test of the emergency broadcast system. This is only a test.", + "We repeat, this is only a test. A unit test." + }; + var expected = new[] + { + "This is a test of the emergency broadcast system.", + "This is only a test.", + "We repeat, this is only a test. A unit test." + }; + + var result = TextChunker2.SplitMarkdownParagraphs(this.ConvertToChunkInput(input), 52, tokenCounter: s => s.Length); + + Assert.Equal(expected, result.Select(o => o.Content).ToArray()); + Assert.Equal([1, 1, 2], result.Select(o => o.Tag).Cast()); + } + + [Fact] + [Trait("Category", "UnitTest")] + public void CanSplitMarkdownParagraphsWithOverlapAndCustomTokenCounter() + { + List input = new() + { + "This is a test of the emergency broadcast system. This is only a test.", + "We repeat, this is only a test. A unit test." + }; + + var expected = new[] + { + "This is a test of the emergency broadcast system.", + "emergency broadcast system. This is only a test.", + "This is only a test. We repeat, this is only a test.", + "We repeat, this is only a test. A unit test.", + "A unit test." 
+ }; + + var result = TextChunker2.SplitMarkdownParagraphs(this.ConvertToChunkInput(input), 75, 40, tokenCounter: s => s.Length); + + Assert.Equal(expected, result.Select(o => o.Content).ToArray()); + Assert.Equal([1, 1, 1, 2, 2], result.Select(o => o.Tag).Cast()); + } + + [Fact] + [Trait("Category", "UnitTest")] + public void CanSplitTextParagraphsWithCustomTokenCounter() + { + List input = new() + { + "This is a test of the emergency broadcast system. This is only a test.", + "We repeat, this is only a test. A unit test." + }; + + var expected = new[] + { + "This is a test of the emergency broadcast system.", + "This is only a test.", + "We repeat, this is only a test. A unit test." + }; + + var result = TextChunker2.SplitPlainTextParagraphs(this.ConvertToChunkInput(input), 52, tokenCounter: s => s.Length); + + Assert.Equal(expected, result.Select(o => o.Content).ToArray()); + Assert.Equal([1, 1, 2], result.Select(o => o.Tag).Cast()); + } + + [Fact] + [Trait("Category", "UnitTest")] + public void CanSplitTextParagraphsWithOverlapAndCustomTokenCounter() + { + List input = new() + { + "This is a test of the emergency broadcast system. This is only a test.", + "We repeat, this is only a test. A unit test." + }; + + var expected = new[] + { + "This is a test of the emergency broadcast system.", + "emergency broadcast system. This is only a test.", + "This is only a test. We repeat, this is only a test.", + "We repeat, this is only a test. A unit test.", + "A unit test." + }; + + var result = TextChunker2.SplitPlainTextParagraphs(this.ConvertToChunkInput(input), 75, 40, tokenCounter: s => s.Length); + + Assert.Equal(expected, result.Select(o => o.Content).ToArray()); + Assert.Equal([1, 1, 1, 2, 2], result.Select(o => o.Tag).Cast()); + } + + [Fact] + [Trait("Category", "UnitTest")] + public void CanSplitMarkDownLinesWithCustomTokenCounter() + { + const string input = "This is a test of the emergency broadcast system. 
This is only a test."; + var expected = new[] + { + "This is a test of the emergency broadcast system.", + "This is only a test." + }; + + var result = TextChunker2.SplitMarkDownLines(input, tag: 42, 60, tokenCounter: s => s.Length); + + Assert.Equal(expected, result.Select(o => o.Content).ToArray()); + Assert.Equal([42, 42], result.Select(o => o.Tag).Cast()); + } + + [Fact] + [Trait("Category", "UnitTest")] + public void CanSplitMarkdownParagraphsWithHeader() + { + const string ChunkHeader = "DOCUMENT NAME: test.txt\n\n"; + List input = new() + { + "This is a test of the emergency broadcast system. This is only a test.", + "We repeat, this is only a test. A unit test." + }; + var expected = new[] + { + $"{ChunkHeader}This is a test of the emergency broadcast system.", + $"{ChunkHeader}This is only a test.", + $"{ChunkHeader}We repeat, this is only a test. A unit test." + }; + + var result = TextChunker2.SplitMarkdownParagraphs(this.ConvertToChunkInput(input), 20, chunkHeader: ChunkHeader, tokenCounter: s_tokenCounter); + + Assert.Equal(expected, result.Select(o => o.Content).ToArray()); + Assert.Equal([1, 1, 2], result.Select(o => o.Tag).Cast()); + } + + [Fact] + [Trait("Category", "UnitTest")] + public void CanSplitMarkdownParagraphsWithOverlapAndHeader() + { + const string ChunkHeader = "DOCUMENT NAME: test.txt\n\n"; + List input = new() + { + "This is a test of the emergency broadcast system. This is only a test.", + "We repeat, this is only a test. A unit test." + }; + + var expected = new[] + { + $"{ChunkHeader}This is a test of the emergency broadcast system.", + $"{ChunkHeader}emergency broadcast system. This is only a test.", + $"{ChunkHeader}This is only a test. We repeat, this is only a test.", + $"{ChunkHeader}We repeat, this is only a test. A unit test.", + $"{ChunkHeader}A unit test." 
+ }; + + var result = TextChunker2.SplitMarkdownParagraphs(this.ConvertToChunkInput(input), 22, 8, chunkHeader: ChunkHeader, tokenCounter: s_tokenCounter); + + Assert.Equal(expected, result.Select(o => o.Content).ToArray()); + Assert.Equal([1, 1, 1, 2, 2], result.Select(o => o.Tag).Cast()); + } + + [Fact] + [Trait("Category", "UnitTest")] + public void CanSplitTextParagraphsWithHeader() + { + const string ChunkHeader = "DOCUMENT NAME: test.txt\n\n"; + List input = new() + { + "This is a test of the emergency broadcast system. This is only a test.", + "We repeat, this is only a test. A unit test." + }; + + var expected = new[] + { + $"{ChunkHeader}This is a test of the emergency broadcast system.", + $"{ChunkHeader}This is only a test.", + $"{ChunkHeader}We repeat, this is only a test. A unit test." + }; + + var result = TextChunker2.SplitPlainTextParagraphs(this.ConvertToChunkInput(input), 20, chunkHeader: ChunkHeader, tokenCounter: s_tokenCounter); + + Assert.Equal(expected, result.Select(o => o.Content).ToArray()); + Assert.Equal([1, 1, 2], result.Select(o => o.Tag).Cast()); + } + + [Fact] + [Trait("Category", "UnitTest")] + public void CanSplitTextParagraphsWithOverlapAndHeader() + { + const string ChunkHeader = "DOCUMENT NAME: test.txt\n\n"; + List input = new() + { + "This is a test of the emergency broadcast system. This is only a test.", + "We repeat, this is only a test. A unit test." + }; + + var expected = new[] + { + $"{ChunkHeader}This is a test of the emergency broadcast system.", + $"{ChunkHeader}emergency broadcast system. This is only a test.", + $"{ChunkHeader}This is only a test. We repeat, this is only a test.", + $"{ChunkHeader}We repeat, this is only a test. A unit test.", + $"{ChunkHeader}A unit test." 
+ }; + + var result = TextChunker2.SplitPlainTextParagraphs(this.ConvertToChunkInput(input), 22, 8, chunkHeader: ChunkHeader, tokenCounter: s_tokenCounter); + + Assert.Equal(expected, result.Select(o => o.Content).ToArray()); + Assert.Equal([1, 1, 1, 2, 2], result.Select(o => o.Tag).Cast()); + } + + [Fact] + [Trait("Category", "UnitTest")] + public void CanSplitMarkdownParagraphsWithHeaderAndCustomTokenCounter() + { + const string ChunkHeader = "DOCUMENT NAME: test.txt\n\n"; + List input = new() + { + "This is a test of the emergency broadcast system. This is only a test.", + "We repeat, this is only a test. A unit test." + }; + var expected = new[] + { + $"{ChunkHeader}This is a test of the emergency broadcast system.", + $"{ChunkHeader}This is only a test.", + $"{ChunkHeader}We repeat, this is only a test. A unit test." + }; + + var result = TextChunker2.SplitMarkdownParagraphs(this.ConvertToChunkInput(input), 77, chunkHeader: ChunkHeader, tokenCounter: s => s.Length); + + Assert.Equal(expected, result.Select(o => o.Content).ToArray()); + Assert.Equal([1, 1, 2], result.Select(o => o.Tag).Cast()); + } + + [Fact] + [Trait("Category", "UnitTest")] + public void CanSplitMarkdownParagraphsWithOverlapAndHeaderAndCustomTokenCounter() + { + const string ChunkHeader = "DOCUMENT NAME: test.txt\n\n"; + List input = new() + { + "This is a test of the emergency broadcast system. This is only a test.", + "We repeat, this is only a test. A unit test." + }; + + var expected = new[] + { + $"{ChunkHeader}This is a test of the emergency broadcast system.", + $"{ChunkHeader}emergency broadcast system. This is only a test.", + $"{ChunkHeader}This is only a test. We repeat, this is only a test.", + $"{ChunkHeader}We repeat, this is only a test. A unit test.", + $"{ChunkHeader}A unit test." 
+ }; + + var result = TextChunker2.SplitMarkdownParagraphs(this.ConvertToChunkInput(input), 100, 40, chunkHeader: ChunkHeader, tokenCounter: s => s.Length); + + Assert.Equal(expected, result.Select(o => o.Content).ToArray()); + Assert.Equal([1, 1, 1, 2, 2], result.Select(o => o.Tag).Cast()); + } + + [Fact] + [Trait("Category", "UnitTest")] + public void CanSplitTextParagraphsWithHeaderAndCustomTokenCounter() + { + const string ChunkHeader = "DOCUMENT NAME: test.txt\n\n"; + List input = new() + { + "This is a test of the emergency broadcast system. This is only a test.", + "We repeat, this is only a test. A unit test." + }; + + var expected = new[] + { + $"{ChunkHeader}This is a test of the emergency broadcast system.", + $"{ChunkHeader}This is only a test.", + $"{ChunkHeader}We repeat, this is only a test. A unit test." + }; + + var result = TextChunker2.SplitPlainTextParagraphs(this.ConvertToChunkInput(input), 77, chunkHeader: ChunkHeader, tokenCounter: s => s.Length); + + Assert.Equal(expected, result.Select(o => o.Content).ToArray()); + Assert.Equal([1, 1, 2], result.Select(o => o.Tag).Cast()); + } + + [Fact] + [Trait("Category", "UnitTest")] + public void CanSplitTextParagraphsWithOverlapAndHeaderAndCustomTokenCounter() + { + const string ChunkHeader = "DOCUMENT NAME: test.txt\n\n"; + List input = new() + { + "This is a test of the emergency broadcast system. This is only a test.", + "We repeat, this is only a test. A unit test." + }; + + var expected = new[] + { + $"{ChunkHeader}This is a test of the emergency broadcast system.", + $"{ChunkHeader}emergency broadcast system. This is only a test.", + $"{ChunkHeader}This is only a test. We repeat, this is only a test.", + $"{ChunkHeader}We repeat, this is only a test. A unit test.", + $"{ChunkHeader}A unit test." 
+ }; + + var result = TextChunker2.SplitPlainTextParagraphs(this.ConvertToChunkInput(input), 100, 40, chunkHeader: ChunkHeader, tokenCounter: s => s.Length); + + Assert.Equal(expected, result.Select(o => o.Content).ToArray()); + Assert.Equal([1, 1, 1, 2, 2], result.Select(o => o.Tag).Cast()); + } +}