// Copyright (c) Microsoft. All rights reserved.

using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Diagnostics.CodeAnalysis;
using System.Linq;
using System.Text;
using Microsoft.KernelMemory.AI.OpenAI;

namespace Microsoft.KernelMemory.DataFormats.Text;

/// <summary>
/// Split text in chunks, attempting to leave meaning intact.
/// For plain text, split looking at new lines first, then periods, and so on.
/// For markdown, split looking at punctuation first, and so on.
/// Unlike <see cref="TextChunker"/>, every chunk carries an opaque caller-defined
/// tag (e.g. a page number) that is propagated to the resulting splits.
/// </summary>
[Experimental("KMEXP00")]
public static class TextChunker2
{
    /// <summary>
    /// Content to be split, together with an opaque tag chosen by the caller.
    /// Content that cannot be divided in pages can be passed as a single
    /// <see cref="ChunkInfo"/> containing all the text.
    /// </summary>
    /// <param name="Content">Text of the chunk.</param>
    /// <param name="Tag">
    /// Caller-defined data associated with the chunk, e.g. a page number;
    /// it is an <see cref="object"/> because the caller can use whatever data it needs.
    /// </param>
    public record ChunkInfo(string Content, object? Tag)
    {
        /// <summary>
        /// Returns only the content: this makes it simpler to build TextChunker2 on top of TextChunker.
        /// </summary>
        /// <returns>The chunk content.</returns>
        public override string ToString()
        {
            return this.Content;
        }
    }

    private static readonly char[] s_spaceChar = { ' ' };
    private static readonly string?[] s_plaintextSplitOptions = { "\n\r", ".", "?!", ";", ":", ",", ")]}", " ", "-", null };
    private static readonly string?[] s_markdownSplitOptions = { ".", "?!", ";", ":", ",", ")]}", " ", "-", "\n\r", null };

    /// <summary>
    /// Split plain text into lines.
    /// </summary>
    /// <param name="text">Text to split.</param>
    /// <param name="tag">Tag to associate to each split (e.g. page number).</param>
    /// <param name="maxTokensPerLine">Maximum number of tokens per line.</param>
    /// <param name="tokenCounter">Function to count tokens in a string. If not supplied, the default counter will be used.</param>
    /// <returns>List of lines, each carrying the given tag.</returns>
    public static List<ChunkInfo> SplitPlainTextLines(
        string text,
        object? tag,
        int maxTokensPerLine,
        TextChunker.TokenCounter? tokenCounter = null) =>
        InternalSplitLines(
            new ChunkInfo(text, tag),
            maxTokensPerLine,
            trim: true,
            s_plaintextSplitOptions, tokenCounter);

    /// <summary>
    /// Split markdown text into lines.
    /// </summary>
    /// <param name="text">Text to split.</param>
    /// <param name="tag">Tag to associate to each split (e.g. page number). Nullable for consistency with <see cref="SplitPlainTextLines"/>.</param>
    /// <param name="maxTokensPerLine">Maximum number of tokens per line.</param>
    /// <param name="tokenCounter">Function to count tokens in a string. If not supplied, the default counter will be used.</param>
    /// <returns>List of lines, each carrying the given tag.</returns>
    public static List<ChunkInfo> SplitMarkDownLines(
        string text,
        object? tag,
        int maxTokensPerLine,
        TextChunker.TokenCounter? tokenCounter = null) =>
        InternalSplitLines(
            new ChunkInfo(text, tag),
            maxTokensPerLine,
            trim: true,
            s_markdownSplitOptions, tokenCounter);

    /// <summary>
    /// Split plain text into paragraphs.
    /// Note: in the default KM implementation, one paragraph == one partition.
    /// </summary>
    /// <param name="lines">Lines of text.</param>
    /// <param name="maxTokensPerParagraph">Maximum number of tokens per paragraph.</param>
    /// <param name="overlapTokens">Number of tokens to overlap between paragraphs.</param>
    /// <param name="chunkHeader">Text to be prepended to each individual chunk.</param>
    /// <param name="tokenCounter">Function to count tokens in a string. If not supplied, the default counter will be used.</param>
    /// <returns>List of paragraphs; each paragraph carries the tag of the first line that contributed text to it.</returns>
    public static IReadOnlyCollection<ChunkInfo> SplitPlainTextParagraphs(
        List<ChunkInfo> lines,
        int maxTokensPerParagraph,
        int overlapTokens = 0,
        string? chunkHeader = null,
        TextChunker.TokenCounter? tokenCounter = null) =>
        InternalSplitTextParagraphs(
            lines,
            maxTokensPerParagraph,
            overlapTokens,
            chunkHeader,
            static (text, maxTokens, tokenCounter) => InternalSplitLines(
                text,
                maxTokens,
                trim: false,
                s_plaintextSplitOptions,
                tokenCounter),
            tokenCounter);

    /// <summary>
    /// Split markdown text into paragraphs.
    /// </summary>
    /// <param name="lines">Lines of text.</param>
    /// <param name="maxTokensPerParagraph">Maximum number of tokens per paragraph.</param>
    /// <param name="overlapTokens">Number of tokens to overlap between paragraphs.</param>
    /// <param name="chunkHeader">Text to be prepended to each individual chunk.</param>
    /// <param name="tokenCounter">Function to count tokens in a string. If not supplied, the default counter will be used.</param>
    /// <returns>List of paragraphs; each paragraph carries the tag of the first line that contributed text to it.</returns>
    public static IReadOnlyCollection<ChunkInfo> SplitMarkdownParagraphs(
        List<ChunkInfo> lines,
        int maxTokensPerParagraph,
        int overlapTokens = 0,
        string? chunkHeader = null,
        TextChunker.TokenCounter? tokenCounter = null) =>
        InternalSplitTextParagraphs(
            lines,
            maxTokensPerParagraph,
            overlapTokens,
            chunkHeader,
            static (text, maxTokens, tokenCounter) => InternalSplitLines(
                text,
                maxTokens,
                trim: false,
                s_markdownSplitOptions,
                tokenCounter),
            tokenCounter);

    private static IReadOnlyCollection<ChunkInfo> InternalSplitTextParagraphs(
        List<ChunkInfo> lines,
        int maxTokensPerParagraph,
        int overlapTokens,
        string? chunkHeader,
        Func<ChunkInfo, int, TextChunker.TokenCounter?, List<ChunkInfo>> longLinesSplitter,
        TextChunker.TokenCounter? tokenCounter)
    {
        if (maxTokensPerParagraph <= 0)
        {
            throw new ArgumentException("maxTokensPerParagraph should be a positive number", nameof(maxTokensPerParagraph));
        }

        if (maxTokensPerParagraph <= overlapTokens)
        {
            throw new ArgumentException("overlapTokens cannot be larger than maxTokensPerParagraph", nameof(maxTokensPerParagraph));
        }

        if (lines.Count == 0)
        {
            return Array.Empty<ChunkInfo>();
        }

        var chunkHeaderTokens = chunkHeader is { Length: > 0 } ? GetTokenCount(chunkHeader, tokenCounter) : 0;

        // Reserve room for the header and the overlap carried over from the next paragraph
        var adjustedMaxTokensPerParagraph = maxTokensPerParagraph - overlapTokens - chunkHeaderTokens;

        // Split long lines first
        var truncatedLines = lines
            .SelectMany(line => longLinesSplitter(line, adjustedMaxTokensPerParagraph, tokenCounter))
            .ToArray();

        var paragraphs = BuildParagraph(truncatedLines, adjustedMaxTokensPerParagraph, tokenCounter);

        var processedParagraphs = ProcessParagraphs(
            paragraphs, adjustedMaxTokensPerParagraph, overlapTokens, chunkHeader, longLinesSplitter, tokenCounter);

        return processedParagraphs;
    }

    private static List<ChunkInfo> BuildParagraph(
        ChunkInfo[] truncatedLines,
        int maxTokensPerParagraph,
        TextChunker.TokenCounter? tokenCounter)
    {
        StringBuilder paragraphBuilder = new();
        List<ChunkInfo> paragraphs = new();

        if (truncatedLines == null || truncatedLines.Length == 0)
        {
            return paragraphs;
        }

        // paragraphTag is the tag of the line that first contributed text to the current paragraph builder.
        object? paragraphTag = truncatedLines[0].Tag;
        foreach (ChunkInfo line in truncatedLines)
        {
            if (paragraphBuilder.Length > 0)
            {
                string? paragraph = null;

                int currentCount = GetTokenCount(line, tokenCounter) + 1;
                if (currentCount < maxTokensPerParagraph)
                {
                    currentCount += GetTokenCount(paragraphBuilder.ToString(), tokenCounter);
                }

                if (currentCount >= maxTokensPerParagraph)
                {
                    // Complete the paragraph and prepare for the next
                    paragraph = paragraphBuilder.ToString();

                    paragraphs.Add(new ChunkInfo(paragraph.Trim(), paragraphTag));
                    paragraphBuilder.Clear();
                    paragraphTag = line.Tag;
                }
            }

            paragraphBuilder.AppendLine(line.Content);
        }

        if (paragraphBuilder.Length > 0)
        {
            // Add the final paragraph if there's anything remaining; its tag is the tag
            // of the first line that contributed text to it.
            paragraphs.Add(new ChunkInfo(paragraphBuilder.ToString().Trim(), paragraphTag));
        }

        return paragraphs;
    }

    private static List<ChunkInfo> ProcessParagraphs(
        List<ChunkInfo> paragraphs,
        int adjustedMaxTokensPerParagraph,
        int overlapTokens,
        string? chunkHeader,
        Func<ChunkInfo, int, TextChunker.TokenCounter?, List<ChunkInfo>> longLinesSplitter,
        TextChunker.TokenCounter? tokenCounter)
    {
        // Distribute text more evenly in the last paragraphs when the last paragraph is too short.
        if (paragraphs.Count > 1)
        {
            var lastParagraph = paragraphs[paragraphs.Count - 1];
            var secondLastParagraph = paragraphs[paragraphs.Count - 2];

            if (GetTokenCount(lastParagraph, tokenCounter) < adjustedMaxTokensPerParagraph / 4)
            {
                var lastParagraphTokens = lastParagraph.Content.Split(s_spaceChar, StringSplitOptions.RemoveEmptyEntries);
                var secondLastParagraphTokens = secondLastParagraph.Content.Split(s_spaceChar, StringSplitOptions.RemoveEmptyEntries);

                var lastParagraphTokensCount = lastParagraphTokens.Length;
                var secondLastParagraphTokensCount = secondLastParagraphTokens.Length;

                if (lastParagraphTokensCount + secondLastParagraphTokensCount <= adjustedMaxTokensPerParagraph)
                {
                    // Merge the two paragraphs; the merged chunk keeps the tag of the earlier one.
                    var newSecondLastParagraph = string.Join(" ", secondLastParagraphTokens);
                    var newLastParagraph = string.Join(" ", lastParagraphTokens);

                    paragraphs[paragraphs.Count - 2] = new ChunkInfo($"{newSecondLastParagraph} {newLastParagraph}", secondLastParagraph.Tag);
                    paragraphs.RemoveAt(paragraphs.Count - 1);
                }
            }
        }

        var processedParagraphs = new List<ChunkInfo>();
        var paragraphStringBuilder = new StringBuilder();

        for (int i = 0; i < paragraphs.Count; i++)
        {
            paragraphStringBuilder.Clear();

            if (chunkHeader is not null)
            {
                paragraphStringBuilder.Append(chunkHeader);
            }

            var paragraph = paragraphs[i];

            if (overlapTokens > 0 && i < paragraphs.Count - 1)
            {
                // Carry the beginning of the next paragraph into this one as overlap
                var nextParagraph = paragraphs[i + 1];
                var split = longLinesSplitter(nextParagraph, overlapTokens, tokenCounter);

                paragraphStringBuilder.Append(paragraph.Content);

                if (split.Count != 0)
                {
                    paragraphStringBuilder.Append(' ').Append(split[0]);
                }
            }
            else
            {
                paragraphStringBuilder.Append(paragraph.Content);
            }

            processedParagraphs.Add(new ChunkInfo(paragraphStringBuilder.ToString(), paragraph.Tag));
        }

        return processedParagraphs;
    }

    private static List<ChunkInfo> InternalSplitLines(
        ChunkInfo chunkInput,
        int maxTokensPerLine,
        bool trim,
        string?[] splitOptions,
        TextChunker.TokenCounter? tokenCounter)
    {
        var result = new List<ChunkInfo>();

        // Normalize line endings; Ordinal is the correct comparison for literal control characters
        var text = chunkInput.Content.Replace("\r\n", "\n", StringComparison.Ordinal);
        result.Add(new ChunkInfo(text, chunkInput.Tag));
        for (int i = 0; i < splitOptions.Length; i++)
        {
            int count = result.Count; // track where the original input left off
            var (splits2, inputWasSplit2) = Split(result, maxTokensPerLine, splitOptions[i].AsSpan(), trim, tokenCounter);
            result.AddRange(splits2);
            result.RemoveRange(0, count); // remove the original input
            if (!inputWasSplit2)
            {
                break;
            }
        }

        return result;
    }

    private static (List<ChunkInfo>, bool) Split(
        List<ChunkInfo> input,
        int maxTokens,
        ReadOnlySpan<char> separators,
        bool trim,
        TextChunker.TokenCounter? tokenCounter)
    {
        bool inputWasSplit = false;
        List<ChunkInfo> result = new();
        int count = input.Count;
        for (int i = 0; i < count; i++)
        {
            var currentInput = input[i];
            var (splits, split) = Split(currentInput.Content.AsSpan(), currentInput.Content, maxTokens, separators, trim, tokenCounter);
            // Each split inherits the tag of the chunk it was cut from
            result.AddRange(splits.Select(s => new ChunkInfo(s, currentInput.Tag)));
            inputWasSplit |= split;
        }

        return (result, inputWasSplit);
    }

    private static (List<string>, bool) Split(
        ReadOnlySpan<char> input,
        string? inputString,
        int maxTokens,
        ReadOnlySpan<char> separators,
        bool trim,
        TextChunker.TokenCounter? tokenCounter)
    {
        Debug.Assert(inputString is null || input.SequenceEqual(inputString.AsSpan()));
        List<string> result = new();
        var inputWasSplit = false;

        int inputTokenCount = GetTokenCount(inputString ??= input.ToString(), tokenCounter);

        if (inputTokenCount > maxTokens)
        {
            inputWasSplit = true;

            int half = input.Length / 2;
            int cutPoint = -1;

            if (separators.IsEmpty)
            {
                cutPoint = half;
            }
            else if (input.Length > 2)
            {
                // Pick the separator occurrence closest to the middle of the input
                int pos = 0;
                while (true)
                {
                    int index = input.Slice(pos, input.Length - 1 - pos).IndexOfAny(separators);
                    if (index < 0)
                    {
                        break;
                    }

                    index += pos;

                    if (Math.Abs(half - index) < Math.Abs(half - cutPoint))
                    {
                        cutPoint = index + 1;
                    }

                    pos = index + 1;
                }
            }

            if (cutPoint > 0)
            {
                var firstHalf = input.Slice(0, cutPoint);
                var secondHalf = input.Slice(cutPoint);
                if (trim)
                {
                    firstHalf = firstHalf.Trim();
                    secondHalf = secondHalf.Trim();
                }

                // Recursion
                var (splits1, split1) = Split(firstHalf, null, maxTokens, separators, trim, tokenCounter);
                result.AddRange(splits1);
                var (splits2, split2) = Split(secondHalf, null, maxTokens, separators, trim, tokenCounter);
                result.AddRange(splits2);

                inputWasSplit = split1 || split2;
                return (result, inputWasSplit);
            }
        }

        result.Add((inputString is not null, trim) switch
        {
            (true, true) => inputString!.Trim(),
            (true, false) => inputString!,
            (false, true) => input.Trim().ToString(),
            (false, false) => input.ToString(),
        });

        return (result, inputWasSplit);
    }

    private static int GetTokenCount(ChunkInfo input, TextChunker.TokenCounter? tokenCounter) => GetTokenCount(input.Content, tokenCounter);

    private static int GetTokenCount(string input, TextChunker.TokenCounter? tokenCounter)
    {
        // Fall back to GPT tokenizer if none configured
        return tokenCounter?.Invoke(input) ?? DefaultGPTTokenizer.StaticCountTokens(input);
    }
}
file = null; + + // we prefer extracting from structured data because we can leave page number + var extractedContent = uploadedFile.GeneratedFiles.FirstOrDefault(uploadedFile => uploadedFile.Value.ArtifactType == DataPipeline.ArtifactTypes.ExtractedContent); + if (extractedContent.Value != null) { - var file = generatedFile.Value; - if (file.AlreadyProcessedBy(this)) - { - this._log.LogTrace("File {0} already processed by this handler", file.Name); - continue; - } + BinaryData dataExtractedContent = await this._orchestrator.ReadFileAsync(pipeline, extractedContent.Value.Name, cancellationToken).ConfigureAwait(false); - // Partition only the original text - if (file.ArtifactType != DataPipeline.ArtifactTypes.ExtractedText) + var fileContent = dataExtractedContent.ToObjectFromJson(); + + if (fileContent != null) { - this._log.LogTrace("Skipping file {0} (not original text)", file.Name); - continue; - } + //ok lets try to deserialize the contentS + this._log.LogTrace("File {0} was processed with ExtractedContent {1}", uploadedFile.Name, extractedContent.Value.Name); + + //now we should split with a splitter that keeps track of page number. + file = extractedContent.Value; - // Use a different partitioning strategy depending on the file type - List partitions; - List sentences; - BinaryData partitionContent = await this._orchestrator.ReadFileAsync(pipeline, file.Name, cancellationToken).ConfigureAwait(false); - string partitionsMimeType = MimeTypes.PlainText; + List chunks = new(); + foreach (var content in fileContent.Sections) + { + var stringContent = content.Content; - // Skip empty partitions. Also: partitionContent.ToString() throws an exception if there are no bytes. 
- if (partitionContent.ToArray().Length == 0) { continue; } + var lines = TextChunker.SplitPlainTextLines(stringContent, maxTokensPerLine: this._options.MaxTokensPerLine, tokenCounter: this._tokenCounter); + chunks.AddRange(lines.Select(l => new TextChunker2.ChunkInfo(l, content.Number))); + } - switch (file.MimeType) + var stringPartitions = TextChunker2.SplitPlainTextParagraphs(chunks, maxTokensPerParagraph: maxTokensPerParagraph, overlapTokens: overlappingTokens, chunkHeader: chunkHeader, tokenCounter: this._tokenCounter); + partitions = stringPartitions.Select(c => new PartitionInfo(c.Content, (int?)c.Tag)).ToList(); + } + } + + if (partitions == null) + { + //old logic where we have no extracted content + foreach (KeyValuePair generatedFile in uploadedFile.GeneratedFiles) { - case MimeTypes.PlainText: + file = generatedFile.Value; + if (file.AlreadyProcessedBy(this)) { - this._log.LogDebug("Partitioning text file {0}", file.Name); - string content = partitionContent.ToString(); - sentences = TextChunker.SplitPlainTextLines(content, maxTokensPerLine: this._options.MaxTokensPerLine, tokenCounter: this._tokenCounter); - partitions = TextChunker.SplitPlainTextParagraphs( - sentences, maxTokensPerParagraph: maxTokensPerParagraph, overlapTokens: overlappingTokens, tokenCounter: this._tokenCounter, chunkHeader: chunkHeader); - break; + this._log.LogTrace("File {0} already processed by this handler", file.Name); + continue; } - case MimeTypes.MarkDown: + // Partition only the original text + if (file.ArtifactType != DataPipeline.ArtifactTypes.ExtractedText) { - this._log.LogDebug("Partitioning MarkDown file {0}", file.Name); - string content = partitionContent.ToString(); - partitionsMimeType = MimeTypes.MarkDown; - sentences = TextChunker.SplitMarkDownLines(content, maxTokensPerLine: this._options.MaxTokensPerLine, tokenCounter: this._tokenCounter); - partitions = TextChunker.SplitMarkdownParagraphs( - sentences, maxTokensPerParagraph: maxTokensPerParagraph, 
overlapTokens: overlappingTokens, tokenCounter: this._tokenCounter); - break; + this._log.LogTrace("Skipping file {0} (not original text)", file.Name); + continue; } - // TODO: add virtual/injectable logic - // TODO: see https://learn.microsoft.com/en-us/windows/win32/search/-search-ifilter-about + // Use a different partitioning strategy depending on the file type + BinaryData partitionContent = await this._orchestrator.ReadFileAsync(pipeline, file.Name, cancellationToken).ConfigureAwait(false); - default: - this._log.LogWarning("File {0} cannot be partitioned, type '{1}' not supported", file.Name, file.MimeType); - // Don't partition other files - continue; + // Skip empty partitions. Also: partitionContent.ToString() throws an exception if there are no bytes. + if (partitionContent.ToArray().Length == 0) { continue; } + + switch (file.MimeType) + { + case MimeTypes.PlainText: + { + this._log.LogDebug("Partitioning text file {0}", file.Name); + string content = partitionContent.ToString(); + sentences = TextChunker.SplitPlainTextLines(content, maxTokensPerLine: this._options.MaxTokensPerLine, tokenCounter: this._tokenCounter); + var stringPartitions = TextChunker.SplitPlainTextParagraphs( + sentences, maxTokensPerParagraph: maxTokensPerParagraph, overlapTokens: overlappingTokens, chunkHeader: chunkHeader, tokenCounter: this._tokenCounter); + + partitions = stringPartitions.Select(c => new PartitionInfo(c, null)).ToList(); + break; + } + + case MimeTypes.MarkDown: + { + this._log.LogDebug("Partitioning MarkDown file {0}", file.Name); + string content = partitionContent.ToString(); + partitionsMimeType = MimeTypes.MarkDown; + sentences = TextChunker.SplitMarkDownLines(content, maxTokensPerLine: this._options.MaxTokensPerLine, tokenCounter: this._tokenCounter); + var stringPartitions = TextChunker.SplitMarkdownParagraphs( + sentences, maxTokensPerParagraph: maxTokensPerParagraph, overlapTokens: overlappingTokens, tokenCounter: this._tokenCounter); + + partitions = 
stringPartitions.Select(c => new PartitionInfo(c, null)).ToList(); + break; + } + + // TODO: add virtual/injectable logic + // TODO: see https://learn.microsoft.com/en-us/windows/win32/search/-search-ifilter-about + + default: + this._log.LogWarning("File {0} cannot be partitioned, type '{1}' not supported", file.Name, file.MimeType); + // Don't partition other files + continue; + } } + } - if (partitions.Count == 0) { continue; } + if (partitions == null || partitions.Count == 0 || file == null) { continue; } - this._log.LogDebug("Saving {0} file partitions", partitions.Count); - for (int partitionNumber = 0; partitionNumber < partitions.Count; partitionNumber++) - { - // TODO: turn partitions in objects with more details, e.g. page number - string text = partitions[partitionNumber]; - int sectionNumber = 0; // TODO: use this to store the page number (if any) - BinaryData textData = new(text); - - int tokenCount = this._tokenCounter(text); - this._log.LogDebug("Partition size: {0} tokens", tokenCount); + this._log.LogDebug("Saving {0} file partitions", partitions.Count); + for (int partitionNumber = 0; partitionNumber < partitions.Count; partitionNumber++) + { + // TODO: turn partitions in objects with more details, e.g. page number + var partition = partitions[partitionNumber]; + string text = partition.Content; + int sectionNumber = partition.PageNumber ?? 
0; + BinaryData textData = new(text); - var destFile = uploadedFile.GetPartitionFileName(partitionNumber); - await this._orchestrator.WriteFileAsync(pipeline, destFile, textData, cancellationToken).ConfigureAwait(false); + int tokenCount = this._tokenCounter(text); + this._log.LogDebug("Partition size: {0} tokens", tokenCount); - var destFileDetails = new DataPipeline.GeneratedFileDetails - { - Id = Guid.NewGuid().ToString("N"), - ParentId = uploadedFile.Id, - Name = destFile, - Size = text.Length, - MimeType = partitionsMimeType, - ArtifactType = DataPipeline.ArtifactTypes.TextPartition, - PartitionNumber = partitionNumber, - SectionNumber = sectionNumber, - Tags = pipeline.Tags, - ContentSHA256 = textData.CalculateSHA256(), - }; - newFiles.Add(destFile, destFileDetails); - destFileDetails.MarkProcessedBy(this); - } + var destFile = uploadedFile.GetPartitionFileName(partitionNumber); + await this._orchestrator.WriteFileAsync(pipeline, destFile, textData, cancellationToken).ConfigureAwait(false); - file.MarkProcessedBy(this); + var destFileDetails = new DataPipeline.GeneratedFileDetails + { + Id = Guid.NewGuid().ToString("N"), + ParentId = uploadedFile.Id, + Name = destFile, + Size = text.Length, + MimeType = partitionsMimeType, + ArtifactType = DataPipeline.ArtifactTypes.TextPartition, + PartitionNumber = partitionNumber, + SectionNumber = sectionNumber, + Tags = pipeline.Tags, + ContentSHA256 = textData.CalculateSHA256(), + }; + newFiles.Add(destFile, destFileDetails); + destFileDetails.MarkProcessedBy(this); } + file.MarkProcessedBy(this); + // Add new files to pipeline status - foreach (var file in newFiles) + foreach (var newFile in newFiles) { - uploadedFile.GeneratedFiles.Add(file.Key, file.Value); + uploadedFile.GeneratedFiles.Add(newFile.Key, newFile.Value); } } diff --git a/service/tests/Core.UnitTests/DataFormats/Text/TextChunker2Tests.cs b/service/tests/Core.UnitTests/DataFormats/Text/TextChunker2Tests.cs new file mode 100644 index 000000000..7bff2becc 
--- /dev/null +++ b/service/tests/Core.UnitTests/DataFormats/Text/TextChunker2Tests.cs @@ -0,0 +1,882 @@ +// Copyright (c) Microsoft. All rights reserved. + +using System.Text; +using Microsoft.KernelMemory.DataFormats.Text; + +namespace Microsoft.KM.Core.UnitTests.DataFormats.Text; + +public sealed class TextChunker2Tests +{ + // Use this as the default chunker, to decouple the test from GPT3 tokenizer + private static readonly TextChunker.TokenCounter s_tokenCounter = s => (s.Length >> 2); + + [Fact] + [Trait("Category", "UnitTest")] + public void CanSplitPlainTextLines() + { + const string Input = "This is a test of the emergency broadcast system. This is only a test."; + var expected = new[] + { + "This is a test of the emergency broadcast system.", + "This is only a test." + }; + + var result = TextChunker2.SplitPlainTextLines(Input, tag: null, 15, tokenCounter: s_tokenCounter); + + Assert.Equal(expected, result.Select(o => o.Content).ToArray()); + } + + [Fact] + [Trait("Category", "UnitTest")] + public void CanSplitMarkdownParagraphs() + { + List input = new() + { + new TextChunker2.ChunkInfo("This is a test of the emergency broadcast system. This is only a test.", 1), + new TextChunker2.ChunkInfo("We repeat, this is only a test. A unit test.", 2) + }; + var expected = new[] + { + "This is a test of the emergency broadcast system.", + "This is only a test.", + "We repeat, this is only a test. A unit test." + }; + + var expectedTag = new[] + { + 1, + 1, + 2 + }; + + var result = TextChunker2.SplitMarkdownParagraphs(input, 13, tokenCounter: s_tokenCounter); + + Assert.Equal(expected, result.Select(o => o.Content).ToArray()); + Assert.Equal(expectedTag, result.Select(o => o.Tag).Cast().ToArray()); + } + + [Fact] + [Trait("Category", "UnitTest")] + public void CanSplitMarkdownParagraphsWithOverlap() + { + List input = new() + { + new TextChunker2.ChunkInfo("This is a test of the emergency broadcast system. 
This is only a test.", 1), + new TextChunker2.ChunkInfo("We repeat, this is only a test. A unit test.", 2) + }; + + var expected = new[] + { + "This is a test of the emergency broadcast system.", + "emergency broadcast system. This is only a test.", + "This is only a test. We repeat, this is only a test.", + "We repeat, this is only a test. A unit test.", + "A unit test." + }; + + var expectedTag = new[] + { + 1, + 1, + 1, + 2, + 2 + }; + + var result = TextChunker2.SplitMarkdownParagraphs(input, 15, 8, tokenCounter: s_tokenCounter); + + Assert.Equal(expected, result.Select(o => o.Content).ToArray()); + Assert.Equal(expectedTag, result.Select(o => o.Tag).Cast().ToArray()); + } + + [Fact] + [Trait("Category", "UnitTest")] + public void CanSplitTextParagraphs() + { + List input = new() + { + new TextChunker2.ChunkInfo("This is a test of the emergency broadcast system. This is only a test.", 1), + new TextChunker2.ChunkInfo("We repeat, this is only a test. A unit test.", 2) + }; + + var expected = new[] + { + "This is a test of the emergency broadcast system.", + "This is only a test.", + "We repeat, this is only a test. A unit test." + }; + + var expectedTag = new[] + { + 1, + 1, + 2 + }; + + var result = TextChunker2.SplitPlainTextParagraphs(input, 13, tokenCounter: s_tokenCounter); + + Assert.Equal(expected, result.Select(o => o.Content).ToArray()); + Assert.Equal(expectedTag, result.Select(o => o.Tag).Cast().ToArray()); + } + + [Fact] + [Trait("Category", "UnitTest")] + public void CanSplitTextParagraphsWithOverlap() + { + List input = new() + { + new TextChunker2.ChunkInfo("This is a test of the emergency broadcast system. This is only a test.", 1), + new TextChunker2.ChunkInfo("We repeat, this is only a test. A unit test.", 2) + }; + + var expected = new[] + { + "This is a test of the emergency broadcast system.", + "emergency broadcast system. This is only a test.", + "This is only a test. We repeat, this is only a test.", + "We repeat, this is only a test. 
A unit test.", + "A unit test." + }; + + var expectedTag = new[] + { + 1, + 1, + 1, + 2, + 2 + }; + + var result = TextChunker2.SplitPlainTextParagraphs(input, 15, 8, tokenCounter: s_tokenCounter); + + Assert.Equal(expected, result.Select(o => o.Content).ToArray()); + Assert.Equal(expectedTag, result.Select(o => o.Tag).Cast().ToArray()); + } + + [Fact] + [Trait("Category", "UnitTest")] + public void CanSplitMarkDownLines() + { + const string Input = "This is a test of the emergency broadcast system. This is only a test."; + var expected = new[] + { + "This is a test of the emergency broadcast system.", + "This is only a test." + }; + + var result = TextChunker2.SplitMarkDownLines(Input, tag: 42, maxTokensPerLine: 15, tokenCounter: s_tokenCounter); + + Assert.Equal(expected, result.Select(o => o.Content).ToArray()); + Assert.All(result, c => c.Tag?.Equals(42)); + } + + [Fact] + [Trait("Category", "UnitTest")] + public void CanSplitTextParagraphsWithEmptyInput() + { + List input = new(); + + var result = TextChunker2.SplitPlainTextParagraphs(input, 13, tokenCounter: s_tokenCounter); + + Assert.Empty(result); + } + + [Fact] + [Trait("Category", "UnitTest")] + public void CanSplitMarkdownParagraphsWithEmptyInput() + { + List input = new(); + + var result = TextChunker2.SplitMarkdownParagraphs(input, 13, tokenCounter: s_tokenCounter); + + Assert.Empty(result); + } + + private List ConvertToChunkInput(List input) + { + var result = new List(); + for (int i = 0; i < input.Count; i++) + { + result.Add(new TextChunker2.ChunkInfo(input[i], i + 1)); + } + return result; + } + + [Fact] + [Trait("Category", "UnitTest")] + public void CanSplitTextParagraphsEvenly() + { + List input = new() + { + "This is a test of the emergency broadcast system. This is only a test.", + "We repeat, this is only a test. A unit test.", + "A small note. And another. And once again. Seriously, this is the end. We're finished. All set. Bye.", + "Done." 
+ }; + + var expected = new[] + { + "This is a test of the emergency broadcast system.", + "This is only a test.", + "We repeat, this is only a test. A unit test.", + "A small note. And another. And once again.", + "Seriously, this is the end. We're finished. All set. Bye. Done." + }; + + var result = TextChunker2.SplitPlainTextParagraphs(this.ConvertToChunkInput(input), 15, tokenCounter: s_tokenCounter); + + Assert.Equal(expected, result.Select(o => o.Content).ToArray()); + Assert.Equal([1, 1, 2, 3, 3], result.Select(o => o.Tag).Cast()); + } + + // a plaintext example that splits on \r or \n + [Fact] + [Trait("Category", "UnitTest")] + public void CanSplitTextParagraphsOnNewlines() + { + List input = new() + { + "This is a test of the emergency broadcast system\r\nThis is only a test", + "We repeat this is only a test\nA unit test", + "A small note\nAnd another\r\nAnd once again\rSeriously this is the end\nWe're finished\nAll set\nBye\n", + "Done" + }; + + var expected = new[] + { + "This is a test of the emergency broadcast system", + "This is only a test", + "We repeat this is only a test\nA unit test", + "A small note\nAnd another\nAnd once again", + "Seriously this is the end\nWe're finished\nAll set\nBye Done", + }; + + var result = TextChunker2.SplitPlainTextParagraphs(this.ConvertToChunkInput(input), 15, tokenCounter: s_tokenCounter); + + Assert.Equal(expected, result.Select(o => o.Content).ToArray()); + Assert.Equal([1, 1, 2, 3, 3], result.Select(o => o.Tag).Cast()); + } + + // a plaintext example that splits on ? or ! + [Fact] + [Trait("Category", "UnitTest")] + public void CanSplitTextParagraphsOnPunctuation() + { + List input = new() + { + "This is a test of the emergency broadcast system. This is only a test", + "We repeat, this is only a test? A unit test", + "A small note! And another? And once again! Seriously, this is the end. We're finished. All set. Bye.", + "Done." 
+ }; + + var expected = new[] + { + "This is a test of the emergency broadcast system.", + "This is only a test", + "We repeat, this is only a test? A unit test", + "A small note! And another? And once again!", + "Seriously, this is the end.", + $"We're finished. All set. Bye.{Environment.NewLine}Done.", + }; + + var result = TextChunker2.SplitPlainTextParagraphs(this.ConvertToChunkInput(input), 15, tokenCounter: s_tokenCounter); + + Assert.Equal(expected, result.Select(o => o.Content).ToArray()); + Assert.Equal([1, 1, 2, 3, 3, 3], result.Select(o => o.Tag).Cast()); + } + + // a plaintext example that splits on ; + [Fact] + [Trait("Category", "UnitTest")] + public void CanSplitTextParagraphsOnSemicolons() + { + List input = new() + { + "This is a test of the emergency broadcast system; This is only a test", + "We repeat; this is only a test; A unit test", + "A small note; And another; And once again; Seriously, this is the end; We're finished; All set; Bye.", + "Done." + }; + + var expected = new[] + { + "This is a test of the emergency broadcast system;", + "This is only a test", + "We repeat; this is only a test; A unit test", + "A small note; And another; And once again;", + "Seriously, this is the end; We're finished; All set; Bye. Done.", + }; + + var result = TextChunker2.SplitPlainTextParagraphs(this.ConvertToChunkInput(input), 15, tokenCounter: s_tokenCounter); + + Assert.Equal(expected, result.Select(o => o.Content).ToArray()); + Assert.Equal([1, 1, 2, 3, 3], result.Select(o => o.Tag).Cast()); + } + + // a plaintext example that splits on : + [Fact] + [Trait("Category", "UnitTest")] + public void CanSplitTextParagraphsOnColons() + { + List input = new() + { + "This is a test of the emergency broadcast system: This is only a test", + "We repeat: this is only a test: A unit test", + "A small note: And another: And once again: Seriously, this is the end: We're finished: All set: Bye.", + "Done." 
+ }; + + var expected = new[] + { + "This is a test of the emergency broadcast system:", + "This is only a test", + "We repeat: this is only a test: A unit test", + "A small note: And another: And once again:", + "Seriously, this is the end: We're finished: All set: Bye. Done.", + }; + + var result = TextChunker2.SplitPlainTextParagraphs(this.ConvertToChunkInput(input), 15, tokenCounter: s_tokenCounter); + + Assert.Equal(expected, result.Select(o => o.Content).ToArray()); + Assert.Equal([1, 1, 2, 3, 3], result.Select(o => o.Tag).Cast()); + } + + // a plaintext example that splits on , + [Fact] + [Trait("Category", "UnitTest")] + public void CanSplitTextParagraphsOnCommas() + { + List input = new() + { + "This is a test of the emergency broadcast system, This is only a test", + "We repeat, this is only a test, A unit test", + "A small note, And another, And once again, Seriously, this is the end, We're finished, All set, Bye.", + "Done." + }; + + var expected = new[] + { + "This is a test of the emergency broadcast system,", + "This is only a test", + "We repeat, this is only a test, A unit test", + "A small note, And another, And once again, Seriously,", + $"this is the end, We're finished, All set, Bye.{Environment.NewLine}Done.", + }; + + var result = TextChunker2.SplitPlainTextParagraphs(this.ConvertToChunkInput(input), 15, tokenCounter: s_tokenCounter); + + Assert.Equal(expected, result.Select(o => o.Content).ToArray()); + Assert.Equal([1, 1, 2, 3, 3], result.Select(o => o.Tag).Cast()); + } + + // a plaintext example that splits on ) or ] or } + [Fact] + [Trait("Category", "UnitTest")] + public void CanSplitTextParagraphsOnClosingBrackets() + { + List input = new() + { + "This is a test of the emergency broadcast system) This is only a test", + "We repeat) this is only a test) A unit test", + "A small note] And another) And once again] Seriously this is the end} We're finished} All set} Bye.", + "Done." 
+ }; + + var expected = new[] + { + "This is a test of the emergency broadcast system)", + "This is only a test", + "We repeat) this is only a test) A unit test", + "A small note] And another) And once again]", + "Seriously this is the end} We're finished} All set} Bye. Done.", + }; + + var result = TextChunker2.SplitPlainTextParagraphs(this.ConvertToChunkInput(input), 15, tokenCounter: s_tokenCounter); + + Assert.Equal(expected, result.Select(o => o.Content).ToArray()); + Assert.Equal([1, 1, 2, 3, 3], result.Select(o => o.Tag).Cast()); + } + + // a plaintext example that splits on ' ' + [Fact] + [Trait("Category", "UnitTest")] + public void CanSplitTextParagraphsOnSpaces() + { + List input = new() + { + "This is a test of the emergency broadcast system This is only a test", + "We repeat this is only a test A unit test", + "A small note And another And once again Seriously this is the end We're finished All set Bye.", + "Done." + }; + + var expected = new[] + { + "This is a test of the emergency", + "broadcast system This is only a test", + "We repeat this is only a test A unit test", + "A small note And another And once again Seriously", + $"this is the end We're finished All set Bye.{Environment.NewLine}Done.", + }; + + var result = TextChunker2.SplitPlainTextParagraphs(this.ConvertToChunkInput(input), 15, tokenCounter: s_tokenCounter); + + Assert.Equal(expected, result.Select(o => o.Content).ToArray()); + Assert.Equal([1, 1, 2, 3, 3], result.Select(o => o.Tag).Cast()); + } + + // a plaintext example that splits on '-' + [Fact] + [Trait("Category", "UnitTest")] + public void CanSplitTextParagraphsOnHyphens() + { + List input = new() + { + "This is a test of the emergency broadcast system-This is only a test", + "We repeat-this is only a test-A unit test", + "A small note-And another-And once again-Seriously, this is the end-We're finished-All set-Bye.", + "Done." 
+ }; + + var expected = new[] + { + "This is a test of the emergency", + "broadcast system-This is only a test", + "We repeat-this is only a test-A unit test", + "A small note-And another-And once again-Seriously,", + $"this is the end-We're finished-All set-Bye.{Environment.NewLine}Done.", + }; + + var result = TextChunker2.SplitPlainTextParagraphs(this.ConvertToChunkInput(input), 15, tokenCounter: s_tokenCounter); + + Assert.Equal(expected, result.Select(o => o.Content).ToArray()); + Assert.Equal([1, 1, 2, 3, 3], result.Select(o => o.Tag).Cast()); + } + + // a plaintext example that does not have any of the above characters + [Fact] + [Trait("Category", "UnitTest")] + public void CanSplitTextParagraphsWithNoDelimiters() + { + List input = new() + { + "Thisisatestoftheemergencybroadcastsystem", + "Thisisonlyatest", + "WerepeatthisisonlyatestAunittest", + "AsmallnoteAndanotherAndonceagain", + "SeriouslythisistheendWe'refinishedAllsetByeDoneThisOneWillBeSplitToMeetTheLimit", + }; + + var expected = new[] + { + $"Thisisatestoftheemergencybroadcastsystem{Environment.NewLine}Thisisonlyatest", + "WerepeatthisisonlyatestAunittest", + "AsmallnoteAndanotherAndonceagain", + "SeriouslythisistheendWe'refinishedAllse", + "tByeDoneThisOneWillBeSplitToMeetTheLimit", + }; + + var result = TextChunker2.SplitPlainTextParagraphs(this.ConvertToChunkInput(input), 15, tokenCounter: s_tokenCounter); + + Assert.Equal(expected, result.Select(o => o.Content).ToArray()); + Assert.Equal([1, 3, 4, 5, 5], result.Select(o => o.Tag).Cast()); + } + + // a markdown example that splits on . + + // a markdown example that splits on ? or ! 
+ + // a markdown example that splits on ; + + // a markdown example that splits on : + + // a markdown example that splits on , + + // a markdown example that splits on ) or ] or } + + // a markdown example that splits on ' ' + + // a markdown example that splits on '-' + + // a markdown example that splits on '\r' or '\n' + [Fact] + [Trait("Category", "UnitTest")] + public void CanSplitMarkdownParagraphsOnNewlines() + { + List input = new() + { + "This_is_a_test_of_the_emergency_broadcast_system\r\nThis_is_only_a_test", + "We_repeat_this_is_only_a_test\nA_unit_test", + "A_small_note\nAnd_another\r\nAnd_once_again\rSeriously_this_is_the_end\nWe're_finished\nAll_set\nBye\n", + "Done" + }; + + var expected = new[] + { + "This_is_a_test_of_the_emergency_broadcast_system", + "This_is_only_a_test", + "We_repeat_this_is_only_a_test\nA_unit_test", + "A_small_note\nAnd_another\nAnd_once_again", + "Seriously_this_is_the_end\nWe're_finished\nAll_set\nBye Done", + }; + + var result = TextChunker2.SplitMarkdownParagraphs(this.ConvertToChunkInput(input), 15, tokenCounter: s_tokenCounter); + + Assert.Equal(expected, result.Select(o => o.Content).ToArray()); + Assert.Equal([1, 1, 2, 3, 3], result.Select(o => o.Tag).Cast()); + } + + // a markdown example that does not have any of the above characters + [Fact] + [Trait("Category", "UnitTest")] + public void CanSplitVeryLargeDocumentsWithoutStackOverflowing() + { +#pragma warning disable CA5394 // this test relies on repeatable pseudo-random numbers + var rand = new Random(42); + var sb = new StringBuilder(100_000 * 11); + for (int wordNum = 0; wordNum < 100_000; wordNum++) + { + int wordLength = rand.Next(1, 10); + for (int charNum = 0; charNum < wordLength; charNum++) + { + sb.Append((char)('a' + rand.Next(0, 26))); + } + + sb.Append(' '); + } + + string text = sb.ToString(); + List lines = TextChunker2.SplitPlainTextLines(text, tag: 42, 20, tokenCounter: s_tokenCounter); + var paragraphs = 
TextChunker2.SplitPlainTextParagraphs(lines, 200, tokenCounter: s_tokenCounter); + Assert.NotEmpty(paragraphs); +#pragma warning restore CA5394 + } + + [Fact] + [Trait("Category", "UnitTest")] + public void CanSplitPlainTextLinesWithCustomTokenCounter() + { + const string input = "This is a test of the emergency broadcast system. This is only a test."; + var expected = new[] + { + "This is a test of the emergency broadcast system.", + "This is only a test." + }; + + var result = TextChunker2.SplitPlainTextLines(input, tag: 42, 60, s => s.Length); + + Assert.Equal(expected, result.Select(o => o.Content).ToArray()); + Assert.Equal([42, 42], result.Select(o => o.Tag).Cast()); + } + + [Fact] + [Trait("Category", "UnitTest")] + public void CanSplitMarkdownParagraphsWithCustomTokenCounter() + { + List input = new() + { + "This is a test of the emergency broadcast system. This is only a test.", + "We repeat, this is only a test. A unit test." + }; + var expected = new[] + { + "This is a test of the emergency broadcast system.", + "This is only a test.", + "We repeat, this is only a test. A unit test." + }; + + var result = TextChunker2.SplitMarkdownParagraphs(this.ConvertToChunkInput(input), 52, tokenCounter: s => s.Length); + + Assert.Equal(expected, result.Select(o => o.Content).ToArray()); + Assert.Equal([1, 1, 2], result.Select(o => o.Tag).Cast()); + } + + [Fact] + [Trait("Category", "UnitTest")] + public void CanSplitMarkdownParagraphsWithOverlapAndCustomTokenCounter() + { + List input = new() + { + "This is a test of the emergency broadcast system. This is only a test.", + "We repeat, this is only a test. A unit test." + }; + + var expected = new[] + { + "This is a test of the emergency broadcast system.", + "emergency broadcast system. This is only a test.", + "This is only a test. We repeat, this is only a test.", + "We repeat, this is only a test. A unit test.", + "A unit test." 
+ }; + + var result = TextChunker2.SplitMarkdownParagraphs(this.ConvertToChunkInput(input), 75, 40, tokenCounter: s => s.Length); + + Assert.Equal(expected, result.Select(o => o.Content).ToArray()); + Assert.Equal([1, 1, 1, 2, 2], result.Select(o => o.Tag).Cast()); + } + + [Fact] + [Trait("Category", "UnitTest")] + public void CanSplitTextParagraphsWithCustomTokenCounter() + { + List input = new() + { + "This is a test of the emergency broadcast system. This is only a test.", + "We repeat, this is only a test. A unit test." + }; + + var expected = new[] + { + "This is a test of the emergency broadcast system.", + "This is only a test.", + "We repeat, this is only a test. A unit test." + }; + + var result = TextChunker2.SplitPlainTextParagraphs(this.ConvertToChunkInput(input), 52, tokenCounter: s => s.Length); + + Assert.Equal(expected, result.Select(o => o.Content).ToArray()); + Assert.Equal([1, 1, 2], result.Select(o => o.Tag).Cast()); + } + + [Fact] + [Trait("Category", "UnitTest")] + public void CanSplitTextParagraphsWithOverlapAndCustomTokenCounter() + { + List input = new() + { + "This is a test of the emergency broadcast system. This is only a test.", + "We repeat, this is only a test. A unit test." + }; + + var expected = new[] + { + "This is a test of the emergency broadcast system.", + "emergency broadcast system. This is only a test.", + "This is only a test. We repeat, this is only a test.", + "We repeat, this is only a test. A unit test.", + "A unit test." + }; + + var result = TextChunker2.SplitPlainTextParagraphs(this.ConvertToChunkInput(input), 75, 40, tokenCounter: s => s.Length); + + Assert.Equal(expected, result.Select(o => o.Content).ToArray()); + Assert.Equal([1, 1, 1, 2, 2], result.Select(o => o.Tag).Cast()); + } + + [Fact] + [Trait("Category", "UnitTest")] + public void CanSplitMarkDownLinesWithCustomTokenCounter() + { + const string input = "This is a test of the emergency broadcast system. 
This is only a test."; + var expected = new[] + { + "This is a test of the emergency broadcast system.", + "This is only a test." + }; + + var result = TextChunker2.SplitMarkDownLines(input, tag: 42, 60, tokenCounter: s => s.Length); + + Assert.Equal(expected, result.Select(o => o.Content).ToArray()); + Assert.Equal([42, 42], result.Select(o => o.Tag).Cast()); + } + + [Fact] + [Trait("Category", "UnitTest")] + public void CanSplitMarkdownParagraphsWithHeader() + { + const string ChunkHeader = "DOCUMENT NAME: test.txt\n\n"; + List input = new() + { + "This is a test of the emergency broadcast system. This is only a test.", + "We repeat, this is only a test. A unit test." + }; + var expected = new[] + { + $"{ChunkHeader}This is a test of the emergency broadcast system.", + $"{ChunkHeader}This is only a test.", + $"{ChunkHeader}We repeat, this is only a test. A unit test." + }; + + var result = TextChunker2.SplitMarkdownParagraphs(this.ConvertToChunkInput(input), 20, chunkHeader: ChunkHeader, tokenCounter: s_tokenCounter); + + Assert.Equal(expected, result.Select(o => o.Content).ToArray()); + Assert.Equal([1, 1, 2], result.Select(o => o.Tag).Cast()); + } + + [Fact] + [Trait("Category", "UnitTest")] + public void CanSplitMarkdownParagraphsWithOverlapAndHeader() + { + const string ChunkHeader = "DOCUMENT NAME: test.txt\n\n"; + List input = new() + { + "This is a test of the emergency broadcast system. This is only a test.", + "We repeat, this is only a test. A unit test." + }; + + var expected = new[] + { + $"{ChunkHeader}This is a test of the emergency broadcast system.", + $"{ChunkHeader}emergency broadcast system. This is only a test.", + $"{ChunkHeader}This is only a test. We repeat, this is only a test.", + $"{ChunkHeader}We repeat, this is only a test. A unit test.", + $"{ChunkHeader}A unit test." 
+ }; + + var result = TextChunker2.SplitMarkdownParagraphs(this.ConvertToChunkInput(input), 22, 8, chunkHeader: ChunkHeader, tokenCounter: s_tokenCounter); + + Assert.Equal(expected, result.Select(o => o.Content).ToArray()); + Assert.Equal([1, 1, 1, 2, 2], result.Select(o => o.Tag).Cast()); + } + + [Fact] + [Trait("Category", "UnitTest")] + public void CanSplitTextParagraphsWithHeader() + { + const string ChunkHeader = "DOCUMENT NAME: test.txt\n\n"; + List input = new() + { + "This is a test of the emergency broadcast system. This is only a test.", + "We repeat, this is only a test. A unit test." + }; + + var expected = new[] + { + $"{ChunkHeader}This is a test of the emergency broadcast system.", + $"{ChunkHeader}This is only a test.", + $"{ChunkHeader}We repeat, this is only a test. A unit test." + }; + + var result = TextChunker2.SplitPlainTextParagraphs(this.ConvertToChunkInput(input), 20, chunkHeader: ChunkHeader, tokenCounter: s_tokenCounter); + + Assert.Equal(expected, result.Select(o => o.Content).ToArray()); + Assert.Equal([1, 1, 2], result.Select(o => o.Tag).Cast()); + } + + [Fact] + [Trait("Category", "UnitTest")] + public void CanSplitTextParagraphsWithOverlapAndHeader() + { + const string ChunkHeader = "DOCUMENT NAME: test.txt\n\n"; + List input = new() + { + "This is a test of the emergency broadcast system. This is only a test.", + "We repeat, this is only a test. A unit test." + }; + + var expected = new[] + { + $"{ChunkHeader}This is a test of the emergency broadcast system.", + $"{ChunkHeader}emergency broadcast system. This is only a test.", + $"{ChunkHeader}This is only a test. We repeat, this is only a test.", + $"{ChunkHeader}We repeat, this is only a test. A unit test.", + $"{ChunkHeader}A unit test." 
+ }; + + var result = TextChunker2.SplitPlainTextParagraphs(this.ConvertToChunkInput(input), 22, 8, chunkHeader: ChunkHeader, tokenCounter: s_tokenCounter); + + Assert.Equal(expected, result.Select(o => o.Content).ToArray()); + Assert.Equal([1, 1, 1, 2, 2], result.Select(o => o.Tag).Cast()); + } + + [Fact] + [Trait("Category", "UnitTest")] + public void CanSplitMarkdownParagraphsWithHeaderAndCustomTokenCounter() + { + const string ChunkHeader = "DOCUMENT NAME: test.txt\n\n"; + List input = new() + { + "This is a test of the emergency broadcast system. This is only a test.", + "We repeat, this is only a test. A unit test." + }; + var expected = new[] + { + $"{ChunkHeader}This is a test of the emergency broadcast system.", + $"{ChunkHeader}This is only a test.", + $"{ChunkHeader}We repeat, this is only a test. A unit test." + }; + + var result = TextChunker2.SplitMarkdownParagraphs(this.ConvertToChunkInput(input), 77, chunkHeader: ChunkHeader, tokenCounter: s => s.Length); + + Assert.Equal(expected, result.Select(o => o.Content).ToArray()); + Assert.Equal([1, 1, 2], result.Select(o => o.Tag).Cast()); + } + + [Fact] + [Trait("Category", "UnitTest")] + public void CanSplitMarkdownParagraphsWithOverlapAndHeaderAndCustomTokenCounter() + { + const string ChunkHeader = "DOCUMENT NAME: test.txt\n\n"; + List input = new() + { + "This is a test of the emergency broadcast system. This is only a test.", + "We repeat, this is only a test. A unit test." + }; + + var expected = new[] + { + $"{ChunkHeader}This is a test of the emergency broadcast system.", + $"{ChunkHeader}emergency broadcast system. This is only a test.", + $"{ChunkHeader}This is only a test. We repeat, this is only a test.", + $"{ChunkHeader}We repeat, this is only a test. A unit test.", + $"{ChunkHeader}A unit test." 
+ }; + + var result = TextChunker2.SplitMarkdownParagraphs(this.ConvertToChunkInput(input), 100, 40, chunkHeader: ChunkHeader, tokenCounter: s => s.Length); + + Assert.Equal(expected, result.Select(o => o.Content).ToArray()); + Assert.Equal([1, 1, 1, 2, 2], result.Select(o => o.Tag).Cast()); + } + + [Fact] + [Trait("Category", "UnitTest")] + public void CanSplitTextParagraphsWithHeaderAndCustomTokenCounter() + { + const string ChunkHeader = "DOCUMENT NAME: test.txt\n\n"; + List input = new() + { + "This is a test of the emergency broadcast system. This is only a test.", + "We repeat, this is only a test. A unit test." + }; + + var expected = new[] + { + $"{ChunkHeader}This is a test of the emergency broadcast system.", + $"{ChunkHeader}This is only a test.", + $"{ChunkHeader}We repeat, this is only a test. A unit test." + }; + + var result = TextChunker2.SplitPlainTextParagraphs(this.ConvertToChunkInput(input), 77, chunkHeader: ChunkHeader, tokenCounter: s => s.Length); + + Assert.Equal(expected, result.Select(o => o.Content).ToArray()); + Assert.Equal([1, 1, 2], result.Select(o => o.Tag).Cast()); + } + + [Fact] + [Trait("Category", "UnitTest")] + public void CanSplitTextParagraphsWithOverlapAndHeaderAndCustomTokenCounter() + { + const string ChunkHeader = "DOCUMENT NAME: test.txt\n\n"; + List input = new() + { + "This is a test of the emergency broadcast system. This is only a test.", + "We repeat, this is only a test. A unit test." + }; + + var expected = new[] + { + $"{ChunkHeader}This is a test of the emergency broadcast system.", + $"{ChunkHeader}emergency broadcast system. This is only a test.", + $"{ChunkHeader}This is only a test. We repeat, this is only a test.", + $"{ChunkHeader}We repeat, this is only a test. A unit test.", + $"{ChunkHeader}A unit test." 
+ }; + + var result = TextChunker2.SplitPlainTextParagraphs(this.ConvertToChunkInput(input), 100, 40, chunkHeader: ChunkHeader, tokenCounter: s => s.Length); + + Assert.Equal(expected, result.Select(o => o.Content).ToArray()); + Assert.Equal([1, 1, 1, 2, 2], result.Select(o => o.Tag).Cast()); + } +}