diff --git a/backend/app/parsing.py b/backend/app/parsing.py
index b13c4e61..9e47d9b7 100644
--- a/backend/app/parsing.py
+++ b/backend/app/parsing.py
@@ -1,12 +1,39 @@
 """Module contains logic for parsing binary blobs into text."""
+from typing import Iterator
+
 from langchain_community.document_loaders.parsers import (
     BS4HTMLParser,
-    MarkdownParser,
     PDFMinerParser,
 )
+from langchain.text_splitter import MarkdownHeaderTextSplitter
 from langchain_community.document_loaders.parsers.generic import MimeTypeBasedParser
 from langchain_community.document_loaders.parsers.msword import MsWordParser
 from langchain_community.document_loaders.parsers.txt import TextParser
+from langchain_core.document_loaders import BaseBlobParser, Blob
+from langchain_core.documents import Document
+
+
+class MarkdownParser(BaseBlobParser):
+    """Parser for Markdown blobs that splits the text on its headings.
+
+    Splitting is delegated to ``MarkdownHeaderTextSplitter``, configured
+    with all six heading levels ("#" through "######").
+    """
+
+    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
+        """Lazily yield the documents produced by splitting the blob's text."""
+        # One entry per heading level: N "#" characters are named "Header N".
+        headers_to_split_on = [
+            ("#", "Header 1"),
+            ("##", "Header 2"),
+            ("###", "Header 3"),
+            ("####", "Header 4"),
+            ("#####", "Header 5"),
+            ("######", "Header 6"),
+        ]
+        splitter = MarkdownHeaderTextSplitter(headers_to_split_on)
+        yield from splitter.split_text(blob.as_string())
+
 
 HANDLERS = {
     "application/pdf": PDFMinerParser(),
diff --git a/backend/tests/unit_tests/fixtures/sample.md b/backend/tests/unit_tests/fixtures/sample.md
index 77128cfd..884d1821 100644
--- a/backend/tests/unit_tests/fixtures/sample.md
+++ b/backend/tests/unit_tests/fixtures/sample.md
@@ -2,6 +2,12 @@
 
 ## Heading 2
 
+Some text for heading 2.
+
 ### Heading 3
 
+Some text for heading 3.
+
 #### Heading 4
+
+Some text for heading 4.
\ No newline at end of file