Skip to content

Commit

Permalink
Use markdown splitter for .md files.
Browse files Browse the repository at this point in the history
  • Loading branch information
P. Taylor Goetz committed May 25, 2024
1 parent 5b55ca3 commit 0e5ccb9
Show file tree
Hide file tree
Showing 2 changed files with 27 additions and 1 deletion.
22 changes: 21 additions & 1 deletion backend/app/parsing.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,32 @@
"""Module contains logic for parsing binary blobs into text."""
from typing import Iterator

from langchain.text_splitter import MarkdownHeaderTextSplitter
from langchain_community.document_loaders.parsers import (
    BS4HTMLParser,
    MarkdownParser,
    PDFMinerParser,
)
from langchain_community.document_loaders.parsers.generic import MimeTypeBasedParser
from langchain_community.document_loaders.parsers.msword import MsWordParser
from langchain_community.document_loaders.parsers.txt import TextParser
from langchain_core.document_loaders import BaseBlobParser, Blob
from langchain_core.documents import Document


class MarkdownParser(BaseBlobParser):
    """Parser for Markdown blobs that splits their text on Markdown headers.

    NOTE(review): this class has the same name as the ``MarkdownParser``
    imported at the top of the module, so from this definition onward the
    module-level name refers to this class.
    """

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:  # type: ignore[valid-type]
        """Lazily parse the blob.

        Args:
            blob: The Markdown blob; its content is read via ``as_string()``.

        Yields:
            The documents returned by ``MarkdownHeaderTextSplitter.split_text``
            for the blob's text.
        """
        # Markdown headings have six levels ("#" .. "######"). Generating the
        # (marker, label) pairs keeps each marker paired with its own level
        # and avoids hand-typed entries drifting out of sync.
        headers_to_split_on = [
            ("#" * level, f"Header {level}") for level in range(1, 7)
        ]
        splitter = MarkdownHeaderTextSplitter(headers_to_split_on)
        yield from splitter.split_text(blob.as_string())


HANDLERS = {
"application/pdf": PDFMinerParser(),
Expand Down
6 changes: 6 additions & 0 deletions backend/tests/unit_tests/fixtures/sample.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,12 @@

## Heading 2

Some text for heading 2.

### Heading 3

Some text for heading 3.

#### Heading 4

Some text for heading 4.

0 comments on commit 0e5ccb9

Please sign in to comment.