Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allow Markdown for RAG file types. #342

Closed
wants to merge 11 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion backend/app/message_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ class LiberalToolMessage(ToolMessage):


def _convert_pydantic_dict_to_message(
data: MessageLikeRepresentation
data: MessageLikeRepresentation,
) -> MessageLikeRepresentation:
if (
isinstance(data, dict)
Expand Down
31 changes: 30 additions & 1 deletion backend/app/parsing.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,41 @@
"""Module contains logic for parsing binary blobs into text."""
from langchain_community.document_loaders.parsers import BS4HTMLParser, PDFMinerParser
from typing import Iterator

from langchain.text_splitter import MarkdownHeaderTextSplitter
from langchain_community.document_loaders.parsers import (
BS4HTMLParser,
PDFMinerParser,
)
from langchain_community.document_loaders.parsers.generic import MimeTypeBasedParser
from langchain_community.document_loaders.parsers.msword import MsWordParser
from langchain_community.document_loaders.parsers.txt import TextParser
from langchain_core.document_loaders import BaseBlobParser
from langchain_core.document_loaders.blob_loaders import Blob
from langchain_core.documents import Document


class MarkdownParser(BaseBlobParser):
    """Parser for Markdown blobs.

    Splits a Markdown document into one Document per section, using the
    ATX heading markers (``#`` through ``######``) as section boundaries.
    The matched heading text is recorded in each Document's metadata under
    the corresponding "Header N" key by MarkdownHeaderTextSplitter.
    """

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:  # type: ignore[valid-type]
        """Lazily parse the blob, yielding one Document per Markdown section."""
        # Map each ATX heading level to its metadata key. Fixed: the original
        # table had a typo ("###$" for level 4) which shifted levels 4-6 off
        # by one and never matched "######" headings at all.
        headers_to_split_on = [
            ("#", "Header 1"),
            ("##", "Header 2"),
            ("###", "Header 3"),
            ("####", "Header 4"),
            ("#####", "Header 5"),
            ("######", "Header 6"),
        ]
        splitter = MarkdownHeaderTextSplitter(headers_to_split_on)
        # split_text returns a list of Documents; yield them lazily to
        # satisfy the BaseBlobParser.lazy_parse contract.
        yield from splitter.split_text(blob.as_string())


HANDLERS = {
"application/pdf": PDFMinerParser(),
"text/plain": TextParser(),
"text/markdown": MarkdownParser(),
"text/html": BS4HTMLParser(),
"application/msword": MsWordParser(),
"application/vnd.openxmlformats-officedocument.wordprocessingml.document": (
Expand Down
3,320 changes: 1,768 additions & 1,552 deletions backend/poetry.lock

Large diffs are not rendered by default.

13 changes: 7 additions & 6 deletions backend/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,11 @@ fastapi = "^0.103.2"
# langchain = { git = "git@github.com:langchain-ai/langchain.git/", branch = "nc/subclass-runnable-binding" , subdirectory = "libs/langchain"}
orjson = "^3.9.10"
python-multipart = "^0.0.6"
tiktoken = "^0.5.1"
langchain = ">=0.0.338"
langgraph = "^0.0.38"
tiktoken = "^0.7.0"
langchain = ">=0.2"
langgraph = "^0.0.55"
pydantic = "<2.0"
langchain-openai = "^0.1.3"
langchain-openai = "^0.1.7"
beautifulsoup4 = "^4.12.3"
boto3 = "^1.34.28"
duckduckgo-search = "^5.3.0"
Expand All @@ -32,18 +32,19 @@ wikipedia = "^1.4.0"
langchain-google-vertexai = "^1.0.1"
setuptools = "^69.0.3"
pdfminer-six = "^20231228"
langchain-robocorp = "^0.0.5"
langchain-robocorp = "^0.0.8"
fireworks-ai = "^0.11.2"
httpx = { version = "0.25.2", extras = ["socks"] }
unstructured = {extras = ["doc", "docx"], version = "^0.12.5"}
pgvector = "^0.2.5"
psycopg2-binary = "^2.9.9"
asyncpg = "^0.29.0"
langchain-core = "^0.1.44"
langchain-core = ">0.2.0"
pyjwt = {extras = ["crypto"], version = "^2.8.0"}
langchain-anthropic = "^0.1.8"
structlog = "^24.1.0"
python-json-logger = "^2.0.7"
langchain-community = "0.2.1"

[tool.poetry.group.dev.dependencies]
uvicorn = "^0.23.2"
Expand Down
18 changes: 13 additions & 5 deletions backend/tests/unit_tests/agent_executor/test_parsing.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ def test_list_of_supported_mimetypes() -> None:
"application/pdf",
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"text/html",
"text/markdown",
"text/plain",
]

Expand All @@ -29,11 +30,18 @@ def test_attempt_to_parse_each_fixture() -> None:
blob = Blob.from_path(path)
documents = MIMETYPE_BASED_PARSER.parse(blob)
try:
assert len(documents) == 1
doc = documents[0]
assert "source" in doc.metadata
assert doc.metadata["source"] == str(path)
assert "🦜" in doc.page_content
if type_ == "text/markdown":
assert len(documents) >= 1
# doc = documents[0]
# assert "source" in doc.metadata
# assert doc.metadata["source"] == str(path)
# assert "🦜" in doc.page_content
else:
assert len(documents) == 1
doc = documents[0]
assert "source" in doc.metadata
assert doc.metadata["source"] == str(path)
assert "🦜" in doc.page_content
except Exception as e:
raise AssertionError(f"Failed to parse {path}") from e

Expand Down
4 changes: 3 additions & 1 deletion backend/tests/unit_tests/agent_executor/test_upload.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
from io import BytesIO

from langchain.text_splitter import RecursiveCharacterTextSplitter
from fastapi import UploadFile
from langchain.text_splitter import RecursiveCharacterTextSplitter

from app.upload import IngestRunnable, _guess_mimetype, convert_ingestion_input_to_blob
from tests.unit_tests.fixtures import get_sample_paths
from tests.unit_tests.utils import InMemoryVectorStore
Expand Down Expand Up @@ -46,4 +47,5 @@ def test_mimetype_guessing() -> None:
"sample.pdf": "application/pdf",
"sample.rtf": "application/rtf",
"sample.txt": "text/plain",
"sample.md": "text/markdown",
} == name_to_mime
13 changes: 13 additions & 0 deletions backend/tests/unit_tests/fixtures/sample.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# 🦜️ LangChain

## Heading 2

Some text for heading 2.

### Heading 3

Some text for heading 3.

#### Heading 4

Some text for heading 4.
Loading