Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

mimetype without libmagic #327

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ ARG TARGETARCH
ARG TARGETVARIANT

# Install system dependencies
RUN apt-get update && apt-get install -y libmagic1 && rm -rf /var/lib/apt/lists/*
RUN apt-get update && rm -rf /var/lib/apt/lists/*
RUN wget -O golang-migrate.deb https://github.com/golang-migrate/migrate/releases/download/v4.17.0/migrate.${TARGETOS}-${TARGETARCH}${TARGETVARIANT}.deb \
&& dpkg -i golang-migrate.deb \
&& rm golang-migrate.deb
Expand Down
4 changes: 0 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -109,14 +109,10 @@ pyenv activate opengpts
Once your Python environment is set up, you can install the project dependencies:

The backend service uses [poetry](https://python-poetry.org/docs/#installation) to manage dependencies.
It assumes libmagic to be [installed](https://github.com/ahupp/python-magic?tab=readme-ov-file#installation) in your
host system.

```shell
pip install poetry
pip install libmagic
pip install langchain-community
brew install libmagic
```

**Install Postgres and the Postgres Vector Extension**
Expand Down
2 changes: 1 addition & 1 deletion backend/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ ARG TARGETARCH
ARG TARGETVARIANT

# Install system dependencies
RUN apt-get update && apt-get install -y libmagic1 && rm -rf /var/lib/apt/lists/*
RUN apt-get update && rm -rf /var/lib/apt/lists/*
RUN wget -O golang-migrate.deb https://github.com/golang-migrate/migrate/releases/download/v4.17.0/migrate.${TARGETOS}-${TARGETARCH}${TARGETVARIANT}.deb \
&& dpkg -i golang-migrate.deb \
&& rm golang-migrate.deb
Expand Down
5 changes: 3 additions & 2 deletions backend/app/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from app.api import router as api_router
from app.auth.handlers import AuthedUser
from app.lifespan import lifespan
from app.upload import ingest_runnable
from app.upload import convert_ingestion_input_to_blob, ingest_runnable

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -44,7 +44,8 @@ async def ingest_files(
if thread is None:
raise HTTPException(status_code=404, detail="Thread not found.")

return ingest_runnable.batch([file.file for file in files], config)
file_blobs = [convert_ingestion_input_to_blob(file) for file in files]
return ingest_runnable.batch(file_blobs, config)


@app.get("/health")
Expand Down
62 changes: 44 additions & 18 deletions backend/app/upload.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,11 @@

from __future__ import annotations

import mimetypes
import os
from typing import BinaryIO, List, Optional

from fastapi import UploadFile
from langchain_community.vectorstores.pgvector import PGVector
from langchain_core.document_loaders.blob_loaders import Blob
from langchain_core.runnables import (
Expand All @@ -27,25 +29,52 @@
from app.parsing import MIMETYPE_BASED_PARSER


def _sniff_zip_mimetype(file_bytes: bytes) -> str:
    """Identify a ZIP-based document by the names of its archive members.

    Office Open XML files (.docx/.xlsx/.pptx) are all ZIP containers and share
    the same magic bytes, so the leading signature alone cannot tell them
    apart. The top-level directory inside the archive can.

    Args:
        file_bytes: Full content of a file that starts with a ZIP signature.

    Returns:
        The matching Office Open XML mime-type, or ``application/zip`` when
        the archive is unreadable or is not a recognised document type.
    """
    # Local imports keep this helper self-contained; both are standard library.
    import io
    import zipfile

    ooxml = "application/vnd.openxmlformats-officedocument."
    member_prefix_to_mime = (
        ("word/", ooxml + "wordprocessingml.document"),
        ("xl/", ooxml + "spreadsheetml.sheet"),
        ("ppt/", ooxml + "presentationml.presentation"),
    )

    try:
        with zipfile.ZipFile(io.BytesIO(file_bytes)) as archive:
            names = archive.namelist()
    except (zipfile.BadZipFile, OSError, ValueError):
        # Truncated or corrupt archive: best effort, report a generic ZIP.
        return "application/zip"

    for prefix, mime_type in member_prefix_to_mime:
        if any(name.startswith(prefix) for name in names):
            return mime_type
    return "application/zip"


def _guess_mimetype(file_name: str, file_bytes: bytes) -> str:
    """Guess the mime-type of a file based on its name or bytes.

    Detection runs in three stages and stops at the first match:

    1. The file extension, via ``mimetypes.guess_type``.
    2. Well-known leading byte signatures (PDF, ZIP containers, OLE2).
    3. A text heuristic on the first 1024 bytes, decoded as UTF-8.

    Args:
        file_name: Name of the uploaded file; only its extension is used.
        file_bytes: Raw content of the file.

    Returns:
        The guessed mime-type, or ``application/octet-stream`` when nothing
        matches. Empty content with an unknown extension is ``text/plain``.
    """
    # Local import: the surrounding module does not import codecs.
    import codecs

    # Stage 1: trust the extension when the mimetypes registry knows it.
    mime_type, _ = mimetypes.guess_type(file_name)
    if mime_type:
        return mime_type

    # Stage 2: signature-based detection for common binary formats.
    if file_bytes.startswith(b"%PDF"):
        return "application/pdf"
    if file_bytes.startswith((b"PK\x03\x04", b"PK\x05\x06", b"PK\x07\x08")):
        return _sniff_zip_mimetype(file_bytes)
    if file_bytes.startswith(b"\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1"):
        # OLE2 compound file. Legacy .xls/.ppt share this signature, so
        # "msword" is only the most likely answer, not a certain one.
        return "application/msword"
    if file_bytes.startswith(b"\x09\x00\xff\x00\x06\x00"):
        # NOTE(review): presumably a legacy Excel signature - confirm.
        return "application/vnd.ms-excel"

    # Stage 3: text heuristic. Decode strictly so binary data is rejected
    # instead of being silently stripped down to whatever happens to decode.
    sample = file_bytes[:1024]
    decoder = codecs.getincrementaldecoder("utf-8-sig")()
    try:
        # final=False tolerates a multi-byte character split by the 1024-byte
        # cut; when the sample is the whole file, any dangling byte is an error.
        decoded = decoder.decode(sample, final=len(file_bytes) <= len(sample))
    except UnicodeDecodeError:
        return "application/octet-stream"

    # str.isprintable() is False for newlines and tabs, so test per character
    # and allow ordinary whitespace; other control characters mean binary.
    if not all(ch.isprintable() or ch in "\t\n\r\f\v" for ch in decoded):
        return "application/octet-stream"

    # Loose CSV/TSV heuristic: several lines plus a field delimiter.
    if "\n" in decoded and ("," in decoded or "\t" in decoded):
        return "text/csv"
    return "text/plain"


def _convert_ingestion_input_to_blob(data: BinaryIO) -> Blob:
def convert_ingestion_input_to_blob(file: UploadFile) -> Blob:
"""Convert ingestion input to blob."""
file_data = data.read()
mimetype = _guess_mimetype(file_data)
file_name = data.name
file_data = file.file.read()
file_name = file.filename

# Check if file_name is a valid string
if not isinstance(file_name, str):
raise TypeError(f"Expected string for file name, got {type(file_name)}")

mimetype = _guess_mimetype(file_name, file_data)
return Blob.from_data(
data=file_data,
path=file_name,
Expand Down Expand Up @@ -104,10 +133,7 @@ def namespace(self) -> str:
)
return self.assistant_id if self.assistant_id is not None else self.thread_id

def invoke(
self, input: BinaryIO, config: Optional[RunnableConfig] = None
) -> List[str]:
blob = _convert_ingestion_input_to_blob(input)
def invoke(self, blob: Blob, config: Optional[RunnableConfig] = None) -> List[str]:
out = ingest_blob(
blob,
MIMETYPE_BASED_PARSER,
Expand Down
9 changes: 2 additions & 7 deletions backend/poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 0 additions & 1 deletion backend/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@ tiktoken = "^0.5.1"
langchain = ">=0.0.338"
langgraph = "^0.0.38"
pydantic = "<2.0"
python-magic = "^0.4.27"
langchain-openai = "^0.1.3"
beautifulsoup4 = "^4.12.3"
boto3 = "^1.34.28"
Expand Down
20 changes: 13 additions & 7 deletions backend/tests/unit_tests/agent_executor/test_upload.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
from io import BytesIO

from langchain.text_splitter import RecursiveCharacterTextSplitter

from app.upload import IngestRunnable, _guess_mimetype
from fastapi import UploadFile
from app.upload import IngestRunnable, _guess_mimetype, convert_ingestion_input_to_blob
from tests.unit_tests.fixtures import get_sample_paths
from tests.unit_tests.utils import InMemoryVectorStore

Expand All @@ -17,9 +17,15 @@ def test_ingestion_runnable() -> None:
input_key="file_contents",
assistant_id="TheParrot",
)
data = BytesIO(b"test")
data.name = "filename"
ids = runnable.invoke(data)
# Simulate file data
file_data = BytesIO(b"test data")
file_data.seek(0)
# Create UploadFile object
file = UploadFile(filename="testfile.txt", file=file_data)

# Convert the file to blob
blob = convert_ingestion_input_to_blob(file)
ids = runnable.invoke(blob)
assert len(ids) == 1


Expand All @@ -28,7 +34,7 @@ def test_mimetype_guessing() -> None:
name_to_mime = {}
for file in sorted(get_sample_paths()):
data = file.read_bytes()
name_to_mime[file.name] = _guess_mimetype(data)
name_to_mime[file.name] = _guess_mimetype(file.name, data)

assert {
"sample.docx": (
Expand All @@ -38,6 +44,6 @@ def test_mimetype_guessing() -> None:
"sample.html": "text/html",
"sample.odt": "application/vnd.oasis.opendocument.text",
"sample.pdf": "application/pdf",
"sample.rtf": "text/rtf",
"sample.rtf": "application/rtf",
"sample.txt": "text/plain",
} == name_to_mime
Loading