Skip to content

Commit

Permalink
mimetype without libmagic
Browse files Browse the repository at this point in the history
  • Loading branch information
mkorpela committed May 3, 2024
1 parent bb498e3 commit 2f3edba
Show file tree
Hide file tree
Showing 7 changed files with 1,665 additions and 1,498 deletions.
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ ARG TARGETARCH
ARG TARGETVARIANT

# Install system dependencies
RUN apt-get update && apt-get install -y libmagic1 && rm -rf /var/lib/apt/lists/*
RUN apt-get update && rm -rf /var/lib/apt/lists/*
RUN wget -O golang-migrate.deb https://github.com/golang-migrate/migrate/releases/download/v4.17.0/migrate.${TARGETOS}-${TARGETARCH}${TARGETVARIANT}.deb \
&& dpkg -i golang-migrate.deb \
&& rm golang-migrate.deb
Expand Down
4 changes: 0 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -109,14 +109,10 @@ pyenv activate opengpts
Once your Python environment is set up, you can install the project dependencies:

The backend service uses [poetry](https://python-poetry.org/docs/#installation) to manage dependencies.
It assumes libmagic to be [installed](https://github.com/ahupp/python-magic?tab=readme-ov-file#installation) in your
host system.

```shell
pip install poetry
pip install libmagic
pip install langchain-community
brew install libmagic
```

**Install Postgres and the Postgres Vector Extension**
Expand Down
2 changes: 1 addition & 1 deletion backend/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ ARG TARGETARCH
ARG TARGETVARIANT

# Install system dependencies
RUN apt-get update && apt-get install -y libmagic1 && rm -rf /var/lib/apt/lists/*
RUN apt-get update && rm -rf /var/lib/apt/lists/*
RUN wget -O golang-migrate.deb https://github.com/golang-migrate/migrate/releases/download/v4.17.0/migrate.${TARGETOS}-${TARGETARCH}${TARGETVARIANT}.deb \
&& dpkg -i golang-migrate.deb \
&& rm golang-migrate.deb
Expand Down
5 changes: 3 additions & 2 deletions backend/app/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from app.api import router as api_router
from app.auth.handlers import AuthedUser
from app.lifespan import lifespan
from app.upload import ingest_runnable
from app.upload import ingest_runnable, convert_ingestion_input_to_blob

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -44,7 +44,8 @@ async def ingest_files(
if thread is None:
raise HTTPException(status_code=404, detail="Thread not found.")

return ingest_runnable.batch([file.file for file in files], config)
file_blobs = [convert_ingestion_input_to_blob(file) for file in files]
return ingest_runnable.batch(file_blobs, config)


@app.get("/health")
Expand Down
64 changes: 44 additions & 20 deletions backend/app/upload.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,12 @@

from __future__ import annotations

import mimetypes
import os
from typing import Any, BinaryIO, List, Optional

from langchain_community.document_loaders.blob_loaders.schema import Blob
from typing import Any, BinaryIO, List, Optional
from fastapi import UploadFile
from langchain_community.vectorstores.pgvector import PGVector
from langchain_core.runnables import (
ConfigurableField,
Expand All @@ -27,25 +29,48 @@
from app.parsing import MIMETYPE_BASED_PARSER


def _sniff_zip_mimetype(file_bytes: bytes) -> str:
    """Return the mime-type of a ZIP container based on its member names.

    Office Open XML documents (docx/xlsx/pptx) are ZIP archives, so the ZIP
    signature alone cannot tell them apart; the top-level folder of the
    package can.

    Args:
        file_bytes: Complete content of the file, starting with a ZIP signature.

    Returns:
        The matching Office Open XML mime-type, or ``application/zip`` when the
        archive is unreadable or is not a recognised Office package.
    """
    # Local imports: only this helper needs them.
    import io
    import zipfile

    try:
        with zipfile.ZipFile(io.BytesIO(file_bytes)) as archive:
            member_names = archive.namelist()
    except zipfile.BadZipFile:
        # Truncated or corrupt archive: the signature is all we can rely on.
        return "application/zip"

    ooxml_prefix = "application/vnd.openxmlformats-officedocument."
    ooxml_types = (
        ("word/", ooxml_prefix + "wordprocessingml.document"),
        ("xl/", ooxml_prefix + "spreadsheetml.sheet"),
        ("ppt/", ooxml_prefix + "presentationml.presentation"),
    )
    for folder, mime_type in ooxml_types:
        if any(name.startswith(folder) for name in member_names):
            return mime_type
    return "application/zip"


def _guess_mimetype(file_name: str, file_bytes: bytes) -> str:
    """Guess the mime-type of a file based on its name or bytes.

    The file extension is consulted first; when it is unknown, a few magic
    signatures are checked, and finally the first KiB is inspected to decide
    between CSV-like text, plain text and opaque binary data.

    Args:
        file_name: Name of the uploaded file (only its extension matters).
        file_bytes: Raw content of the file.

    Returns:
        A mime-type string; ``application/octet-stream`` when nothing matches.
    """
    # Guess based on the file extension.
    mime_type, _ = mimetypes.guess_type(file_name)
    if mime_type:
        return mime_type

    # Signature-based detection for common types.
    if file_bytes.startswith(b"%PDF"):
        return "application/pdf"
    if file_bytes.startswith((b"PK\x03\x04", b"PK\x05\x06", b"PK\x07\x08")):
        # Any ZIP file starts like this, so look inside before naming a type.
        return _sniff_zip_mimetype(file_bytes)
    if file_bytes.startswith(b"\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1"):
        return "application/msword"
    if file_bytes.startswith(b"\x09\x00\xff\x00\x06\x00"):
        return "application/vnd.ms-excel"

    # Text heuristics on the first KiB. errors="ignore" keeps a multi-byte
    # character cut in half by the slice from raising.
    sample = file_bytes[:1024]
    decoded = sample.decode("utf-8", errors="ignore")
    if sample and not decoded:
        # Nothing decodable in non-empty data: not text.
        return "application/octet-stream"
    # str.isprintable() rejects newlines and tabs, so whitespace must be
    # allowed explicitly or every multi-line text file looks binary.
    if not all(char.isprintable() or char.isspace() for char in decoded):
        return "application/octet-stream"
    # CSV-like content: at least one line break plus a comma or tab delimiter.
    if "\n" in decoded and ("," in decoded or "\t" in decoded):
        return "text/csv"
    return "text/plain"


def _convert_ingestion_input_to_blob(data: BinaryIO) -> Blob:
def convert_ingestion_input_to_blob(file: UploadFile) -> Blob:
"""Convert ingestion input to blob."""
file_data = data.read()
mimetype = _guess_mimetype(file_data)
file_name = data.name
file_data = file.file.read()
file_name = file.filename

# Check if file_name is a valid string
if not isinstance(file_name, str):
raise TypeError(f"Expected string for file name, got {type(file_name)}")

mimetype = _guess_mimetype(file_name, file_data)
return Blob.from_data(
data=file_data,
path=file_name,
Expand Down Expand Up @@ -105,22 +130,21 @@ def namespace(self) -> str:
return self.assistant_id if self.assistant_id is not None else self.thread_id

# Ingest a single item by wrapping it in a one-element list and delegating
# to self.batch() with the same config.
# NOTE(review): the paired signature/return lines below look like the
# pre-commit (BinaryIO `input`) and post-commit (Blob `blob`) variants as
# rendered by the diff view — confirm against the actual file.
def invoke(
self, input: BinaryIO, config: Optional[RunnableConfig] = None
self, blob: Blob, config: Optional[RunnableConfig] = None
) -> List[str]:
return self.batch([input], config)
return self.batch([blob], config)

def batch(
self,
inputs: List[BinaryIO],
inputs: List[Blob],
config: RunnableConfig | List[RunnableConfig] | None = None,
*,
return_exceptions: bool = False,
**kwargs: Any | None,
) -> List:
"""Ingest a batch of files into the vectorstore."""
ids = []
for data in inputs:
blob = _convert_ingestion_input_to_blob(data)
for blob in inputs:
ids.extend(
ingest_blob(
blob,
Expand Down
Loading

0 comments on commit 2f3edba

Please sign in to comment.