mimetype without libmagic

langchain-ai · May 3, 2024 · 2f3edba
1 parent bb498e3
commit 2f3edba
Show file tree

Hide file tree

Showing 7 changed files with 1,665 additions and 1,498 deletions.
diff --git a/Dockerfile b/Dockerfile
@@ -20,7 +20,7 @@ ARG TARGETARCH
 ARG TARGETVARIANT
 
 # Install system dependencies
-RUN apt-get update && apt-get install -y libmagic1 && rm -rf /var/lib/apt/lists/*
+RUN apt-get update && rm -rf /var/lib/apt/lists/*
 RUN wget -O golang-migrate.deb https://github.com/golang-migrate/migrate/releases/download/v4.17.0/migrate.${TARGETOS}-${TARGETARCH}${TARGETVARIANT}.deb \
     && dpkg -i golang-migrate.deb \
     && rm golang-migrate.deb

diff --git a/README.md b/README.md
@@ -109,14 +109,10 @@ pyenv activate opengpts
 Once your Python environment is set up, you can install the project dependencies:
 
 The backend service uses [poetry](https://python-poetry.org/docs/#installation) to manage dependencies.
-It assumes libmagic to be [installed](https://github.com/ahupp/python-magic?tab=readme-ov-file#installation) in your 
-host system.
 
 ```shell 
 pip install poetry
-pip install libmagic
 pip install langchain-community
-brew install libmagic
 ```
 
 **Install Postgres and the Postgres Vector Extension**

diff --git a/backend/Dockerfile b/backend/Dockerfile
@@ -6,7 +6,7 @@ ARG TARGETARCH
 ARG TARGETVARIANT
 
 # Install system dependencies
-RUN apt-get update && apt-get install -y libmagic1 && rm -rf /var/lib/apt/lists/*
+RUN apt-get update && rm -rf /var/lib/apt/lists/*
 RUN wget -O golang-migrate.deb https://github.com/golang-migrate/migrate/releases/download/v4.17.0/migrate.${TARGETOS}-${TARGETARCH}${TARGETVARIANT}.deb \
     && dpkg -i golang-migrate.deb \
     && rm golang-migrate.deb

diff --git a/backend/app/server.py b/backend/app/server.py
@@ -11,7 +11,7 @@
 from app.api import router as api_router
 from app.auth.handlers import AuthedUser
 from app.lifespan import lifespan
-from app.upload import ingest_runnable
+from app.upload import ingest_runnable, convert_ingestion_input_to_blob
 
 logger = logging.getLogger(__name__)
 
@@ -44,7 +44,8 @@ async def ingest_files(
         if thread is None:
             raise HTTPException(status_code=404, detail="Thread not found.")
 
-    return ingest_runnable.batch([file.file for file in files], config)
+    file_blobs = [convert_ingestion_input_to_blob(file) for file in files]
+    return ingest_runnable.batch(file_blobs, config)
 
 
 @app.get("/health")

diff --git a/backend/app/upload.py b/backend/app/upload.py
@@ -9,10 +9,12 @@
 
 from __future__ import annotations
 
+import mimetypes
 import os
-from typing import Any, BinaryIO, List, Optional
 
 from langchain_community.document_loaders.blob_loaders.schema import Blob
+from typing import Any, BinaryIO, List, Optional
+from fastapi import UploadFile
 from langchain_community.vectorstores.pgvector import PGVector
 from langchain_core.runnables import (
     ConfigurableField,
@@ -27,25 +29,48 @@
 from app.parsing import MIMETYPE_BASED_PARSER
 
 
-def _guess_mimetype(file_bytes: bytes) -> str:
-    """Guess the mime-type of a file."""
+def _guess_mimetype(file_name: str, file_bytes: bytes) -> str:
+    """Guess the mime-type of a file based on its name or bytes."""
+    # Guess based on the file extension
+    mime_type, _ = mimetypes.guess_type(file_name)
+
+    # Return detected mime type from mimetypes guess, unless it's None
+    if mime_type:
+        return mime_type
+
+    # Signature-based detection for common types
+    if file_bytes.startswith(b'%PDF'):
+        return 'application/pdf'
+    elif file_bytes.startswith((b'\x50\x4B\x03\x04', b'\x50\x4B\x05\x06', b'\x50\x4B\x07\x08')):
+        return 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
+    elif file_bytes.startswith(b'\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1'):
+        return 'application/msword'
+    elif file_bytes.startswith(b'\x09\x00\xff\x00\x06\x00'):
+        return 'application/vnd.ms-excel'
+
+    # Check for CSV-like plain text content (commas, tabs, newlines)
     try:
-        import magic
-    except ImportError as e:
-        raise ImportError(
-            "magic package not found, please install it with `pip install python-magic`"
-        ) from e
+        decoded = file_bytes[:1024].decode('utf-8', errors='ignore')
+        if all(char in decoded for char in (',', '\n')) or all(char in decoded for char in ('\t', '\n')):
+            return 'text/csv'
+        elif decoded.isprintable() or decoded == '':
+            return 'text/plain'
+    except UnicodeDecodeError:
+        pass
 
-    mime = magic.Magic(mime=True)
-    mime_type = mime.from_buffer(file_bytes)
-    return mime_type
+    return 'application/octet-stream'
 
 
-def _convert_ingestion_input_to_blob(data: BinaryIO) -> Blob:
+def convert_ingestion_input_to_blob(file: UploadFile) -> Blob:
     """Convert ingestion input to blob."""
-    file_data = data.read()
-    mimetype = _guess_mimetype(file_data)
-    file_name = data.name
+    file_data = file.file.read()
+    file_name = file.filename
+
+    # Check if file_name is a valid string
+    if not isinstance(file_name, str):
+        raise TypeError(f"Expected string for file name, got {type(file_name)}")
+
+    mimetype = _guess_mimetype(file_name, file_data)
     return Blob.from_data(
         data=file_data,
         path=file_name,
@@ -105,22 +130,21 @@ def namespace(self) -> str:
         return self.assistant_id if self.assistant_id is not None else self.thread_id
 
     def invoke(
-        self, input: BinaryIO, config: Optional[RunnableConfig] = None
+        self, blob: Blob, config: Optional[RunnableConfig] = None
     ) -> List[str]:
-        return self.batch([input], config)
+        return self.batch([blob], config)
 
     def batch(
         self,
-        inputs: List[BinaryIO],
+        inputs: List[Blob],
         config: RunnableConfig | List[RunnableConfig] | None = None,
         *,
         return_exceptions: bool = False,
         **kwargs: Any | None,
     ) -> List:
         """Ingest a batch of files into the vectorstore."""
         ids = []
-        for data in inputs:
-            blob = _convert_ingestion_input_to_blob(data)
+        for blob in inputs:
             ids.extend(
                 ingest_blob(
                     blob,