Skip to content

Commit

Permalink
Remove libmagic
Browse files Browse the repository at this point in the history
  • Loading branch information
danorlando committed Jul 9, 2024
1 parent a51f4da commit 1eedf39
Show file tree
Hide file tree
Showing 6 changed files with 195 additions and 129 deletions.
64 changes: 35 additions & 29 deletions stack/app/api/v1/files.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,9 @@
Request,
)
import uuid
import structlog
from stack.app.repositories.file import get_file_repository, FileRepository

from stack.app.schema.rag import DeleteDocumentsResponse
from stack.app.core.auth.utils import get_header_user_id
from stack.app.schema.file import (
FileSchema,
Expand All @@ -24,12 +25,8 @@
from stack.app.core.auth.request_validators import AuthenticatedUser
from stack.app.core.configuration import settings
from typing import Optional
from stack.app.core.logger import logging
from stack.app.utils.file_helpers import (
guess_mime_type,
is_mime_type_supported,
guess_file_extension,
)
from stack.app.utils.file_helpers import guess_mime_type, is_mime_type_supported

from stack.app.vectordbs.qdrant import QdrantService
from stack.app.repositories.assistant import (
get_assistant_repository,
Expand All @@ -38,7 +35,7 @@

router = APIRouter()
DEFAULT_TAG = "Files"
logger = logging.getLogger(__name__)
logger = structlog.get_logger()


@router.post(
Expand Down Expand Up @@ -75,7 +72,7 @@ async def upload_file(
user_id = get_header_user_id(request)

file_content = await file.read()
mime_type = guess_mime_type(file_content)
mime_type = guess_mime_type(file.filename, file_content)
if not is_mime_type_supported(mime_type):
raise HTTPException(
status_code=status.HTTP_400_BAD_REQUEST, detail="Unsupported file type."
Expand All @@ -102,7 +99,7 @@ async def upload_file(
except HTTPException as e:
raise e
except Exception as e:
logger.error(f"Error uploading file: {str(e)}", exc_info=True)
logger.exception(f"Error uploading file: {str(e)}", exc_info=True)
raise HTTPException(
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
detail="An error occurred while uploading the file.",
Expand All @@ -128,7 +125,7 @@ async def retrieve_files(
files = await files_repository.retrieve_files(user_id=user_id, purpose=purpose)
return files
except Exception as e:
logger.error(f"Error retrieving files: {str(e)}", exc_info=True)
logger.exception(f"Error retrieving files: {str(e)}", exc_info=True)
raise HTTPException(
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
detail="An error occurred while retrieving the files.",
Expand All @@ -152,13 +149,13 @@ async def retrieve_file(
try:
file = await files_repository.retrieve_file(file_id=file_id)
if not file:
logger.error(f"File not found for file id: {file_id}")
logger.exception(f"File not found for file id: {file_id}")
raise HTTPException(status_code=404, detail="File not found")
return file
except HTTPException as e:
raise e
except Exception as e:
logger.error(f"Error retrieving file: {str(e)}", exc_info=True)
logger.exception(f"Error retrieving file: {str(e)}", exc_info=True)
raise HTTPException(
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
detail="An error occurred while retrieving the file.",
Expand Down Expand Up @@ -187,6 +184,11 @@ async def delete_file(
raise HTTPException(status_code=404, detail="File not found")
if file.user_id != user_id:
raise HTTPException(status_code=403, detail="Forbidden")

deleted_chunks = DeleteDocumentsResponse(num_deleted_chunks=0)
assistants = []
num_of_assistants = 0

if file.purpose in [
FilePurpose.ASSISTANTS,
FilePurpose.THREADS,
Expand All @@ -195,33 +197,37 @@ async def delete_file(
# delete any embeddings associated with the file from the vector db
service = QdrantService()
deleted_chunks = await service.delete(str(file_id))
# If this is an assistants file, delete the file from any assistants that may be using it
assistants = await assistant_repository.remove_all_file_references(file_id)
num_of_assistants = len(assistants)
logger.info(f"Deleted file from {num_of_assistants} assistants")
else:
deleted_chunks = []
num_of_assistants = 0

# delete the file from the filesystem
ext = guess_file_extension(file.mime_type)
file_path = f"{settings.FILE_DATA_DIRECTORY}/{file.id}.{ext}"
# If this is an assistants file, delete the file from any assistants that may be using it
if file.purpose == FilePurpose.ASSISTANTS:
assistants = await assistant_repository.remove_all_file_references(
file_id
)
num_of_assistants = len(assistants)
logger.info(f"Deleted file from {num_of_assistants} assistants")

if os.path.isfile(file_path):
os.remove(file_path)
if os.path.isfile(file.source):
os.remove(file.source)
else:
logger.error(f"File not found on filesystem: {file_path}", exc_info=True)
logger.exception(f"File not found on filesystem: {file.source}")
raise HTTPException(
status_code=400,
detail=f"File not found on filesystem at location: {file.source}. Unable to delete.",
)

# delete the file from the database
await files_repository.delete_file(file_id=file_id)

return DeleteFileResponse(
file_id=file_id,
num_of_deleted_chunks=deleted_chunks.num_deleted_chunks,
num_of_assistants=num_of_assistants,
deleted_chunks=deleted_chunks,
assistants=assistants,
)
except HTTPException as e:
raise e
except Exception as e:
logger.error(f"Error deleting assistant file: {str(e)}", exc_info=True)
logger.exception(f"Error deleting file: {str(e)}", exc_info=True)
raise HTTPException(
status_code=500,
detail="An error occurred while deleting the file from the system.",
Expand All @@ -244,7 +250,7 @@ async def retrieve_file_content(
try:
return await files_repository.retrieve_file_content_as_response(str(file_id))
except Exception as e:
logger.error(f"Error retrieving file content: {str(e)}", exc_info=True)
logger.exception(f"Error retrieving file content: {str(e)}", exc_info=True)
raise HTTPException(
status_code=500,
detail="An server error occurred while retrieving the file content.",
Expand Down
21 changes: 14 additions & 7 deletions stack/app/repositories/file.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from stack.app.core.datastore import get_postgresql_session_provider
from typing import Optional, Any
from stack.app.core.configuration import settings
from stack.app.utils.file_helpers import guess_file_extension
from stack.app.utils.file_helpers import guess_file_extension, guess_mime_type
from fastapi import Response

logger = structlog.get_logger()
Expand All @@ -28,14 +28,21 @@ def __init__(self, postgresql_session):
self.postgresql_session = postgresql_session

async def create_file(self, data: dict, file_content: bytes) -> File:
"""Creates a new file in the database and saves the file content to the
local file system."""
try:
# Guess the mime type and file extension
mime_type = guess_mime_type(data.get("filename", ""), file_content)
file_extension = guess_file_extension(
data.get("filename", ""), file_content
)

# Update the data dictionary with the guessed mime type
data["mime_type"] = mime_type

file = await self.create(model=File, values=data)
await self.postgresql_session.commit()

# Create the file data directory if it doesn't exist
os.makedirs(settings.FILE_DATA_DIRECTORY, exist_ok=True)
file_extension = guess_file_extension(data.get("mime_type"))

# Save the file content to the local file system using the generated UUID
local_file_name = f"{file.id}.{file_extension}"
Expand All @@ -58,7 +65,7 @@ async def create_file(self, data: dict, file_content: bytes) -> File:
file_data=data,
)
raise HTTPException(
status_code=400, detail=f"Failed to create file."
status_code=400, detail=f"Failed to create file: {e}."
) from e

@staticmethod
Expand Down Expand Up @@ -122,9 +129,9 @@ async def delete_file(self, file_id: uuid.UUID) -> File:
except SQLAlchemyError as e:
await self.postgresql_session.rollback()
logger.exception(
f"Failed to delete file due to a database error: ", exc_info=True
f"Failed to delete file due to a database error: {e}", exc_info=True
)
raise HTTPException(status_code=400, detail="Failed to delete file.")
raise HTTPException(status_code=400, detail=f"Failed to delete file.")

async def retrieve_file_content(self, file_id: str) -> Any:
"""Fetches the content of a file by ID from the local file system."""
Expand Down
76 changes: 54 additions & 22 deletions stack/app/utils/file_helpers.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import mimetypes
import re
from langchain.document_loaders.parsers import BS4HTMLParser, PDFMinerParser
from langchain.document_loaders.parsers.generic import MimeTypeBasedParser
from langchain.document_loaders.parsers.msword import MsWordParser
Expand All @@ -8,6 +9,10 @@
"application/pdf": PDFMinerParser(),
"text/plain": TextParser(),
"text/html": BS4HTMLParser(),
"text/markdown": TextParser(),
"text/csv": TextParser(),
"application/json": TextParser(),
"application/rtf": TextParser(),
"application/msword": MsWordParser(),
"application/vnd.openxmlformats-officedocument.wordprocessingml.document": MsWordParser(),
}
Expand All @@ -20,37 +25,64 @@
)


def _sniff_zip_mime_type(file_bytes: bytes) -> str:
    """Identify a ZIP container by looking at its member names.

    Office Open XML documents (.docx/.xlsx/.pptx) are ordinary ZIP archives,
    so the leading ``PK`` signature alone cannot tell them apart.

    Returns the matching Office mime-type, or ``application/zip`` when the
    archive is not an Office document or cannot be read.
    """
    # Imported locally because only this helper needs them.
    import io
    import zipfile

    ooxml_markers = (
        (
            "word/",
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        ),
        (
            "xl/",
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        ),
        (
            "ppt/",
            "application/vnd.openxmlformats-officedocument.presentationml.presentation",
        ),
    )

    try:
        with zipfile.ZipFile(io.BytesIO(file_bytes)) as archive:
            member_names = archive.namelist()
    except zipfile.BadZipFile:
        # Signature matched but the archive is truncated or corrupt.
        return "application/zip"

    for prefix, mime_type in ooxml_markers:
        if any(name.startswith(prefix) for name in member_names):
            return mime_type
    return "application/zip"


def guess_mime_type(file_name: str, file_bytes: bytes) -> str:
    """Guess the mime-type of a file based on its name or bytes.

    The file extension is tried first; if it gives no answer the leading bytes
    are checked against a few well-known signatures, and finally the first
    1 KiB is inspected to decide between CSV-like text, plain text and
    opaque binary data.

    Args:
        file_name: Original file name. May be empty or ``None`` when the
            client did not supply one.
        file_bytes: Raw content of the file.

    Returns:
        The guessed mime-type, or ``application/octet-stream`` when nothing
        matches.
    """
    # Guess based on the file extension; guess_type() rejects None.
    if file_name:
        mime_type, _ = mimetypes.guess_type(file_name)
        if mime_type:
            return mime_type

    # Signature-based detection for common types
    if file_bytes.startswith(b"%PDF"):
        return "application/pdf"
    if file_bytes.startswith(
        (b"\x50\x4B\x03\x04", b"\x50\x4B\x05\x06", b"\x50\x4B\x07\x08")
    ):
        return _sniff_zip_mime_type(file_bytes)
    if file_bytes.startswith(b"\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1"):
        # NOTE(review): this signature is reported as legacy Word here;
        # confirm no other legacy Office format needs to be distinguished.
        return "application/msword"
    if file_bytes.startswith(b"\x09\x00\xff\x00\x06\x00"):
        return "application/vnd.ms-excel"

    # Text detection on a small sample. Decode strictly so that binary data is
    # not silently turned into "text" by dropping the undecodable bytes.
    sample = file_bytes[:1024]
    try:
        text = sample.decode("utf-8")
    except UnicodeDecodeError as exc:
        # The 1 KiB cut may have split a multi-byte character at the very end
        # of the sample; that alone does not make the file binary.
        cut_mid_character = (
            len(file_bytes) > len(sample) and exc.start >= len(sample) - 3
        )
        if not cut_mid_character:
            return "application/octet-stream"
        text = sample[: exc.start].decode("utf-8")

    # Newlines and tabs are not "printable" for str.isprintable(), so check
    # per character and allow whitespace explicitly.
    if not all(char.isprintable() or char.isspace() for char in text):
        return "application/octet-stream"

    # Check for CSV-like plain text content (commas or tabs, plus newlines)
    if "\n" in text and ("," in text or "\t" in text):
        return "text/csv"
    return "text/plain"


def guess_file_extension(file_name: str, file_bytes: bytes) -> str:
    """Guess the file extension (without the leading dot) for a file.

    The mime-type is detected with ``guess_mime_type`` and then mapped to an
    extension. A small curated table is consulted first so that the common
    document types always map to the same extension regardless of the mime
    database available on the host; ``mimetypes`` covers everything else.

    Args:
        file_name: Original file name, used for extension-based detection.
        file_bytes: Raw content of the file, used for content-based detection.

    Returns:
        The extension without a leading dot, or ``"bin"`` when the type is
        unknown.
    """
    mime_type = guess_mime_type(file_name, file_bytes)

    mime_to_ext = {
        "application/pdf": "pdf",
        "application/msword": "doc",
        # Both spellings are listed: the handler table registers
        # "application/rtf", while "text/rtf" is also in circulation.
        "application/rtf": "rtf",
        "text/rtf": "rtf",
        "text/markdown": "md",
        "application/json": "json",
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document": "docx",
        "text/csv": "csv",
        "text/plain": "txt",
        "text/html": "html",
        "application/octet-stream": "bin",
    }
    known_extension = mime_to_ext.get(mime_type)
    if known_extension:
        return known_extension

    extension = mimetypes.guess_extension(mime_type)
    if extension:
        return extension.lstrip(".")  # Remove the leading dot from the extension
    return "bin"


def get_file_handler(mime_type: str):
Expand Down
11 changes: 0 additions & 11 deletions stack/app/utils/transform_stream_for_client.py

This file was deleted.

33 changes: 0 additions & 33 deletions stack/app/utils/vector_collection.py

This file was deleted.

Loading

0 comments on commit 1eedf39

Please sign in to comment.