diff --git a/.gitignore b/.gitignore index 4a6ba32b..f0c3b0d9 100644 --- a/.gitignore +++ b/.gitignore @@ -72,8 +72,8 @@ instance/ .scrapy # Sphinx documentation -docs/_build/ -docs/build/ +../docs/_build/ +../docs/build/ # PyBuilder target/ diff --git a/docs/Makefile b/docs/Makefile deleted file mode 100644 index d0c3cbf1..00000000 --- a/docs/Makefile +++ /dev/null @@ -1,20 +0,0 @@ -# Minimal makefile for Sphinx documentation -# - -# You can set these variables from the command line, and also -# from the environment for the first two. -SPHINXOPTS ?= -SPHINXBUILD ?= sphinx-build -SOURCEDIR = source -BUILDDIR = build - -# Put it first so that "make" without argument is like "make help". -help: - @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) - -.PHONY: help Makefile - -# Catch-all target: route all unknown targets to Sphinx using the new -# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). -%: Makefile - @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/README.md b/docs/README.md deleted file mode 100644 index 6908c194..00000000 --- a/docs/README.md +++ /dev/null @@ -1,10 +0,0 @@ -install the required pkgs: -``` -pip install -r requirements.txt -``` - - -to host the webpages locally: -``` -python -m http.server -``` \ No newline at end of file diff --git a/docs/make.bat b/docs/make.bat deleted file mode 100644 index 747ffb7b..00000000 --- a/docs/make.bat +++ /dev/null @@ -1,35 +0,0 @@ -@ECHO OFF - -pushd %~dp0 - -REM Command file for Sphinx documentation - -if "%SPHINXBUILD%" == "" ( - set SPHINXBUILD=sphinx-build -) -set SOURCEDIR=source -set BUILDDIR=build - -%SPHINXBUILD% >NUL 2>NUL -if errorlevel 9009 ( - echo. - echo.The 'sphinx-build' command was not found. Make sure you have Sphinx - echo.installed, then set the SPHINXBUILD environment variable to point - echo.to the full path of the 'sphinx-build' executable. Alternatively you - echo.may add the Sphinx directory to PATH. - echo. - echo.If you don't have Sphinx installed, grab it from - echo.https://www.sphinx-doc.org/ - exit /b 1 -) - -if "%1" == "" goto help - -%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% -goto end - -:help -%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% - -:end -popd diff --git a/docs/requirements.txt b/docs/requirements.txt deleted file mode 100644 index 3d310f10..00000000 --- a/docs/requirements.txt +++ /dev/null @@ -1,3 +0,0 @@ -sphinx -myst-nb -furo \ No newline at end of file diff --git a/docs/source/API/abc.rst b/docs/source/API/abc.rst deleted file mode 100644 index 310a7a87..00000000 --- a/docs/source/API/abc.rst +++ /dev/null @@ -1,5 +0,0 @@ -Abstract Class -============== - -.. autoclass:: FlagEmbedding.abc.inference.AbsEmbedder - :members: \ No newline at end of file diff --git a/docs/source/API/evaluation.rst b/docs/source/API/evaluation.rst deleted file mode 100644 index 0e6b32f8..00000000 --- a/docs/source/API/evaluation.rst +++ /dev/null @@ -1,2 +0,0 @@ -Evaluation -========== \ No newline at end of file diff --git a/docs/source/API/inference.rst b/docs/source/API/inference.rst deleted file mode 100644 index 3fe80034..00000000 --- a/docs/source/API/inference.rst +++ /dev/null @@ -1,2 +0,0 @@ -Inference -========= \ No newline at end of file diff --git a/docs/source/C-MTEB.rst b/docs/source/C-MTEB.rst deleted file mode 100644 index fac2ec33..00000000 --- a/docs/source/C-MTEB.rst +++ /dev/null @@ -1,47 +0,0 @@ -.. C-MTEB -.. ====== - -.. Introduction -.. ------------ - -.. 
`C-MTEB `_ is a benchmark for chinese text embedding. It contains 35 -.. datasets in 6 different tasks, providing a comprehensive evaluation to the quality of an embedding model on Chinese. - - -.. .. image:: ../_static/img/C_MTEB.png -.. :width: 700 -.. :align: center - - -.. Installation -.. ------------ - -.. C-MTEB is developed based on MTEB, you can install C-MTEB by: - -.. .. code:: bash - -.. pip install -U C_MTEB - -.. or install by FlagEmbedding's repo: - -.. .. code:: bash - -.. git clone https://github.com/FlagOpen/FlagEmbedding.git -.. cd FlagEmbedding/C_MTEB -.. pip install -e . - -.. Citing the Work -.. --------------- - -.. There are more details in our publication. If you find C-MTEB useful, you can cite it by: - -.. .. code:: - -.. @misc{c-pack, -.. title={C-Pack: Packaged Resources To Advance General Chinese Embedding}, -.. author={Shitao Xiao and Zheng Liu and Peitian Zhang and Niklas Muennighoff}, -.. year={2023}, -.. eprint={2309.07597}, -.. archivePrefix={arXiv}, -.. primaryClass={cs.CL} -.. } \ No newline at end of file diff --git a/docs/source/Introduction/installation.rst b/docs/source/Introduction/installation.rst deleted file mode 100644 index a4809b46..00000000 --- a/docs/source/Introduction/installation.rst +++ /dev/null @@ -1,28 +0,0 @@ -:github_url: https://github.com/AI4Finance-Foundation/FinRL - -Installation -============ - -Using pip: ----------- - -.. code:: bash - - pip install -U FlagEmbedding - -Install from sources: ---------------------- - -Clone the repository and install - -.. code:: bash - - git clone https://github.com/FlagOpen/FlagEmbedding.git - cd FlagEmbedding - pip install . - -For development in editable mode: - -.. code:: bash - - pip install -e . \ No newline at end of file diff --git a/docs/source/Introduction/quick_start.ipynb b/docs/source/Introduction/quick_start.ipynb deleted file mode 100644 index 4eed81cb..00000000 --- a/docs/source/Introduction/quick_start.ipynb +++ /dev/null @@ -1,472 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Quick Start" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In this tutorial, we will show how to use BGE models on a text retrieval task in 5 minutes." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Step 0: Preparation" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "First, install FlagEmbedding in the environment." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%pip install -U FlagEmbedding" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Below is a super tiny courpus with only 10 sentences, which will be the dataset we use.\n", - "\n", - "Each sentence is a concise discription of a famous people in specific domain." 
- ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "corpus = [\n", - " \"Michael Jackson was a legendary pop icon known for his record-breaking music and dance innovations.\",\n", - " \"Fei-Fei Li is a professor in Stanford University, revolutionized computer vision with the ImageNet project.\",\n", - " \"Brad Pitt is a versatile actor and producer known for his roles in films like 'Fight Club' and 'Once Upon a Time in Hollywood.'\",\n", - " \"Geoffrey Hinton, as a foundational figure in AI, received Turing Award for his contribution in deep learning.\",\n", - " \"Eminem is a renowned rapper and one of the best-selling music artists of all time.\",\n", - " \"Taylor Swift is a Grammy-winning singer-songwriter known for her narrative-driven music.\",\n", - " \"Sam Altman leads OpenAI as its CEO, with astonishing works of GPT series and pursuing safe and beneficial AI.\",\n", - " \"Morgan Freeman is an acclaimed actor famous for his distinctive voice and diverse roles.\",\n", - " \"Andrew Ng spread AI knowledge globally via public courses on Coursera and Stanford University.\",\n", - " \"Robert Downey Jr. is an iconic actor best known for playing Iron Man in the Marvel Cinematic Universe.\",\n", - "]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We want to know which one of these people could be an expert of neural network and who he/she is. \n", - "\n", - "Thus we generate the following query:" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "query = \"Who could be an expert of neural network?\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Step 1: Text -> Embedding" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "First, let's use a [BGE embedding model](https://huggingface.co/BAAI/bge-base-en-v1.5) to create sentence embedding for the corpus." - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [], - "source": [ - "from FlagEmbedding import FlagModel\n", - "\n", - "# get the BGE embedding model\n", - "model = FlagModel('BAAI/bge-base-en-v1.5',\n", - " query_instruction_for_retrieval=\"Represent this sentence for searching relevant passages:\",\n", - " use_fp16=True)\n", - "\n", - "# get the embedding of the query and corpus\n", - "corpus_embeddings = model.encode(corpus)\n", - "query_embedding = model.encode(query)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The embedding of each sentence is a vector with length 768. " - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "shape of the query embedding: (768,)\n", - "shape of the corpus embeddings: (10, 768)\n" - ] - } - ], - "source": [ - "print(\"shape of the query embedding: \", query_embedding.shape)\n", - "print(\"shape of the corpus embeddings:\", corpus_embeddings.shape)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Run the following print line to take a look at the first 10 elements of the query embedding vector." 
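Editor's note (not part of the original notebook): as far as I know, `FlagModel.encode()` returns L2-normalized vectors by default, which is also why the self-similarity scores shown later in these tutorials sit at 1.0. If that holds, the dot product used in Step 2 below is exactly the cosine similarity. A quick sanity check, assuming numpy is available alongside the variables defined above:

```python
import numpy as np

# If the vectors are L2-normalized, every norm printed below should be ~1.0,
# and the dot product in Step 2 equals cosine similarity.
print(np.linalg.norm(query_embedding))
print(np.linalg.norm(corpus_embeddings, axis=1))
```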
- ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[-0.00790005 -0.00683443 -0.00806659 0.00756918 0.04374858 0.02838556\n", - " 0.02357143 -0.02270943 -0.03611493 -0.03038301]\n" - ] - } - ], - "source": [ - "print(query_embedding[:10])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Step 2: Calculate Similarity" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now, we have the embeddings of the query and the corpus. The next step is to calculate the similarity between the query and each sentence in the corpus. Here we use the dot product/inner product as our similarity metric." - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[0.39290053 0.6031525 0.32672375 0.6082418 0.39446455 0.35350388\n", - " 0.4626108 0.40196604 0.5284606 0.36792332]\n" - ] - } - ], - "source": [ - "sim_scores = query_embedding @ corpus_embeddings.T\n", - "print(sim_scores)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The result is a list of score representing the query's similarity to: [sentence 0, sentence 1, sentence 2, ...]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Step 3: Ranking" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "After we have the similarity score of the query to each sentence in the corpus, we can rank them from large to small." - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[3, 1, 8, 6, 7, 4, 0, 9, 5, 2]\n" - ] - } - ], - "source": [ - "# get the indices in sorted order\n", - "sorted_indices = sorted(range(len(sim_scores)), key=lambda k: sim_scores[k], reverse=True)\n", - "print(sorted_indices)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now from the ranking, the sentence with index 3 is the best answer to our query \"Who could be an expert of neural network?\"\n", - "\n", - "And that person is Geoffrey Hinton!" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Geoffrey Hinton, as a foundational figure in AI, received Turing Award for his contribution in deep learning.\n" - ] - } - ], - "source": [ - "print(corpus[3])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "According to the order of indecies, we can print out the ranking of people that our little retriever got." 
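Editor's note (illustrative sketch, not part of the original notebook): the same ranking can be obtained with numpy's `argsort`, assuming `sim_scores` is the numpy score vector computed above:

```python
import numpy as np

# argsort gives ascending order; reversing it reproduces the descending ranking above.
sorted_indices_np = np.argsort(sim_scores)[::-1].tolist()
print(sorted_indices_np)  # expected to match sorted_indices, e.g. [3, 1, 8, ...]
```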
- ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Score of 0.608: \"Geoffrey Hinton, as a foundational figure in AI, received Turing Award for his contribution in deep learning.\"\n", - "Score of 0.603: \"Fei-Fei Li is a professor in Stanford University, revolutionized computer vision with the ImageNet project.\"\n", - "Score of 0.528: \"Andrew Ng spread AI knowledge globally via public courses on Coursera and Stanford University.\"\n", - "Score of 0.463: \"Sam Altman leads OpenAI as its CEO, with astonishing works of GPT series and pursuing safe and beneficial AI.\"\n", - "Score of 0.402: \"Morgan Freeman is an acclaimed actor famous for his distinctive voice and diverse roles.\"\n", - "Score of 0.394: \"Eminem is a renowned rapper and one of the best-selling music artists of all time.\"\n", - "Score of 0.393: \"Michael Jackson was a legendary pop icon known for his record-breaking music and dance innovations.\"\n", - "Score of 0.368: \"Robert Downey Jr. is an iconic actor best known for playing Iron Man in the Marvel Cinematic Universe.\"\n", - "Score of 0.354: \"Taylor Swift is a Grammy-winning singer-songwriter known for her narrative-driven music.\"\n", - "Score of 0.327: \"Brad Pitt is a versatile actor and producer known for his roles in films like 'Fight Club' and 'Once Upon a Time in Hollywood.'\"\n" - ] - } - ], - "source": [ - "# iteratively print the score and corresponding sentences in descending order\n", - "\n", - "for i in sorted_indices:\n", - " print(f\"Score of {sim_scores[i]:.3f}: \\\"{corpus[i]}\\\"\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "From the ranking, not surprisingly, the similarity scores of the query and the discriptions of Geoffrey Hinton and Fei-Fei Li is way higher than others, following by those of Andrew Ng and Sam Altman. \n", - "\n", - "While the key phrase \"neural network\" in the query does not appear in any of those discriptions, the BGE embedding model is still powerful enough to get the semantic meaning of query and corpus well." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Step 4: Evaluate" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We've seen the embedding model performed pretty well on the \"neural network\" query. What about the more general quality?\n", - "\n", - "Let's generate a very small dataset of queries and corresponding ground truth answers. Note that the ground truth answers are the indices of sentences in the corpus." - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": {}, - "outputs": [], - "source": [ - "queries = [\n", - " \"Who could be an expert of neural network?\",\n", - " \"Who might had won Grammy?\",\n", - " \"Won Academy Awards\",\n", - " \"One of the most famous female singers.\",\n", - " \"Inventor of AlexNet\",\n", - "]" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": {}, - "outputs": [], - "source": [ - "ground_truth = [\n", - " [1, 3],\n", - " [0, 4, 5],\n", - " [2, 7, 9],\n", - " [5],\n", - " [3],\n", - "]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Here we repeat the steps we covered above to get the predicted ranking of each query." 
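Editor's note: the cell below reuses `model.encode()` for the queries. Since the model was loaded with a `query_instruction_for_retrieval`, FlagModel also offers `encode_queries()` (documented in the BGE tutorial later in these docs), which prepends that instruction to each query before encoding. A sketch of the equivalent query-side call, which may shift the scores slightly:

```python
# encode_queries() prepends the query_instruction_for_retrieval set at load time,
# then calls encode(); this is the intended query-side path for retrieval.
queries_embedding = model.encode_queries(queries)
scores = queries_embedding @ corpus_embeddings.T
```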
- ] - }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[[3, 1, 8, 6, 7, 4, 0, 9, 5, 2],\n", - " [5, 0, 3, 4, 1, 9, 7, 2, 6, 8],\n", - " [3, 2, 7, 5, 9, 0, 1, 4, 6, 8],\n", - " [5, 0, 4, 7, 1, 9, 2, 3, 6, 8],\n", - " [3, 1, 8, 6, 0, 7, 5, 9, 4, 2]]" - ] - }, - "execution_count": 24, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# use bge model to generate embeddings for all the queries\n", - "queries_embedding = model.encode(queries)\n", - "# compute similarity scores\n", - "scores = queries_embedding @ corpus_embeddings.T\n", - "# get he final rankings\n", - "rankings = [sorted(range(len(sim_scores)), key=lambda k: sim_scores[k], reverse=True) for sim_scores in scores]\n", - "rankings" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Mean Reciprocal Rank ([MRR](https://en.wikipedia.org/wiki/Mean_reciprocal_rank)) is a widely used metric in information retrieval to evaluate the effectiveness of a system. Here we use that to have a very rough idea how our system performs." - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "metadata": {}, - "outputs": [], - "source": [ - "def MRR(preds, labels, cutoffs):\n", - " mrr = [0 for _ in range(len(cutoffs))]\n", - " for pred, label in zip(preds, labels):\n", - " for i, c in enumerate(cutoffs):\n", - " for j, index in enumerate(pred):\n", - " if j < c and index in label:\n", - " mrr[i] += 1/(j+1)\n", - " break\n", - " mrr = [k/len(preds) for k in mrr]\n", - " return mrr" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We choose to use 1 and 5 as our cutoffs, with the result of 0.8 and 0.9 respectively." - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "MRR@1: 0.8\n", - "MRR@5: 0.9\n" - ] - } - ], - "source": [ - "cutoffs = [1, 5]\n", - "mrrs = MRR(rankings, ground_truth, cutoffs)\n", - "for i, c in enumerate(cutoffs):\n", - " print(f\"MRR@{c}: {mrrs[i]}\")" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "base", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.13" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/docs/source/_static/img/BAAI_logo.png b/docs/source/_static/img/BAAI_logo.png deleted file mode 100644 index c39cc6fd..00000000 Binary files a/docs/source/_static/img/BAAI_logo.png and /dev/null differ diff --git a/docs/source/_static/img/C_MTEB.png b/docs/source/_static/img/C_MTEB.png deleted file mode 100644 index 0b0f0941..00000000 Binary files a/docs/source/_static/img/C_MTEB.png and /dev/null differ diff --git a/docs/source/bge/bge_icl.rst b/docs/source/bge/bge_icl.rst deleted file mode 100644 index 5c91e04f..00000000 --- a/docs/source/bge/bge_icl.rst +++ /dev/null @@ -1,2 +0,0 @@ -BGE-en-icl -========== \ No newline at end of file diff --git a/docs/source/bge/bge_m3.rst b/docs/source/bge/bge_m3.rst deleted file mode 100644 index 77609d50..00000000 --- a/docs/source/bge/bge_m3.rst +++ /dev/null @@ -1,2 +0,0 @@ -BGE-M3 -====== \ No newline at end of file diff --git a/docs/source/bge/bge_reranker.rst b/docs/source/bge/bge_reranker.rst deleted file mode 100644 index 47545c37..00000000 --- 
a/docs/source/bge/bge_reranker.rst +++ /dev/null @@ -1,2 +0,0 @@ -BGE-Reranker -============ \ No newline at end of file diff --git a/docs/source/bge/bge_v1.rst b/docs/source/bge/bge_v1.rst deleted file mode 100644 index 6c141ca4..00000000 --- a/docs/source/bge/bge_v1.rst +++ /dev/null @@ -1,49 +0,0 @@ -BGE-v1 -====== - -BGE ---- - -The first group of BGE models was released in Aug 2023. The :code:`bge-large-en` and :code:`bge-large-zh` ranked 1st on MTEB and -C-MTEB benchmarks at the time released. - -+-------------------------------------------------------------------+-----------+------------+--------------+-----------------------------------------------------------------------+ -| Model | Language | Parameters | Model Size | Description | -+===================================================================+===========+============+==============+=======================================================================+ -| `BAAI/bge-large-en `_ | English | 335M | 1.34 GB | Embedding Model which map text into vector | -+-------------------------------------------------------------------+-----------+------------+--------------+-----------------------------------------------------------------------+ -| `BAAI/bge-base-en `_ | English | 109M | 438 MB | a base-scale model but with similar ability to `BAAI/bge-large-en` | -+-------------------------------------------------------------------+-----------+------------+--------------+-----------------------------------------------------------------------+ -| `BAAI/bge-small-en `_ | English | 33.4M | 133 MB | a small-scale model but with competitive performance | -+-------------------------------------------------------------------+-----------+------------+--------------+-----------------------------------------------------------------------+ -| `BAAI/bge-large-zh `_ | Chinese | 326M | 1.3 GB | Embedding Model which map text into vector | -+-------------------------------------------------------------------+-----------+------------+--------------+-----------------------------------------------------------------------+ -| `BAAI/bge-base-zh `_ | Chinese | 102M | 409 MB | a base-scale model but with similar ability to `BAAI/bge-large-zh` | -+-------------------------------------------------------------------+-----------+------------+--------------+-----------------------------------------------------------------------+ -| `BAAI/bge-small-zh `_ | Chinese | 24M | 95.8 MB | a small-scale model but with competitive performance | -+-------------------------------------------------------------------+-----------+------------+--------------+-----------------------------------------------------------------------+ - -BGE-v1.5 --------- - -Then to enhance its retrieval ability without instruction and alleviate the issue of the similarity distribution, :code:`bge-*-1.5` models -were released in Sep 2023. They are still the most popular embedding models that balanced well between embedding quality and model sizes. 
- -+-----------------------------------------------------------------------------+-----------+------------+--------------+--------------+ -| Model | Language | Parameters | Model Size | Description | -+=============================================================================+===========+============+==============+==============+ -| `BAAI/bge-large-en-v1.5 `_ | English | 335M | 1.34 GB | version 1.5 | -+-----------------------------------------------------------------------------+-----------+------------+--------------+ with more + -| `BAAI/bge-base-en-v1.5 `_ | English | 109M | 438 MB | reasonable | -+-----------------------------------------------------------------------------+-----------+------------+--------------+ similarity + -| `BAAI/bge-small-en-v1.5 `_ | English | 33.4M | 133 MB | distribution | -+-----------------------------------------------------------------------------+-----------+------------+--------------+ + -| `BAAI/bge-large-zh-v1.5 `_ | Chinese | 326M | 1.3 GB | | -+-----------------------------------------------------------------------------+-----------+------------+--------------+ + -| `BAAI/bge-base-zh-v1.5 `_ | Chinese | 102M | 409 MB | | -+-----------------------------------------------------------------------------+-----------+------------+--------------+ + -| `BAAI/bge-small-zh-v1.5 `_ | Chinese | 24M | 95.8 MB | | -+-----------------------------------------------------------------------------+-----------+------------+--------------+--------------+ - - - diff --git a/docs/source/bge/introduction.rst b/docs/source/bge/introduction.rst deleted file mode 100644 index 86cb3ce7..00000000 --- a/docs/source/bge/introduction.rst +++ /dev/null @@ -1,5 +0,0 @@ -Introduction -============ - -**BGE** stands for **BAAI General Embeddings**, which is a series of embedding models released by BAAI. - diff --git a/docs/source/conf.py b/docs/source/conf.py deleted file mode 100644 index 16e06535..00000000 --- a/docs/source/conf.py +++ /dev/null @@ -1,53 +0,0 @@ -# Configuration file for the Sphinx documentation builder. 
-# -# For the full list of built-in configuration values, see the documentation: -# https://www.sphinx-doc.org/en/master/usage/configuration.html - -# -- Project information ----------------------------------------------------- -# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information -import os -import sys - -sys.path.insert(0, os.path.abspath(".")) -sys.path.insert(0, os.path.abspath("..")) - -project = 'FlagEmbedding' -copyright = '2024, BAAI' -author = 'BAAI' - -# -- General configuration --------------------------------------------------- -# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration - -extensions = [ - "sphinx.ext.napoleon", - "sphinx.ext.autodoc", - "sphinx.ext.githubpages", - "sphinx.ext.viewcode", - "sphinx.ext.coverage", - "myst_nb", -] - -templates_path = ['_templates'] -exclude_patterns = [] - -# -- Options for HTML output ------------------------------------------------- -# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output - -html_theme = 'furo' -# html_logo = "_static/img/BAAI_logo.png" -html_title = "FlagEmbedding" -html_static_path = ['_static'] -html_theme_options = { - # "light_logo": "/_static/img/BAAI_logo.png", - "light_css_variables": { - "color-brand-primary": "#238be8", - "color-brand-content": "#238be8", - }, - "dark_css_variables": { - "color-brand-primary": "#FBCB67", - "color-brand-content": "#FBCB67", - }, -} - -# MyST-NB conf -nb_execution_mode = "off" \ No newline at end of file diff --git a/docs/source/index.rst b/docs/source/index.rst deleted file mode 100644 index bfa0d5e1..00000000 --- a/docs/source/index.rst +++ /dev/null @@ -1,77 +0,0 @@ -.. FlagEmbedding documentation master file, created by - sphinx-quickstart on Sat Oct 12 13:27:49 2024. - You can adapt this file completely to your liking, but it should at least - contain the root `toctree` directive. - -FlagEmbedding -============= - -| -| - -.. image:: _static/img/BAAI_logo.png - :target: https://github.com/FlagOpen/FlagEmbedding - :width: 400 - :align: center - -| -| - -Welcome to FlagEmbedding documentation! - -`FlagEmbedding `_ focuses on retrieval-augmented LLMs, -developed with the support of the Beijing Academy of Artificial Intelligence (BAAI). -We are aiming to enhance text and multi-model retrieval by leveraging advanced embedding techniques. - -- We provide high quality text embedding models and rerankers, with multi-language and multi-model, in `BGE <./bge/introduction.html>`_ series. -- We construct a benchmark for chinese text embedding `C-MTEB <./cmteb/introduction.html>`_, which has been merged into MTEB. - - - - -.. toctree:: - :maxdepth: 1 - :hidden: - - Home - - -.. toctree:: - :hidden: - :maxdepth: 1 - :caption: Introduction - - Introduction/installation - Introduction/quick_start - -.. toctree:: - :hidden: - :maxdepth: 1 - :caption: BGE - - bge/introduction - bge/bge_v1 - bge/bge_m3 - bge/bge_icl - bge/bge_reranker - -.. toctree:: - :hidden: - :maxdepth: 4 - :caption: API - - API/abc - API/inference - API/evaluation - -.. toctree:: - :hidden: - :maxdepth: 2 - :caption: Tutorials - - tutorial/1_Embedding - tutorial/2_Metrics - tutorial/3_Indexing - tutorial/4_Evaluation - tutorial/5_Reranking - tutorial/6_RAG \ No newline at end of file diff --git a/docs/source/tutorial/1_Embedding.rst b/docs/source/tutorial/1_Embedding.rst deleted file mode 100644 index f68ea30f..00000000 --- a/docs/source/tutorial/1_Embedding.rst +++ /dev/null @@ -1,12 +0,0 @@ -1. 
Embedding -============ - -.. toctree:: - :hidden: - :maxdepth: 1 - :caption: Embedding - - 1_Embedding/1.1.1 - 1_Embedding/1.2.1 - 1_Embedding/1.2.2 - 1_Embedding/1.2.3 \ No newline at end of file diff --git a/docs/source/tutorial/1_Embedding/1.1.1.ipynb b/docs/source/tutorial/1_Embedding/1.1.1.ipynb deleted file mode 100644 index a3de317e..00000000 --- a/docs/source/tutorial/1_Embedding/1.1.1.ipynb +++ /dev/null @@ -1,395 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Intro to Embedding" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "For text retrieval, pattern matching is the most intuitive way. People would use certain characters, words, phrases, or sentence patterns. However, not only for human, it is also extremely inefficient for computer to do pattern matching between a query and a collection of text files to find the possible results. \n", - "\n", - "For images and acoustic waves, there are rgb pixels and digital signals. Similarly, in order to accomplish more sophisticated tasks of natural language such as retrieval, classification, clustering, or semantic search, we need a way to represent text data. That's how text embedding comes in front of the stage." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 1. Background" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Traditional text embedding methods like one-hot encoding and bag-of-words (BoW) represent words and sentences as sparse vectors based on their statistical features, such as word appearance and frequency within a document. More advanced methods like TF-IDF and BM25 improve on these by considering a word's importance across an entire corpus, while n-gram techniques capture word order in small groups. However, these approaches suffer from the \"curse of dimensionality\" and fail to capture semantic similarity like \"cat\" and \"kitty\", difference like \"play the watch\" and \"watch the play\"." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "# example of bag-of-words\n", - "sentence1 = \"I love basketball\"\n", - "sentence2 = \"I have a basketball match\"\n", - "\n", - "words = ['I', 'love', 'basketball', 'have', 'a', 'match']\n", - "sen1_vec = [1, 1, 1, 0, 0, 0]\n", - "sen2_vec = [1, 0, 1, 1, 1, 1]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To overcome these limitations, dense word embeddings were developed, mapping words to vectors in a low-dimensional space that captures semantic and relational information. Early models like Word2Vec demonstrated the power of dense embeddings using neural networks. Subsequent advancements with neural network architectures like RNNs, LSTMs, and Transformers have enabled more sophisticated models such as BERT, RoBERTa, and GPT to excel in capturing complex word relationships and contexts. **BAAI General Embedding (BGE)** provide a series of open-source models that could satisfy all kinds of demands." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Get Embedding" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The first step of modern text retrieval is embedding the text. So let's take a look at how to use the embedding models." 
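Editor's note: before turning to dense models, it may help to see the sparse baseline mentioned in the background section above in code. The sketch below uses scikit-learn's `TfidfVectorizer` (an extra dependency not used elsewhere in this tutorial) on the two bag-of-words example sentences:

```python
from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["I love basketball", "I have a basketball match"]

vectorizer = TfidfVectorizer()
tfidf = vectorizer.fit_transform(docs)      # sparse matrix of TF-IDF weights
print(vectorizer.get_feature_names_out())   # the learned vocabulary
print((tfidf @ tfidf.T).toarray())          # rows are L2-normalized, so this is cosine similarity
```

Such vectors only overlap on exact word matches, which is precisely the limitation the dense embedding models below are meant to address.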
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Install the packages:" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "%%capture\n", - "%pip install -U FlagEmbedding sentence_transformers openai cohere" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We'll use the following three sentences as the inputs:" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "sentences = [\n", - " \"That is a happy dog\",\n", - " \"That is a very happy person\",\n", - " \"Today is a sunny day\",\n", - "]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Open-source Models" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "A huge portion of embedding models are in the open source community. The advantages of open-source models include:\n", - "- Free, no extra cost. But make sure to check the License and your use case before using.\n", - "- No frequency limit, can accelerate a lot if you have enough GPUs to parallelize.\n", - "- Transparent and might be reproducible.\n", - "\n", - "Let's take a look at two representatives:" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### BGE" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "BGE is a series of embedding models and rerankers published by BAAI. Several of them reached SOTA at the time they released." - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Embeddings:\n", - "(3, 768)\n", - "Similarity scores:\n", - "[[1. 0.7900386 0.57525384]\n", - " [0.7900386 0.9999998 0.59190154]\n", - " [0.57525384 0.59190154 0.99999994]]\n" - ] - } - ], - "source": [ - "from FlagEmbedding import FlagModel\n", - "\n", - "# Load BGE model\n", - "model = FlagModel('BAAI/bge-base-en-v1.5')\n", - "\n", - "# encode the queries and corpus\n", - "embeddings = model.encode(sentences)\n", - "print(f\"Embeddings:\\n{embeddings.shape}\")\n", - "\n", - "scores = embeddings @ embeddings.T\n", - "print(f\"Similarity scores:\\n{scores}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Sentence Transformers" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Sentence Transformers is a library for sentence embeddings with a huge amount of embedding models and datasets for related tasks." - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Embeddings:\n", - "(3, 384)\n", - "Similarity scores:\n", - "[[0.99999976 0.6210502 0.24906276]\n", - " [0.6210502 0.9999997 0.21061528]\n", - " [0.24906276 0.21061528 0.9999999 ]]\n" - ] - } - ], - "source": [ - "from sentence_transformers import SentenceTransformer\n", - "\n", - "model = SentenceTransformer(\"all-MiniLM-L6-v2\")\n", - "\n", - "embeddings = model.encode(sentences, normalize_embeddings=True)\n", - "print(f\"Embeddings:\\n{embeddings.shape}\")\n", - "\n", - "scores = embeddings @ embeddings.T\n", - "print(f\"Similarity scores:\\n{scores}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Commercial Models" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "There are also plenty choices of commercial models. 
They have the advantages of:\n", - "- Efficient memory usage, fast inference with no need of GPUs.\n", - "- Systematic support, commercial models have closer connections with their other products.\n", - "- Better training data, commercial models might be trained on larger, higher-quality datasets than some open-source models." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### OpenAI" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Along with GPT series, OpenAI has their own embedding models. Make sure to fill in your own API key in the field `\"YOUR_API_KEY\"`" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "import numpy as np\n", - "\n", - "os.environ[\"OPENAI_API_KEY\"] = \"YOUR_API_KEY\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Then run the following cells to get the embeddings. Check their official [documentation](https://platform.openai.com/docs/guides/embeddings) for more details." - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [], - "source": [ - "from openai import OpenAI\n", - "\n", - "client = OpenAI()\n", - "\n", - "response = client.embeddings.create(input = sentences, model=\"text-embedding-3-small\")" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Embeddings:\n", - "(3, 1536)\n", - "Similarity scores:\n", - "[[1.00000004 0.697673 0.34739798]\n", - " [0.697673 1.00000005 0.31969923]\n", - " [0.34739798 0.31969923 0.99999998]]\n" - ] - } - ], - "source": [ - "embeddings = np.asarray([response.data[i].embedding for i in range(len(sentences))])\n", - "print(f\"Embeddings:\\n{embeddings.shape}\")\n", - "\n", - "scores = embeddings @ embeddings.T\n", - "print(f\"Similarity scores:\\n{scores}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Voyage AI" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Voyage AI provides embedding models and rerankers for different purpus and in various fields. Their API keys can be freely used in low frequency and token length." - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": {}, - "outputs": [], - "source": [ - "os.environ[\"VOYAGE_API_KEY\"] = \"YOUR_API_KEY\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Check their official [documentation](https://docs.voyageai.com/docs/api-key-and-installation) for more details." 
- ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": {}, - "outputs": [], - "source": [ - "import voyageai\n", - "\n", - "vo = voyageai.Client()\n", - "\n", - "result = vo.embed(sentences, model=\"voyage-large-2-instruct\")" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Embeddings:\n", - "(3, 1024)\n", - "Similarity scores:\n", - "[[0.99999997 0.87282517 0.63276503]\n", - " [0.87282517 0.99999998 0.64720015]\n", - " [0.63276503 0.64720015 0.99999999]]\n" - ] - } - ], - "source": [ - "embeddings = np.asarray(result.embeddings)\n", - "print(f\"Embeddings:\\n{embeddings.shape}\")\n", - "\n", - "scores = embeddings @ embeddings.T\n", - "print(f\"Similarity scores:\\n{scores}\")" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "base", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.13" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/docs/source/tutorial/1_Embedding/1.2.1.ipynb b/docs/source/tutorial/1_Embedding/1.2.1.ipynb deleted file mode 100644 index 39d5cf07..00000000 --- a/docs/source/tutorial/1_Embedding/1.2.1.ipynb +++ /dev/null @@ -1,486 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "06cff9e4", - "metadata": {}, - "source": [ - "# BGE Series" - ] - }, - { - "cell_type": "markdown", - "id": "880e229d", - "metadata": {}, - "source": [ - "In this Part, we will walk through the BGE series and introduce how to use the BGE embedding models." - ] - }, - { - "cell_type": "markdown", - "id": "2516fd49", - "metadata": {}, - "source": [ - "## 1. BAAI General Embedding" - ] - }, - { - "cell_type": "markdown", - "id": "2113ee71", - "metadata": {}, - "source": [ - "BGE stands for BAAI General Embedding, it's a series of embeddings models developed and published by Beijing Academy of Artificial Intelligence (BAAI)." - ] - }, - { - "cell_type": "markdown", - "id": "16515b99", - "metadata": {}, - "source": [ - "A full support of APIs and related usages of BGE is maintained in [FlagEmbedding](https://github.com/FlagOpen/FlagEmbedding) on GitHub.\n", - "\n", - "Run the following cell to install FlagEmbedding in your environment." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "88095fd0", - "metadata": {}, - "outputs": [], - "source": [ - "%%capture\n", - "%pip install -U FlagEmbedding" - ] - }, - { - "cell_type": "markdown", - "id": "bc6e30a0", - "metadata": {}, - "source": [ - "The collection of BGE models can be found in [Huggingface collection](https://huggingface.co/collections/BAAI/bge-66797a74476eb1f085c7446d)." - ] - }, - { - "cell_type": "markdown", - "id": "67a16ccf", - "metadata": {}, - "source": [ - "## 2. BGE Series Models" - ] - }, - { - "cell_type": "markdown", - "id": "2e10034a", - "metadata": {}, - "source": [ - "### 2.1 BGE" - ] - }, - { - "cell_type": "markdown", - "id": "0cdc6702", - "metadata": {}, - "source": [ - "The very first version of BGE has 6 models, with 'large', 'base', and 'small' for English and Chinese. 
" - ] - }, - { - "cell_type": "markdown", - "id": "04b75f72", - "metadata": {}, - "source": [ - "| Model | Language | Parameters | Model Size | Description | Base Model |\n", - "|:-------|:--------:|:--------------:|:--------------:|:-----------------:|:----------------:|\n", - "| [BAAI/bge-large-en](https://huggingface.co/BAAI/bge-large-en) | English | 335M | 1.34 GB | Embedding Model which map text into vector | BERT |\n", - "| [BAAI/bge-base-en](https://huggingface.co/BAAI/bge-base-en) | English | 109M | 438 MB | a base-scale model but with similar ability to `bge-large-en` | BERT |\n", - "| [BAAI/bge-small-en](https://huggingface.co/BAAI/bge-small-en) | English | 33.4M | 133 MB | a small-scale model but with competitive performance | BERT |\n", - "| [BAAI/bge-large-zh](https://huggingface.co/BAAI/bge-large-zh) | Chinese | 326M | 1.3 GB | Embedding Model which map text into vector | BERT |\n", - "| [BAAI/bge-base-zh](https://huggingface.co/BAAI/bge-base-zh) | Chinese | 102M | 409 MB | a base-scale model but with similar ability to `bge-large-zh` | BERT |\n", - "| [BAAI/bge-small-zh](https://huggingface.co/BAAI/bge-small-zh) | Chinese | 24M | 95.8 MB | a small-scale model but with competitive performance | BERT |" - ] - }, - { - "cell_type": "markdown", - "id": "c9c45d17", - "metadata": {}, - "source": [ - "For inference, import FlagModel from FlagEmbedding and initialize the model." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "89e07751", - "metadata": {}, - "outputs": [], - "source": [ - "from FlagEmbedding import FlagModel\n", - "\n", - "# Load BGE model\n", - "model = FlagModel('BAAI/bge-base-en',\n", - " query_instruction_for_retrieval=\"Represent this sentence for searching relevant passages:\",\n", - " use_fp16=True)\n", - "\n", - "queries = [\"query 1\", \"query 2\"]\n", - "corpus = [\"passage 1\", \"passage 2\"]\n", - "\n", - "# encode the queries and corpus\n", - "q_embeddings = model.encode(queries)\n", - "p_embeddings = model.encode(corpus)\n", - "\n", - "# compute the similarity scores\n", - "scores = q_embeddings @ p_embeddings.T\n", - "print(scores)" - ] - }, - { - "cell_type": "markdown", - "id": "6c8e69ed", - "metadata": {}, - "source": [ - "To use `FlagModel`:\n", - "```\n", - "FlagModel.encode(sentences, batch_size=256, max_length=512, convert_to_numpy=True)\n", - "```\n", - "The *encode()* function directly encode the input sentences to embedding vectors.\n", - "```\n", - "FlagModel.encode_queries(sentences, batch_size=256, max_length=512, convert_to_numpy=True)\n", - "```\n", - "The *encode_queries()* function concatenate the `query_instruction_for_retrieval` with each of the input query, and then call `encode()`." - ] - }, - { - "cell_type": "markdown", - "id": "2c86a5a3", - "metadata": {}, - "source": [ - "### 2.2 BGE v1.5" - ] - }, - { - "cell_type": "markdown", - "id": "454ff7aa", - "metadata": {}, - "source": [ - "BGE 1.5 alleviate the issue of the similarity distribution, and enhance retrieval ability without instruction." 
- ] - }, - { - "cell_type": "markdown", - "id": "30b1f897", - "metadata": {}, - "source": [ - "| Model | Language | Parameters | Model Size | Description | Base Model |\n", - "|:-------|:--------:|:--------------:|:--------------:|:-----------------:|:----------------:|\n", - "| [BAAI/bge-large-en-v1.5](https://huggingface.co/BAAI/bge-large-en-v1.5) | English | 335M | 1.34 GB | version 1.5 with more reasonable similarity distribution | BERT |\n", - "| [BAAI/bge-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5) | English | 109M | 438 MB | version 1.5 with more reasonable similarity distribution | BERT |\n", - "| [BAAI/bge-small-en-v1.5](https://huggingface.co/BAAI/bge-small-en-v1.5) | English | 33.4M | 133 MB | version 1.5 with more reasonable similarity distribution | BERT |\n", - "| [BAAI/bge-large-zh-v1.5](https://huggingface.co/BAAI/bge-large-zh-v1.5) | Chinese | 326M | 1.3 GB | version 1.5 with more reasonable similarity distribution | BERT |\n", - "| [BAAI/bge-base-zh-v1.5](https://huggingface.co/BAAI/bge-base-zh-v1.5) | Chinese | 102M | 409 MB | version 1.5 with more reasonable similarity distribution | BERT |\n", - "| [BAAI/bge-small-zh-v1.5](https://huggingface.co/BAAI/bge-small-zh-v1.5) | Chinese | 24M | 95.8 MB | version 1.5 with more reasonable similarity distribution | BERT |" - ] - }, - { - "cell_type": "markdown", - "id": "ed00c504", - "metadata": {}, - "source": [ - "BGE 1.5 models shares the same API of `FlagModel` with BGE models." - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "9b17afcc", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[[0.736794 0.5989914]\n", - " [0.5684842 0.7461165]]\n" - ] - } - ], - "source": [ - "model = FlagModel('BAAI/bge-base-en-v1.5',\n", - " query_instruction_for_retrieval=\"Represent this sentence for searching relevant passages:\",\n", - " use_fp16=True)\n", - "\n", - "queries = [\"query 1\", \"query 2\"]\n", - "corpus = [\"passage 1\", \"passage 2\"]\n", - "\n", - "# encode the queries and corpus\n", - "q_embeddings = model.encode(queries)\n", - "p_embeddings = model.encode(corpus)\n", - "\n", - "# compute the similarity scores\n", - "scores = q_embeddings @ p_embeddings.T\n", - "print(scores)" - ] - }, - { - "cell_type": "markdown", - "id": "38c3ce1c", - "metadata": {}, - "source": [ - "### 2.3 LLM-Embedder" - ] - }, - { - "cell_type": "markdown", - "id": "1bc3fee0", - "metadata": {}, - "source": [ - "LLM-Embedder is a unified embedding model supporting diverse retrieval augmentation needs for LLMs. 
It is fine-tuned over 6 tasks:\n", - "- Question Answering (qa)\n", - "- Conversational Search (convsearch)\n", - "- Long Conversation (chat)\n", - "- Long-Rnage Language Modeling (lrlm)\n", - "- In-Context Learning (icl)\n", - "- Tool Learning (tool)" - ] - }, - { - "cell_type": "markdown", - "id": "13b926e9", - "metadata": {}, - "source": [ - "| Model | Language | Parameters | Model Size | Description | Base Model |\n", - "|:-------|:--------:|:--------------:|:--------------:|:-----------------:|:----------------:|\n", - "| [BAAI/llm-embedder](https://huggingface.co/BAAI/llm-embedder) | English | 109M | 438 MB | a unified embedding model to support diverse retrieval augmentation needs for LLMs | BERT |" - ] - }, - { - "cell_type": "markdown", - "id": "a7b3f109", - "metadata": {}, - "source": [ - "To use `LLMEmbedder`:\n", - "```python\n", - "LLMEmbedder.encode_queries(\n", - " queries, \n", - " batch_size=256, \n", - " max_length=256, \n", - " task='qa'\n", - ")\n", - "```\n", - "The *encode_queries()* will call the *_encode()* functions (similar to the *encode()* in `FlagModel`) and add the corresponding query instruction of the given *task* in front of each of the input *queries*.\n", - "```python\n", - "LLMEmbedder.encode_keys(\n", - " keys, \n", - " batch_size=256, \n", - " max_length=512, \n", - " task='qa'\n", - ")\n", - "```\n", - "Similarly, *encode_keys()* also calls *_encode()* and automatically add instructions according to given task." - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "5f077420", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[[0.89705944 0.85341793]\n", - " [0.8462474 0.90914035]]\n" - ] - } - ], - "source": [ - "from FlagEmbedding import LLMEmbedder\n", - "\n", - "# load the LLMEmbedder model\n", - "model = LLMEmbedder('BAAI/llm-embedder', use_fp16=False)\n", - "\n", - "# Define queries and keys\n", - "queries = [\"test query 1\", \"test query 2\"]\n", - "keys = [\"test key 1\", \"test key 2\"]\n", - "\n", - "# Encode for a specific task (qa, icl, chat, lrlm, tool, convsearch)\n", - "task = \"qa\"\n", - "query_embeddings = model.encode_queries(queries, task=task)\n", - "key_embeddings = model.encode_keys(keys, task=task)\n", - "\n", - "# compute the similarity scores\n", - "similarity = query_embeddings @ key_embeddings.T\n", - "print(similarity)" - ] - }, - { - "cell_type": "markdown", - "id": "dcf2a82b", - "metadata": {}, - "source": [ - "### 2.4 BGE M3" - ] - }, - { - "cell_type": "markdown", - "id": "cc5b5a5e", - "metadata": {}, - "source": [ - "BGE-M3 is the new version of BGE models that is distinguished for its versatility in:\n", - "- Multi-Functionality: Simultaneously perform the three common retrieval functionalities of embedding model: dense retrieval, multi-vector retrieval, and sparse retrieval.\n", - "- Multi-Linguality: Supports more than 100 working languages.\n", - "- Multi-Granularity: Can proces inputs with different granularityies, spanning from short sentences to long documents of up to 8192 tokens.\n", - "\n", - "For more details, feel free to check out the [paper](https://arxiv.org/pdf/2402.03216)." 
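Editor's note: roughly speaking (following the description in the BGE-M3 paper), the sparse output is scored by summing the products of weights for tokens that two texts share, while the dense output is scored with an inner product. The helper below is an illustrative sketch of that lexical matching, not a FlagEmbedding API, meant to be used with the `lexical_weights` dictionaries produced by `encode(..., return_sparse=True)` further down:

```python
def lexical_matching_score(weights_1: dict, weights_2: dict) -> float:
    """Illustrative sketch: sum of weight products over token ids shared by both texts."""
    return sum(float(w) * float(weights_2[t]) for t, w in weights_1.items() if t in weights_2)

# Example usage with the output of encode(..., return_sparse=True) shown below:
# lexical_matching_score(embeddings['lexical_weights'][0], embeddings['lexical_weights'][1])
```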
- ] - }, - { - "cell_type": "markdown", - "id": "41348e03", - "metadata": {}, - "source": [ - "| Model | Language | Parameters | Model Size | Description | Base Model |\n", - "|:-------|:--------:|:--------------:|:--------------:|:-----------------:|:----------------:|\n", - "| [BAAI/bge-m3](https://huggingface.co/BAAI/bge-m3) | Multilingual | 568M | 2.27 GB | Multi-Functionality(dense retrieval, sparse retrieval, multi-vector(colbert)), Multi-Linguality, and Multi-Granularity(8192 tokens) | XLM-RoBERTa |" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "d4647625", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Fetching 30 files: 100%|██████████| 30/30 [00:00<00:00, 228780.22it/s]\n" - ] - } - ], - "source": [ - "from FlagEmbedding import BGEM3FlagModel\n", - "\n", - "model = BGEM3FlagModel('BAAI/bge-m3', use_fp16=True)\n", - "\n", - "sentences = [\"What is BGE M3?\", \"Defination of BM25\"]" - ] - }, - { - "cell_type": "markdown", - "id": "1f89f1a9", - "metadata": {}, - "source": [ - "```python\n", - "BGEM3FlagModel.encode(\n", - " sentences, \n", - " batch_size=12, \n", - " max_length=8192, \n", - " return_dense=True, \n", - " return_sparse=False, \n", - " return_colbert_vecs=False\n", - ")\n", - "```\n", - "It returns a dictionary like:\n", - "```python\n", - "{\n", - " 'dense_vecs': 'array of dense embeddings of inputs if return_dense=True, otherwise None,'\n", - " 'lexical_weights': 'array of dictionaries with keys and values are ids of tokens and their corresponding weights if return_sparse=True, otherwise None,'\n", - " 'colbert_vecs': 'array of multi-vector embeddings of inputs if return_cobert_vecs=True, otherwise None,'\n", - "}\n", - "```" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "f0b11cf0", - "metadata": {}, - "outputs": [], - "source": [ - "# If you don't need such a long length of 8192 input tokens, you can set max_length to a smaller value to speed up encoding.\n", - "embeddings = model.encode(\n", - " sentences, \n", - " max_length=10,\n", - " return_dense=True, \n", - " return_sparse=True, \n", - " return_colbert_vecs=True\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "72cba126", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "dense embedding:\n", - "[[-0.03411707 -0.04707828 -0.00089447 ... 0.04828531 0.00755427\n", - " -0.02961654]\n", - " [-0.01041734 -0.04479263 -0.02429199 ... 
-0.00819298 0.01503995\n", - " 0.01113793]]\n", - "sparse embedding:\n", - "[defaultdict(, {'4865': 0.08362077, '83': 0.081469566, '335': 0.12964639, '11679': 0.25186998, '276': 0.17001738, '363': 0.26957875, '32': 0.040755156}), defaultdict(, {'262': 0.050144322, '5983': 0.13689369, '2320': 0.045134712, '111': 0.06342201, '90017': 0.25167602, '2588': 0.33353207})]\n", - "multi-vector:\n", - "[array([[-8.6726490e-03, -4.8921868e-02, -3.0449261e-03, ...,\n", - " -2.2082448e-02, 5.7268854e-02, 1.2811369e-02],\n", - " [-8.8765034e-03, -4.6860173e-02, -9.5845405e-03, ...,\n", - " -3.1404708e-02, 5.3911421e-02, 6.8714428e-03],\n", - " [ 1.8445771e-02, -4.2359587e-02, 8.6754939e-04, ...,\n", - " -1.9803897e-02, 3.8384371e-02, 7.6852231e-03],\n", - " ...,\n", - " [-2.5543230e-02, -1.6561864e-02, -4.2125367e-02, ...,\n", - " -4.5030322e-02, 4.4091221e-02, -1.0043185e-02],\n", - " [ 4.9905590e-05, -5.5475257e-02, 8.4884483e-03, ...,\n", - " -2.2911752e-02, 6.0379632e-02, 9.3577225e-03],\n", - " [ 2.5895271e-03, -2.9331330e-02, -1.8961012e-02, ...,\n", - " -8.0389353e-03, 3.2842189e-02, 4.3894034e-02]], dtype=float32), array([[ 0.01715658, 0.03835309, -0.02311821, ..., 0.00146474,\n", - " 0.02993429, -0.05985384],\n", - " [ 0.00996143, 0.039217 , -0.03855301, ..., 0.00599566,\n", - " 0.02722942, -0.06509776],\n", - " [ 0.01777726, 0.03919311, -0.01709837, ..., 0.00805702,\n", - " 0.03988946, -0.05069073],\n", - " ...,\n", - " [ 0.05474931, 0.0075684 , 0.00329455, ..., -0.01651684,\n", - " 0.02397249, 0.00368039],\n", - " [ 0.0093503 , 0.05022853, -0.02385841, ..., 0.02575599,\n", - " 0.00786822, -0.03260205],\n", - " [ 0.01805054, 0.01337725, 0.00016697, ..., 0.01843987,\n", - " 0.01374448, 0.00310114]], dtype=float32)]\n" - ] - } - ], - "source": [ - "print(f\"dense embedding:\\n{embeddings['dense_vecs']}\")\n", - "print(f\"sparse embedding:\\n{embeddings['lexical_weights']}\")\n", - "print(f\"multi-vector:\\n{embeddings['colbert_vecs']}\")" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.13" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/docs/source/tutorial/1_Embedding/1.2.2.ipynb b/docs/source/tutorial/1_Embedding/1.2.2.ipynb deleted file mode 100644 index dbe94b89..00000000 --- a/docs/source/tutorial/1_Embedding/1.2.2.ipynb +++ /dev/null @@ -1,419 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# BGE Explanation" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In this section, we will go through BGE and BGE-v1.5's structure and how they generate embeddings." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 0. Installation" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Install the required packages in your environment." - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [], - "source": [ - "%%capture\n", - "%pip install -U transformers FlagEmbedding" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 1. 
Encode sentences" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To know how exactly a sentence is encoded, let's first load the tokenizer and model from HF transformers instead of FlagEmbedding" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [], - "source": [ - "from transformers import AutoTokenizer, AutoModel\n", - "import torch\n", - "\n", - "tokenizer = AutoTokenizer.from_pretrained(\"BAAI/bge-base-en-v1.5\")\n", - "model = AutoModel.from_pretrained(\"BAAI/bge-base-en-v1.5\")\n", - "\n", - "sentences = [\"embedding\", \"I love machine learning and nlp\"]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Run the following cell to check the model of bge-base-en-v1.5. It has the exactly same structure of BERT-base, 12 encoder layers and hidden dimension of 768.\n", - "\n", - "Note that the corresponding models of BGE and BGE-v1.5 have same structures. For example, bge-base-en and bge-base-en-v1.5 have the same structure." - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "BertModel(\n", - " (embeddings): BertEmbeddings(\n", - " (word_embeddings): Embedding(30522, 768, padding_idx=0)\n", - " (position_embeddings): Embedding(512, 768)\n", - " (token_type_embeddings): Embedding(2, 768)\n", - " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (encoder): BertEncoder(\n", - " (layer): ModuleList(\n", - " (0-11): 12 x BertLayer(\n", - " (attention): BertAttention(\n", - " (self): BertSelfAttention(\n", - " (query): Linear(in_features=768, out_features=768, bias=True)\n", - " (key): Linear(in_features=768, out_features=768, bias=True)\n", - " (value): Linear(in_features=768, out_features=768, bias=True)\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (output): BertSelfOutput(\n", - " (dense): Linear(in_features=768, out_features=768, bias=True)\n", - " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " )\n", - " (intermediate): BertIntermediate(\n", - " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", - " (intermediate_act_fn): GELUActivation()\n", - " )\n", - " (output): BertOutput(\n", - " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", - " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " )\n", - " )\n", - " )\n", - " (pooler): BertPooler(\n", - " (dense): Linear(in_features=768, out_features=768, bias=True)\n", - " (activation): Tanh()\n", - " )\n", - ")" - ] - }, - "execution_count": 20, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "model.eval()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "First, let's tokenize the sentences." 
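Editor's note: since the text above describes bge-base-en-v1.5 as BERT-base sized, you can confirm the 12 encoder layers and 768-dimensional hidden size directly from the loaded Hugging Face config:

```python
# Confirm the architecture described above (BERT-base: 12 layers, hidden size 768).
print(model.config.num_hidden_layers)  # 12
print(model.config.hidden_size)        # 768
```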
- ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'input_ids': tensor([[ 101, 7861, 8270, 4667, 102, 0, 0, 0, 0],\n", - " [ 101, 1045, 2293, 3698, 4083, 1998, 17953, 2361, 102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0],\n", - " [0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 0, 0, 0, 0],\n", - " [1, 1, 1, 1, 1, 1, 1, 1, 1]])}" - ] - }, - "execution_count": 21, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "inputs = tokenizer(\n", - " sentences, \n", - " padding=True, \n", - " truncation=True, \n", - " return_tensors='pt', \n", - " max_length=512\n", - ")\n", - "inputs" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "From the results, we can see that each sentence begins with token 101 and ends with 102, they are the `[CLS]` and `[SEP]` special token used in BERT." - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "torch.Size([2, 9, 768])" - ] - }, - "execution_count": 22, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "last_hidden_state = model(**inputs, return_dict=True).last_hidden_state\n", - "last_hidden_state.shape" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Here we implement the pooling function, with two choices of using `[CLS]`'s last hidden state, or the mean pooling of the whole last hidden state." - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": {}, - "outputs": [], - "source": [ - "def pooling(last_hidden_state: torch.Tensor, pooling_method='cls', attention_mask: torch.Tensor = None):\n", - " if pooling_method == 'cls':\n", - " return last_hidden_state[:, 0]\n", - " elif pooling_method == 'mean':\n", - " s = torch.sum(last_hidden_state * attention_mask.unsqueeze(-1).float(), dim=1)\n", - " d = attention_mask.sum(dim=1, keepdim=True).float()\n", - " return s / d" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Different from more commonly used mean pooling, BGE is trained to use the last hidden state of `[CLS]` as the sentence embedding: \n", - "\n", - "`sentence_embeddings = model_output[0][:, 0]`\n", - "\n", - "If you use mean pooling, there will be a significant decrease in performance. Therefore, make sure to use the correct method to obtain sentence vectors." 
- ] - }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "torch.Size([2, 768])" - ] - }, - "execution_count": 24, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "embeddings = pooling(\n", - " last_hidden_state, \n", - " pooling_method='cls', \n", - " attention_mask=inputs['attention_mask']\n", - ")\n", - "embeddings.shape" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Assembling them together, we get the whole encoding function:" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "metadata": {}, - "outputs": [], - "source": [ - "def _encode(sentences, max_length=512, convert_to_numpy=True):\n", - "\n", - " # handle the case of single sentence and a list of sentences\n", - " input_was_string = False\n", - " if isinstance(sentences, str):\n", - " sentences = [sentences]\n", - " input_was_string = True\n", - "\n", - " inputs = tokenizer(\n", - " sentences, \n", - " padding=True, \n", - " truncation=True, \n", - " return_tensors='pt', \n", - " max_length=max_length\n", - " )\n", - "\n", - " last_hidden_state = model(**inputs, return_dict=True).last_hidden_state\n", - " \n", - " embeddings = pooling(\n", - " last_hidden_state, \n", - " pooling_method='cls', \n", - " attention_mask=inputs['attention_mask']\n", - " )\n", - "\n", - " # normalize the embedding vectors\n", - " embeddings = torch.nn.functional.normalize(embeddings, dim=-1)\n", - "\n", - " # convert to numpy if needed\n", - " if convert_to_numpy:\n", - " embeddings = embeddings.detach().numpy()\n", - "\n", - " return embeddings[0] if input_was_string else embeddings" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 2. Comparison" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now let's run the function we wrote to get the embeddings of the two sentences:" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Embeddings:\n", - "[[ 1.4549762e-02 -9.6840411e-03 3.7761475e-03 ... -8.5092714e-04\n", - " 2.8417887e-02 6.3214332e-02]\n", - " [ 3.3924331e-05 -3.2998275e-03 1.7206438e-02 ... 3.5703944e-03\n", - " 1.8721525e-02 -2.0371782e-02]]\n", - "Similarity scores:\n", - "[[0.9999997 0.6077381]\n", - " [0.6077381 0.9999999]]\n" - ] - } - ], - "source": [ - "embeddings = _encode(sentences)\n", - "print(f\"Embeddings:\\n{embeddings}\")\n", - "\n", - "scores = embeddings @ embeddings.T\n", - "print(f\"Similarity scores:\\n{scores}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Then, run the API provided in FlagEmbedding:" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Embeddings:\n", - "[[ 1.4549762e-02 -9.6840411e-03 3.7761475e-03 ... -8.5092714e-04\n", - " 2.8417887e-02 6.3214332e-02]\n", - " [ 3.3924331e-05 -3.2998275e-03 1.7206438e-02 ... 
3.5703944e-03\n", - " 1.8721525e-02 -2.0371782e-02]]\n", - "Similarity scores:\n", - "[[0.9999997 0.6077381]\n", - " [0.6077381 0.9999999]]\n" - ] - } - ], - "source": [ - "from FlagEmbedding import FlagModel\n", - "\n", - "model = FlagModel('BAAI/bge-base-en-v1.5')\n", - "\n", - "embeddings = model.encode(sentences)\n", - "print(f\"Embeddings:\\n{embeddings}\")\n", - "\n", - "scores = embeddings @ embeddings.T\n", - "print(f\"Similarity scores:\\n{scores}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "As we expect, the two encoding functions return exactly the same results. The full implementation in FlagEmbedding handles large datasets by batching and contains GPU support and parallelization. Feel free to check the [source code](https://github.com/FlagOpen/FlagEmbedding/blob/master/FlagEmbedding/flag_models.py#L370) for more details." - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "base", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.13" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/docs/source/tutorial/1_Embedding/1.2.3.ipynb b/docs/source/tutorial/1_Embedding/1.2.3.ipynb deleted file mode 100644 index b691f499..00000000 --- a/docs/source/tutorial/1_Embedding/1.2.3.ipynb +++ /dev/null @@ -1,414 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# BGE-M3" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 0. Installation" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Install the required packages in your environment." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "%%capture\n", - "%pip install -U transformers FlagEmbedding accelerate" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 1. BGE-M3 structure" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "from transformers import AutoTokenizer, AutoModel\n", - "import torch, os\n", - "\n", - "tokenizer = AutoTokenizer.from_pretrained(\"BAAI/bge-m3\")\n", - "raw_model = AutoModel.from_pretrained(\"BAAI/bge-m3\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The base model of BGE-M3 is [XLM-RoBERTa-large](https://huggingface.co/FacebookAI/xlm-roberta-large), which is a multilingual version of RoBERTa." 
- ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "XLMRobertaModel(\n", - " (embeddings): XLMRobertaEmbeddings(\n", - " (word_embeddings): Embedding(250002, 1024, padding_idx=1)\n", - " (position_embeddings): Embedding(8194, 1024, padding_idx=1)\n", - " (token_type_embeddings): Embedding(1, 1024)\n", - " (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (encoder): XLMRobertaEncoder(\n", - " (layer): ModuleList(\n", - " (0-23): 24 x XLMRobertaLayer(\n", - " (attention): XLMRobertaAttention(\n", - " (self): XLMRobertaSelfAttention(\n", - " (query): Linear(in_features=1024, out_features=1024, bias=True)\n", - " (key): Linear(in_features=1024, out_features=1024, bias=True)\n", - " (value): Linear(in_features=1024, out_features=1024, bias=True)\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (output): XLMRobertaSelfOutput(\n", - " (dense): Linear(in_features=1024, out_features=1024, bias=True)\n", - " (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " )\n", - " (intermediate): XLMRobertaIntermediate(\n", - " (dense): Linear(in_features=1024, out_features=4096, bias=True)\n", - " (intermediate_act_fn): GELUActivation()\n", - " )\n", - " (output): XLMRobertaOutput(\n", - " (dense): Linear(in_features=4096, out_features=1024, bias=True)\n", - " (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " )\n", - " )\n", - " )\n", - " (pooler): XLMRobertaPooler(\n", - " (dense): Linear(in_features=1024, out_features=1024, bias=True)\n", - " (activation): Tanh()\n", - " )\n", - ")" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "raw_model.eval()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 2. Multi-Functionality" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Fetching 30 files: 100%|██████████| 30/30 [00:00<00:00, 240131.91it/s]\n" - ] - } - ], - "source": [ - "from FlagEmbedding import BGEM3FlagModel\n", - "\n", - "model = BGEM3FlagModel('BAAI/bge-m3', use_fp16=True)\n", - "\n", - "sentences_1 = [\"What is BGE M3?\", \"Defination of BM25\"]\n", - "sentences_2 = [\"BGE M3 is an embedding model supporting dense retrieval, lexical matching and multi-vector interaction.\", \n", - " \"BM25 is a bag-of-words retrieval function that ranks a set of documents based on the query terms appearing in each document\"]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 2.1 Dense Retrieval" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Using BGE M3 for dense embedding has similar steps to BGE or BGE 1.5 models.\n", - "\n", - "Use the normalized hidden state of the special token [CLS] as the embedding:\n", - "\n", - "$$e_q = norm(H_q[0])$$\n", - "\n", - "Then compute the relevance score between the query and passage:\n", - "\n", - "$$s_{dense}=f_{sim}(e_p, e_q)$$\n", - "\n", - "where $e_p, e_q$ are the embedding vectors of passage and query, respectively.\n", - "\n", - "$f_{sim}$ is the score function (such as inner product and L2 distance) for comupting two embeddings' similarity." 
- ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[[0.6259035 0.34749585]\n", - " [0.349868 0.6782462 ]]\n" - ] - } - ], - "source": [ - "# If you don't need such a long length of 8192 input tokens, you can set max_length to a smaller value to speed up encoding.\n", - "embeddings_1 = model.encode(sentences_1, max_length=10)['dense_vecs']\n", - "embeddings_2 = model.encode(sentences_2, max_length=100)['dense_vecs']\n", - "\n", - "# compute the similarity scores\n", - "s_dense = embeddings_1 @ embeddings_2.T\n", - "print(s_dense)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 2.2 Sparse Retrieval" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Set `return_sparse` to `True` to make the model return sparse vectors. If a term token appears multiple times in the sentence, we only retain its max weight.\n", - "\n", - "BGE-M3 generates sparse embeddings by adding a linear layer and a ReLU activation function following the hidden states:\n", - "\n", - "$$w_{qt} = \\text{Relu}(W_{lex}^T H_q [i])$$\n", - "\n", - "where $W_{lex}$ represents the weights of the linear layer and $H_q[i]$ is the encoder's output of the $i^{th}$ token." - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[{'What': 0.08362077, 'is': 0.081469566, 'B': 0.12964639, 'GE': 0.25186998, 'M': 0.17001738, '3': 0.26957875, '?': 0.040755156}, {'De': 0.050144322, 'fin': 0.13689369, 'ation': 0.045134712, 'of': 0.06342201, 'BM': 0.25167602, '25': 0.33353207}]\n" - ] - } - ], - "source": [ - "output_1 = model.encode(sentences_1, return_sparse=True)\n", - "output_2 = model.encode(sentences_2, return_sparse=True)\n", - "\n", - "# you can see the weight for each token:\n", - "print(model.convert_id_to_token(output_1['lexical_weights']))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Based on the tokens' weights of query and passage, the relevance score between them is computed by the joint importance of the co-existing terms within the query and passage:\n", - "\n", - "$$s_{lex} = \\sum_{t\\in q\\cap p}(w_{qt} * w_{pt})$$\n", - "\n", - "where $w_{qt}, w_{pt}$ are the importance weights of each co-existing term $t$ in the query and passage, respectively." - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "0.19554448500275612\n", - "0.00880391988903284\n" - ] - } - ], - "source": [ - "# compute the scores via lexical matching\n", - "s_lex_10_20 = model.compute_lexical_matching_score(output_1['lexical_weights'][0], output_2['lexical_weights'][0])\n", - "s_lex_10_21 = model.compute_lexical_matching_score(output_1['lexical_weights'][0], output_2['lexical_weights'][1])\n", - "\n", - "print(s_lex_10_20)\n", - "print(s_lex_10_21)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 2.3 Multi-Vector" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The multi-vector method utilizes the entire output embeddings for the representation of query $E_q$ and passage $E_p$.\n", - "\n", - "$$E_q = norm(W_{mul}^T H_q)$$\n", - "$$E_p = norm(W_{mul}^T H_p)$$\n", - "\n", - "where $W_{mul}$ is the learnable projection matrix."
- ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "(8, 1024)\n", - "(30, 1024)\n" - ] - } - ], - "source": [ - "output_1 = model.encode(sentences_1, return_dense=True, return_sparse=True, return_colbert_vecs=True)\n", - "output_2 = model.encode(sentences_2, return_dense=True, return_sparse=True, return_colbert_vecs=True)\n", - "\n", - "print(f\"({len(output_1['colbert_vecs'][0])}, {len(output_1['colbert_vecs'][0][0])})\")\n", - "print(f\"({len(output_2['colbert_vecs'][0])}, {len(output_2['colbert_vecs'][0][0])})\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Following ColBERT, we use late interaction to compute the fine-grained relevance score:\n", - "\n", - "$$s_{mul}=\\frac{1}{N}\\sum_{i=1}^N\\max_{j=1}^M E_q[i]\\cdot E_p^T[j]$$\n", - "\n", - "where $E_q, E_p$ are the entire output embeddings of query and passage, respectively.\n", - "\n", - "That is, for each vector $v\\in E_q$ we take its maximum similarity with the vectors in $E_p$, and then average these maxima over all the query vectors." - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "0.7796662449836731\n", - "0.4621177911758423\n" - ] - } - ], - "source": [ - "s_mul_10_20 = model.colbert_score(output_1['colbert_vecs'][0], output_2['colbert_vecs'][0]).item()\n", - "s_mul_10_21 = model.colbert_score(output_1['colbert_vecs'][0], output_2['colbert_vecs'][1]).item()\n", - "\n", - "print(s_mul_10_20)\n", - "print(s_mul_10_21)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 2.4 Hybrid Ranking" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "BGE-M3's multi-functionality makes hybrid ranking possible for improving retrieval. First, since the multi-vector method is expensive, we can retrieve the candidate results with either the dense or the sparse method. Then, to get the final result, we can rerank the candidates based on the integrated relevance score:\n", - "\n", - "$$s_{rank} = w_1\\cdot s_{dense}+w_2\\cdot s_{lex} + w_3\\cdot s_{mul}$$\n", - "\n", - "where the values chosen for $w_1, w_2$ and $w_3$ vary depending on the downstream scenario (here 1/3 is just for demonstration)." - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "0.5337047390639782\n", - "0.27280585498859483\n" - ] - } - ], - "source": [ - "s_rank_10_20 = 1/3 * s_dense[0][0] + 1/3 * s_lex_10_20 + 1/3 * s_mul_10_20\n", - "s_rank_10_21 = 1/3 * s_dense[0][1] + 1/3 * s_lex_10_21 + 1/3 * s_mul_10_21\n", - "\n", - "print(s_rank_10_20)\n", - "print(s_rank_10_21)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "base", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.13" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/docs/source/tutorial/2_Metrics.rst b/docs/source/tutorial/2_Metrics.rst deleted file mode 100644 index af97b3de..00000000 --- a/docs/source/tutorial/2_Metrics.rst +++ /dev/null @@ -1,10 +0,0 @@ -2. Metrics -========== - -.. 
toctree:: - :hidden: - :maxdepth: 1 - :caption: Metrics - - 2_Metrics/2.1 - 2_Metrics/2.2 \ No newline at end of file diff --git a/docs/source/tutorial/2_Metrics/2.1.ipynb b/docs/source/tutorial/2_Metrics/2.1.ipynb deleted file mode 100644 index da3ec56c..00000000 --- a/docs/source/tutorial/2_Metrics/2.1.ipynb +++ /dev/null @@ -1,798 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "0d0f87e9-657d-46b9-a3f0-ebc1bf0656bd", - "metadata": {}, - "source": [ - "# Similarity" - ] - }, - { - "cell_type": "markdown", - "id": "00c817d5", - "metadata": {}, - "source": [ - "In this section, we will introduce several different ways to measure similarity." - ] - }, - { - "cell_type": "markdown", - "id": "dae49384-2450-425c-b050-c27d3c07d8e7", - "metadata": { - "tags": [] - }, - "source": [ - "## 1. Jaccard Similarity" - ] - }, - { - "cell_type": "markdown", - "id": "03266267-2d6d-4124-9702-f61e0510586c", - "metadata": {}, - "source": [ - "Before directly calculate the similarity between embedding vectors, let's first take a look at the primal method for measuring how similar two sentenses are: Jaccard similarity.\n", - "\n", - "**Definition:** For sets $A$ and $B$, the Jaccard index, or the Jaccard similarity coefficient between them is the size of their intersection divided by the size of their union:\n", - "$$J(A,B)=\\frac{|A\\cap B|}{|A\\cup B|}$$\n", - "\n", - "The value of $J(A,B)$ falls in the range of $[0, 1]$." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "bed533e1-a17c-4595-bdff-7f4a29e4deb3", - "metadata": { - "execution": { - "iopub.execute_input": "2024-07-17T03:12:47.091346Z", - "iopub.status.busy": "2024-07-17T03:12:47.091019Z", - "iopub.status.idle": "2024-07-17T03:12:47.094401Z", - "shell.execute_reply": "2024-07-17T03:12:47.093967Z", - "shell.execute_reply.started": "2024-07-17T03:12:47.091327Z" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "def jaccard_similarity(sentence1, sentence2):\n", - " set1 = set(sentence1.split(\" \"))\n", - " set2 = set(sentence2.split(\" \"))\n", - " intersection = set1.intersection(set2)\n", - " union = set1.union(set2)\n", - " return len(intersection)/len(union)" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "ea766de8-572d-4eca-91f7-284a121e8edb", - "metadata": { - "ExecutionIndicator": { - "show": true - }, - "execution": { - "iopub.execute_input": "2024-07-17T03:14:06.133012Z", - "iopub.status.busy": "2024-07-17T03:14:06.132502Z", - "iopub.status.idle": "2024-07-17T03:14:06.135483Z", - "shell.execute_reply": "2024-07-17T03:14:06.135044Z", - "shell.execute_reply.started": "2024-07-17T03:14:06.132992Z" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "s1 = \"Hawaii is a wonderful place for holiday\"\n", - "s2 = \"Peter's favorite place to spend his holiday is Hawaii\"\n", - "s3 = \"Anna enjoys baking during her holiday\"" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "b359ff4e-21a1-489a-ad46-ba53e974dc48", - "metadata": { - "execution": { - "iopub.execute_input": "2024-07-17T03:13:34.646320Z", - "iopub.status.busy": "2024-07-17T03:13:34.645942Z", - "iopub.status.idle": "2024-07-17T03:13:34.649389Z", - "shell.execute_reply": "2024-07-17T03:13:34.648998Z", - "shell.execute_reply.started": "2024-07-17T03:13:34.646302Z" - }, - "tags": [] - }, - "outputs": [ - { - "data": { - "text/plain": [ - "0.3333333333333333" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "jaccard_similarity(s1, s2)" - ] - }, - { - 
"cell_type": "code", - "execution_count": 4, - "id": "069868a9-d379-4d55-8a23-835a2972d079", - "metadata": { - "execution": { - "iopub.execute_input": "2024-07-17T03:14:13.727400Z", - "iopub.status.busy": "2024-07-17T03:14:13.726949Z", - "iopub.status.idle": "2024-07-17T03:14:13.730545Z", - "shell.execute_reply": "2024-07-17T03:14:13.730121Z", - "shell.execute_reply.started": "2024-07-17T03:14:13.727381Z" - }, - "tags": [] - }, - "outputs": [ - { - "data": { - "text/plain": [ - "0.08333333333333333" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "jaccard_similarity(s1, s3)" - ] - }, - { - "cell_type": "markdown", - "id": "b0323128", - "metadata": {}, - "source": [ - "We can see that sentence 1 and 2 are sharing 'Hawaii', 'place', and 'holiday'. Thus getting a larger score of similarity (0.333) than that (0.083) of the sentence 1 and 3 that only share 'holiday'." - ] - }, - { - "cell_type": "markdown", - "id": "b509fa6c-87ac-4c59-b40e-fda95fd036d9", - "metadata": { - "tags": [] - }, - "source": [ - "## 2. Euclidean Distance" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "9da366b8-427f-4e8f-b3e6-b453050f0591", - "metadata": { - "ExecutionIndicator": { - "show": true - }, - "execution": { - "iopub.execute_input": "2024-07-17T02:30:37.643857Z", - "iopub.status.busy": "2024-07-17T02:30:37.643302Z", - "iopub.status.idle": "2024-07-17T02:30:37.647921Z", - "shell.execute_reply": "2024-07-17T02:30:37.647513Z", - "shell.execute_reply.started": "2024-07-17T02:30:37.643840Z" - }, - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "tensor([[5., 2., 2., 6.]]) tensor([[4., 6., 6., 4.]])\n" - ] - } - ], - "source": [ - "import torch\n", - "\n", - "A = torch.randint(1, 7, (1, 4), dtype=torch.float32)\n", - "B = torch.randint(1, 7, (1, 4), dtype=torch.float32)\n", - "print(A, B)" - ] - }, - { - "cell_type": "markdown", - "id": "6c068bb3-90ce-4266-8335-e3fb2ad3e996", - "metadata": {}, - "source": [ - "**Definition:** For vectors $A$ and $B$, the Euclidean distance or L2 distance between them is defined as:\n", - "$$d(A, B) = \\|A-B\\|_2 = \\sqrt{\\sum_{i=1}^n (A_i-B_i)^2}$$\n", - "\n", - "The value of $d(A, B)$ falls in the range of [0, $+\\infty$). Since this is the measurement of distance, the closer the value is to 0, the more similar the two vector is. And the larger the value is, the two vectors are more dissimilar." 
- ] - }, - { - "cell_type": "markdown", - "id": "1d6c734d-cc03-4dd1-bb9e-3243006dcff4", - "metadata": {}, - "source": [ - "You can calculate Euclidean distance step by step or directly call *torch.cdist()*" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "0773acf4-eb53-4058-85da-af82af20c469", - "metadata": { - "ExecutionIndicator": { - "show": true - }, - "execution": { - "iopub.execute_input": "2024-07-17T02:32:45.240684Z", - "iopub.status.busy": "2024-07-17T02:32:45.240216Z", - "iopub.status.idle": "2024-07-17T02:32:45.244248Z", - "shell.execute_reply": "2024-07-17T02:32:45.243843Z", - "shell.execute_reply.started": "2024-07-17T02:32:45.240665Z" - }, - "tags": [] - }, - "outputs": [ - { - "data": { - "text/plain": [ - "6.082762718200684" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "dist = torch.sqrt(torch.sum(torch.pow(torch.subtract(A, B), 2), dim=-1))\n", - "dist.item()" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "1dd45446-f7d6-4aab-b078-1d34f0a949e4", - "metadata": { - "ExecutionIndicator": { - "show": true - }, - "execution": { - "iopub.execute_input": "2024-07-17T02:32:57.551560Z", - "iopub.status.busy": "2024-07-17T02:32:57.550896Z", - "iopub.status.idle": "2024-07-17T02:32:57.555031Z", - "shell.execute_reply": "2024-07-17T02:32:57.554638Z", - "shell.execute_reply.started": "2024-07-17T02:32:57.551536Z" - }, - "tags": [] - }, - "outputs": [ - { - "data": { - "text/plain": [ - "6.082762718200684" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "torch.cdist(A, B, p=2).item()" - ] - }, - { - "cell_type": "markdown", - "id": "da4435c0-98da-4397-8a45-c954dd3ada56", - "metadata": {}, - "source": [ - "### (Maximum inner-product search)" - ] - }, - { - "cell_type": "markdown", - "id": "0e0fa5c2-e619-4a0f-a785-9cc209f1503b", - "metadata": { - "tags": [] - }, - "source": [ - "## 3. Cosine Similarity" - ] - }, - { - "cell_type": "markdown", - "id": "790e1ce3-1468-4819-a956-fc8eac690d89", - "metadata": {}, - "source": [ - "For vectors $A$ and $B$, their cosine similarity is defined as:\n", - "$$\\cos(\\theta)=\\frac{A\\cdot B}{\\|A\\|\\|B\\|}$$\n", - "\n", - "The value of $\\cos(\\theta)$ falls in the range of $[-1, 1]$. Different from Euclidean distance, close to -1 denotes not similar at all and close to +1 means very similar." 
- ] - }, - { - "cell_type": "markdown", - "id": "d0a64b4b-5caf-4bee-be0f-2e26b1c7ed6e", - "metadata": { - "tags": [] - }, - "source": [ - "### 3.1 Naive Approach" - ] - }, - { - "cell_type": "markdown", - "id": "350cc48d-6e73-4e20-86dd-c05d1238ef60", - "metadata": {}, - "source": [ - "The naive approach is just expanding the expression:\n", - "$$\\frac{A\\cdot B}{\\|A\\|\\|B\\|}=\\frac{\\sum_{i=1}^{i=n}A_i B_i}{\\sqrt{\\sum_{i=1}^{n}A_i^2}\\cdot\\sqrt{\\sum_{i=1}^{n}B_i^2}}$$" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "20c7cff0-55a7-4222-9e5a-f5450171fb00", - "metadata": { - "ExecutionIndicator": { - "show": true - }, - "execution": { - "iopub.execute_input": "2024-07-17T02:24:35.239550Z", - "iopub.status.busy": "2024-07-17T02:24:35.239073Z", - "iopub.status.idle": "2024-07-17T02:24:35.242844Z", - "shell.execute_reply": "2024-07-17T02:24:35.242417Z", - "shell.execute_reply.started": "2024-07-17T02:24:35.239531Z" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "# Compute the dot product of A and B\n", - "dot_prod = sum(a*b for a, b in zip(A[0], B[0]))\n", - "\n", - "# Compute the magnitude of A and B\n", - "A_norm = torch.sqrt(sum(a*a for a in A[0]))\n", - "B_norm = torch.sqrt(sum(b*b for b in B[0]))" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "f4dce1fb-9cff-4a0d-bc7f-a503be6a37ae", - "metadata": { - "ExecutionIndicator": { - "show": true - }, - "execution": { - "iopub.execute_input": "2024-07-17T02:24:36.533667Z", - "iopub.status.busy": "2024-07-17T02:24:36.533224Z", - "iopub.status.idle": "2024-07-17T02:24:36.536611Z", - "shell.execute_reply": "2024-07-17T02:24:36.536181Z", - "shell.execute_reply.started": "2024-07-17T02:24:36.533650Z" - }, - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "0.802726686000824\n" - ] - } - ], - "source": [ - "cos_1 = dot_prod / (A_norm * B_norm)\n", - "print(cos_1.item())" - ] - }, - { - "cell_type": "markdown", - "id": "4665f38f-c1f1-42dd-914d-d1d69c038e88", - "metadata": { - "tags": [] - }, - "source": [ - "### 3.2 PyTorch Implementation" - ] - }, - { - "cell_type": "markdown", - "id": "6154391d-1dea-4673-8502-b496cf87d4b0", - "metadata": {}, - "source": [ - "The naive approach has few issues:\n", - "- There are chances of losing precision in the numerator and the denominator\n", - "- Losing precision may cause the computed cosine similarity > 1.0\n", - "\n", - "Thus PyTorch uses the following way:\n", - "\n", - "$$\n", - "\\frac{A\\cdot B}{\\|A\\|\\|B\\|}=\\frac{A}{\\|A\\|}\\cdot\\frac{B}{\\|B\\|}\n", - "$$" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "b8be02be-3ac3-4e5f-a450-c53f05781ab4", - "metadata": { - "ExecutionIndicator": { - "show": true - }, - "execution": { - "iopub.execute_input": "2024-07-17T02:24:38.945105Z", - "iopub.status.busy": "2024-07-17T02:24:38.944403Z", - "iopub.status.idle": "2024-07-17T02:24:38.948117Z", - "shell.execute_reply": "2024-07-17T02:24:38.947698Z", - "shell.execute_reply.started": "2024-07-17T02:24:38.945085Z" - }, - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "0.802726686000824\n" - ] - } - ], - "source": [ - "res = torch.mm(A / A.norm(dim=1), B.T / B.norm(dim=1))\n", - "print(res.item())" - ] - }, - { - "cell_type": "markdown", - "id": "988acff0-e6b5-41db-92d6-8f175dd3e272", - "metadata": { - "tags": [] - }, - "source": [ - "### 3.3 PyTorch Function Call" - ] - }, - { - "cell_type": "markdown", - "id": 
"a61b4871-4039-4c6e-b5ee-f66a12156be9", - "metadata": {}, - "source": [ - "In practice, the most convinient way is directly use *cosine_similarity()* in torch.nn.functional:" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "1ac4012e-b90a-4e60-97b8-e42636fde1c9", - "metadata": { - "ExecutionIndicator": { - "show": true - }, - "execution": { - "iopub.execute_input": "2024-07-17T02:24:55.804298Z", - "iopub.status.busy": "2024-07-17T02:24:55.803810Z", - "iopub.status.idle": "2024-07-17T02:24:55.807551Z", - "shell.execute_reply": "2024-07-17T02:24:55.807146Z", - "shell.execute_reply.started": "2024-07-17T02:24:55.804278Z" - }, - "tags": [] - }, - "outputs": [ - { - "data": { - "text/plain": [ - "0.802726686000824" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import torch.nn.functional as F\n", - "\n", - "F.cosine_similarity(A, B).item()" - ] - }, - { - "cell_type": "markdown", - "id": "f4ab87cc", - "metadata": {}, - "source": [ - "## 4. Inner Product/Dot Product" - ] - }, - { - "cell_type": "markdown", - "id": "e3c025ab", - "metadata": {}, - "source": [ - "Coordinate definition:\n", - "$$A\\cdot B = \\sum_{i=1}^{i=n}A_i B_i$$\n", - "\n", - "Geometric definition:\n", - "$$A\\cdot B = \\|A\\|\\|B\\|\\cos(\\theta)$$" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "f0291d42", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "68.0" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "dot_prod = A @ B.T\n", - "dot_prod.item()" - ] - }, - { - "cell_type": "markdown", - "id": "33099a2e", - "metadata": {}, - "source": [ - "### Relationship with Cosine similarity" - ] - }, - { - "cell_type": "markdown", - "id": "2790e183", - "metadata": {}, - "source": [ - "For computing the distance/similarity between two vectors, dot product and Cos similarity are closely related. Cos similarity only cares about the angle difference (because it is normalized by the product of two vectors' magnitude), while dot product takes both magnitude and angle into consideration. So the two metrics are preferred in different use cases.\n", - "\n", - "The BGE series models already normalized the output embedding vector to have the magnitude of 1. Thus using dot product and cos similarity will have the same result." - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "id": "e0f40534", - "metadata": {}, - "outputs": [], - "source": [ - "from FlagEmbedding import FlagModel\n", - "\n", - "model = FlagModel('BAAI/bge-large-en-v1.5',\n", - " query_instruction_for_retrieval=\"Represent this sentence for searching relevant passages:\",\n", - " use_fp16=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "id": "78445a86", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "1.0" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "sentence = \"I am very interested in natural language processing\"\n", - "embedding = torch.tensor(model.encode(sentence))\n", - "torch.norm(embedding).item()" - ] - }, - { - "cell_type": "markdown", - "id": "9e1822ee", - "metadata": {}, - "source": [ - "## 5. Examples" - ] - }, - { - "cell_type": "markdown", - "id": "6c665e3a", - "metadata": {}, - "source": [ - "Now we've learned the mechanism of different types of similarity. Let's look at a real example." 
- ] - }, - { - "cell_type": "code", - "execution_count": 16, - "id": "73012cbb", - "metadata": {}, - "outputs": [], - "source": [ - "sentence_1 = \"I will watch a show tonight\"\n", - "sentence_2 = \"I will show you my watch tonight\"\n", - "sentence_3 = \"I'm going to enjoy a performance this evening\"" - ] - }, - { - "cell_type": "markdown", - "id": "3cb79a47", - "metadata": {}, - "source": [ - "It's clear to us that in sentence 1, 'watch' is a verb and 'show' is a noun. \n", - "\n", - "But in sentence 2, 'show' is a verb and 'watch' is a noun, which gives the two sentences different meanings.\n", - "\n", - "Meanwhile, sentence 3 has a very similar meaning to sentence 1." - ] - }, - { - "cell_type": "markdown", - "id": "dc44dee9", - "metadata": {}, - "source": [ - "Now let's see how different similarity metrics describe the relationship between the sentences." - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "id": "98bfcc6d", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "0.625\n", - "0.07692307692307693\n" - ] - } - ], - "source": [ - "print(jaccard_similarity(sentence_1, sentence_2))\n", - "print(jaccard_similarity(sentence_1, sentence_3))" - ] - }, - { - "cell_type": "markdown", - "id": "b7e4cd15", - "metadata": {}, - "source": [ - "The results show that sentences 1 and 2 (0.625) appear far more similar than sentences 1 and 3 (0.077), which is the opposite of the conclusion we reached above." - ] - }, - { - "cell_type": "markdown", - "id": "cff73692", - "metadata": {}, - "source": [ - "Now let's first get the embeddings of these sentences." - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "id": "426c0b42", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "torch.Size([1, 1024])\n" - ] - } - ], - "source": [ - "embeddings = torch.from_numpy(model.encode([sentence_1, sentence_2, sentence_3]))\n", - "embedding_1 = embeddings[0].view(1, -1)\n", - "embedding_2 = embeddings[1].view(1, -1)\n", - "embedding_3 = embeddings[2].view(1, -1)\n", - "\n", - "print(embedding_1.shape)" - ] - }, - { - "cell_type": "markdown", - "id": "63fe1b31", - "metadata": {}, - "source": [ - "Then let's compute the Euclidean distance:" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "id": "d9bb35cf", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "0.714613139629364\n", - "0.5931472182273865\n" - ] - } - ], - "source": [ - "euc_dist1_2 = torch.cdist(embedding_1, embedding_2, p=2).item()\n", - "euc_dist1_3 = torch.cdist(embedding_1, embedding_3, p=2).item()\n", - "print(euc_dist1_2)\n", - "print(euc_dist1_3)" - ] - }, - { - "cell_type": "markdown", - "id": "402e6ea8", - "metadata": {}, - "source": [ - "Then, let's see the cosine similarity:" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "id": "29e70bbc", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "0.7446640729904175\n", - "0.8240882158279419\n" - ] - } - ], - "source": [ - "cos_dist1_2 = F.cosine_similarity(embedding_1, embedding_2).item()\n", - "cos_dist1_3 = F.cosine_similarity(embedding_1, embedding_3).item()\n", - "print(cos_dist1_2)\n", - "print(cos_dist1_3)" - ] - }, - { - "cell_type": "markdown", - "id": "c353d8cc", - "metadata": {}, - "source": [ - "With embeddings, unlike Jaccard similarity, we get the correct result that sentences 1 and 2 are more similar than sentences 1 and 3 
using either Euclidean distance or cos similarity as the metric." - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.13" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/docs/source/tutorial/2_Metrics/2.2.ipynb b/docs/source/tutorial/2_Metrics/2.2.ipynb deleted file mode 100644 index 6fdc09f4..00000000 --- a/docs/source/tutorial/2_Metrics/2.2.ipynb +++ /dev/null @@ -1,472 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Evaluation Metrics" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In this tutorial, we'll cover a list of metrics that are widely used for evaluating embedding model's performance." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 0. Preparation" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%pip install numpy scikit-learn" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Suppose we have a corpus with document ids from 0 - 30. \n", - "- `ground_truth` contains the actual relevant document ids to each query.\n", - "- `results` contains the search results of each query by some retrieval system." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np\n", - "\n", - "ground_truth = [\n", - " [11, 1, 7, 17, 21],\n", - " [ 4, 16, 1],\n", - " [26, 10, 22, 8],\n", - "]\n", - "\n", - "results = [\n", - " [11, 1, 17, 7, 21, 8, 0, 28, 9, 20],\n", - " [16, 1, 6, 18, 3, 4, 25, 19, 8, 14],\n", - " [24, 10, 26, 2, 8, 28, 4, 23, 13, 21],\n", - "]" - ] - }, - { - "cell_type": "code", - "execution_count": 63, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([ 0, 1, 2, 3, 4, 6, 7, 8, 9, 10, 11, 13, 14, 16, 17, 18, 19,\n", - " 21, 22, 24, 25, 26, 28])" - ] - }, - "execution_count": 63, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "np.intersect1d(ground_truth, results)" - ] - }, - { - "cell_type": "code", - "execution_count": 65, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 0],\n", - " [1, 1, 1, 1, 1, 1, 1, 1, 1, 0],\n", - " [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])" - ] - }, - "execution_count": 65, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "np.isin(ground_truth, results).astype(int)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "And we are interested in the following cutoffs:" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "cutoffs = [1, 5, 10]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In this tutorial, we will use the above small example to show how different metrics evaluate the retrieval system's quality." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 1. 
Recall" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Recall represents the model's capability of correctly predicting positive instances from all the actual positive samples in the dataset.\n", - "\n", - "$$\\textbf{Recall}=\\frac{\\text{True Positives}}{\\text{True Positives}+\\text{False Negatives}}$$\n", - "\n", - "to write it in the form of information retrieval, which is the ratio of relevant documents retrieved to the total number of relevant documents in the corpus. In practice, we usually make the denominator to be the minimum between the current cutoff (usually 1, 5, 10, 100, etc) and the total number of relevant documents in the corpus:\n", - "\n", - "$$\\textbf{Recall}=\\frac{|\\text{\\{Relevant docs\\}}\\cap\\text{\\{Retrieved docs\\}}|}{\\text{min}(|\\text{\\{Retrieved docs\\}}|, |\\text{\\{Relevant docs\\}}|)}$$" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "def calc_recall(preds, truths, cutoffs):\n", - " recalls = np.zeros(len(cutoffs))\n", - " for text, truth in zip(preds, truths):\n", - " for i, c in enumerate(cutoffs):\n", - " hits = np.intersect1d(truth, text[:c])\n", - " recalls[i] += len(hits) / max(min(c, len(truth)), 1)\n", - " recalls /= len(preds)\n", - " return recalls" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "recall@1: 0.6666666666666666\n", - "recall@5: 0.8055555555555555\n", - "recall@10: 0.9166666666666666\n" - ] - } - ], - "source": [ - "recalls = calc_recall(results, ground_truth, cutoffs)\n", - "for i, c in enumerate(cutoffs):\n", - " print(f\"recall@{c}: {recalls[i]}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 2. MRR" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Mean Reciprocal Rank ([MRR](https://en.wikipedia.org/wiki/Mean_reciprocal_rank)) is a widely used metric in information retrieval to evaluate the effectiveness of a system. It measures the rank position of the first relevant result in a list of search results.\n", - "\n", - "$$MRR=\\frac{1}{|Q|}\\sum_{i=1}^{|Q|}\\frac{1}{rank_i}$$\n", - "\n", - "where \n", - "- $|Q|$ is the total number of queries.\n", - "- $rank_i$ is the rank position of the first relevant document of the i-th query." - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "def calc_MRR(preds, truth, cutoffs):\n", - " mrr = [0 for _ in range(len(cutoffs))]\n", - " for pred, t in zip(preds, truth):\n", - " for i, c in enumerate(cutoffs):\n", - " for j, p in enumerate(pred):\n", - " if j < c and p in t:\n", - " mrr[i] += 1/(j+1)\n", - " break\n", - " mrr = [k/len(preds) for k in mrr]\n", - " return mrr" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "MRR@1: 0.6666666666666666\n", - "MRR@5: 0.8333333333333334\n", - "MRR@10: 0.8333333333333334\n" - ] - } - ], - "source": [ - "mrr = calc_MRR(results, ground_truth, cutoffs)\n", - "for i, c in enumerate(cutoffs):\n", - " print(f\"MRR@{c}: {mrr[i]}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 3. 
nDCG" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Normalized Discounted Cumulative Gain (nDCG) measures the quality of a ranked list of search results by considering both the position of the relevant documents and their graded relevance scores. The calculation of nDCG involves two main steps:\n", - "\n", - "1. Discounted cumulative gain (DCG) measures the ranking quality in retrieval tasks.\n", - "\n", - "$$DCG_p=\\sum_{i=1}^p\\frac{2^{rel_i}-1}{\\log_2(i+1)}$$\n", - "\n", - "2. Normalized by ideal DCG to make it comparable across queries.\n", - "$$nDCG_p=\\frac{DCG_p}{IDCG_p}$$\n", - "where $IDCG$ is the maximum possible DCG for a given set of documents, assuming they are perfectly ranked in order of relevance." - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "pred_hard_encodings = []\n", - "for pred, label in zip(results, ground_truth):\n", - " pred_hard_encoding = list(np.isin(pred, label).astype(int))\n", - " pred_hard_encodings.append(pred_hard_encoding)" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "nDCG@1: 0.0\n", - "nDCG@5: 0.3298163165186628\n", - "nDCG@10: 0.5955665344840209\n" - ] - } - ], - "source": [ - "from sklearn.metrics import ndcg_score\n", - "\n", - "for i, c in enumerate(cutoffs):\n", - " nDCG = ndcg_score(pred_hard_encodings, results, k=c)\n", - " print(f\"nDCG@{c}: {nDCG}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 4. Precision" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Precision \n", - "\n", - "$$\\textbf{Recall}=\\frac{\\text{True Positives}}{\\text{True Positives}+\\text{False Positive}}$$\n", - "\n", - "in information retrieval, it's the ratio of relevant documents retrieved to the totoal number of documents retrieved:\n", - "\n", - "$$\\textbf{Recall}=\\frac{|\\text{\\{Relevant docs\\}}\\cap\\text{\\{Retrieved docs\\}}|}{|\\text{\\{Retrieved docs\\}}|}$$" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "def calc_precision(preds, truths, cutoffs):\n", - " prec = np.zeros(len(cutoffs))\n", - " for text, truth in zip(preds, truths):\n", - " for i, c in enumerate(cutoffs):\n", - " hits = np.intersect1d(truth, text[:c])\n", - " prec[i] += len(hits) / c\n", - " prec /= len(preds)\n", - " return prec" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "precision@1: 0.6666666666666666\n", - "precision@5: 0.6666666666666666\n", - "precision@10: 0.3666666666666667\n" - ] - } - ], - "source": [ - "precisions = calc_precision(results, ground_truth, cutoffs)\n", - "for i, c in enumerate(cutoffs):\n", - " print(f\"precision@{c}: {precisions[i]}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 5. MAP" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Mean Average Precision (MAP) measures the effectiveness of a system at returning relevant documents across multiple queries. \n", - "\n", - "First, Average Precision (AP) evals how well relevant documents are ranked within the retrieved documents. 
It's computed by averaging the precision values for each position of relevant document in the ranking of all the retrieved documents:\n", - "\n", - "$$\\textbf{AP}=\\frac{\\sum_{k=1}^{M}\\text{Relevance}(k) \\times \\text{Precision}(k)}{|\\{\\text{Relevant Docs}\\}|}$$\n", - "\n", - "where \n", - "- $M$ is the total number of documents retrieved.\n", - "- $\\text{Relevance}(k)$ is a binary value, indicating whether document at position $k$ is relevant (=1) or not (=0).\n", - "- $\\text{Precision}(k)$ is the precision when considering only top $k$ retrieved items." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Then calculate the average AP across multiple queries to get the MAP:\n", - "\n", - "$$\\textbf{MAP}=\\frac{1}{N}\\sum_{i=1}^{N}\\text{AP}_i$$\n", - "\n", - "where\n", - "- $N$ is the total number of queries.\n", - "- $\\text{AP}_i$ is the average precision of the $i^{th}$ query." - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "def calc_AP(encoding):\n", - " rel = 0\n", - " precs = 0.0\n", - " for k, hit in enumerate(encoding, start=1):\n", - " if hit == 1:\n", - " rel += 1\n", - " precs += rel/k\n", - "\n", - " return 0 if rel == 0 else precs/rel" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [], - "source": [ - "def calc_MAP(encodings, cutoffs):\n", - " res = []\n", - " for c in cutoffs:\n", - " ap_sum = 0.0\n", - " for encoding in encodings:\n", - " ap_sum += calc_AP(encoding[:c])\n", - " res.append(ap_sum/len(encodings))\n", - " \n", - " return res" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "MAP@1: 0.6666666666666666\n", - "MAP@5: 0.862962962962963\n", - "MAP@10: 0.8074074074074075\n" - ] - } - ], - "source": [ - "maps = calc_MAP(pred_hard_encodings, cutoffs)\n", - "for i, c in enumerate(cutoffs):\n", - " print(f\"MAP@{c}: {maps[i]}\")" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "test", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.4" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/docs/source/tutorial/3_Indexing.rst b/docs/source/tutorial/3_Indexing.rst deleted file mode 100644 index f4eddca4..00000000 --- a/docs/source/tutorial/3_Indexing.rst +++ /dev/null @@ -1,13 +0,0 @@ -3. Indexing -=========== - -.. toctree:: - :hidden: - :maxdepth: 1 - :caption: Indexing - - 3_Indexing/3.1.1 - 3_Indexing/3.1.2 - 3_Indexing/3.1.3 - 3_Indexing/3.1.4 - 3_Indexing/3.1.5 \ No newline at end of file diff --git a/docs/source/tutorial/3_Indexing/3.1.1.ipynb b/docs/source/tutorial/3_Indexing/3.1.1.ipynb deleted file mode 100644 index 46a157d2..00000000 --- a/docs/source/tutorial/3_Indexing/3.1.1.ipynb +++ /dev/null @@ -1,411 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Indexing Using Faiss" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In practical cases, datasets contain thousands or millions of rows. Looping through the whole corpus to find the best answer to a query is very time and space consuming. 
In this tutorial, we'll introduce how to use indexing to make our retrieval fast and neat." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Step 0: Setup" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Install the dependencies in the environment." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%pip install -U FlagEmbedding" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### faiss-gpu on Linux (x86_64)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Faiss maintains its latest updates on conda. So if you have GPUs on Linux x86_64, create a conda virtual environment and run:\n", - "\n", - "```conda install -c pytorch -c nvidia faiss-gpu=1.8.0```\n", - "\n", - "and make sure you select that conda env as the kernel for this notebook." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### faiss-cpu\n", - "\n", - "Otherwise, it's simple: just run the following cell to install `faiss-cpu`" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%pip install -U faiss-cpu" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Step 1: Dataset" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Below is a super tiny corpus with only 10 sentences, which will be the dataset we use.\n", - "\n", - "Each sentence is a concise description of a famous person in a specific domain." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "corpus = [\n", - "    \"Michael Jackson was a legendary pop icon known for his record-breaking music and dance innovations.\",\n", - "    \"Fei-Fei Li is a professor in Stanford University, revolutionized computer vision with the ImageNet project.\",\n", - "    \"Brad Pitt is a versatile actor and producer known for his roles in films like 'Fight Club' and 'Once Upon a Time in Hollywood.'\",\n", - "    \"Geoffrey Hinton, as a foundational figure in AI, received Turing Award for his contribution in deep learning.\",\n", - "    \"Eminem is a renowned rapper and one of the best-selling music artists of all time.\",\n", - "    \"Taylor Swift is a Grammy-winning singer-songwriter known for her narrative-driven music.\",\n", - "    \"Sam Altman leads OpenAI as its CEO, with astonishing works of GPT series and pursuing safe and beneficial AI.\",\n", - "    \"Morgan Freeman is an acclaimed actor famous for his distinctive voice and diverse roles.\",\n", - "    \"Andrew Ng spread AI knowledge globally via public courses on Coursera and Stanford University.\",\n", - "    \"Robert Downey Jr. is an iconic actor best known for playing Iron Man in the Marvel Cinematic Universe.\",\n", - "]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "And a few queries (add your own queries and check the result!): " - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "queries = [\n", - "    \"Who is Robert Downey Jr.?\",\n", - "    \"An expert of neural network\",\n", - "    \"A famous female singer\",\n", - "]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Step 2: Text Embedding" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now, let's use the embedding model to encode the corpus."
- ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "shape of the corpus embeddings: (10, 768)\n", - "data type of the embeddings: float32\n" - ] - } - ], - "source": [ - "from FlagEmbedding import FlagModel\n", - "\n", - "# get the BGE embedding model\n", - "model = FlagModel('BAAI/bge-base-en-v1.5',\n", - " query_instruction_for_retrieval=\"Represent this sentence for searching relevant passages:\",\n", - " use_fp16=True)\n", - "\n", - "# get the embedding of the corpus\n", - "corpus_embeddings = model.encode(corpus)\n", - "\n", - "print(\"shape of the corpus embeddings:\", corpus_embeddings.shape)\n", - "print(\"data type of the embeddings: \", corpus_embeddings.dtype)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Faiss only accepts float32 inputs.\n", - "\n", - "So make sure the dtype of corpus_embeddings is float32 before adding them to the index." - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np\n", - "\n", - "corpus_embeddings = corpus_embeddings.astype(np.float32)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Step 3: Indexing" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In this step, we build an index and add the embedding vectors to it." - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [], - "source": [ - "import faiss\n", - "\n", - "# get the length of our embedding vectors, vectors by bge-base-en-v1.5 have length 768\n", - "dim = corpus_embeddings.shape[-1]\n", - "\n", - "# create the faiss index and store the corpus embeddings into the vector space\n", - "index = faiss.index_factory(dim, 'Flat', faiss.METRIC_INNER_PRODUCT)\n", - "\n", - "# if you installed faiss-gpu, uncomment the following lines to make the index on your GPUs.\n", - "\n", - "# co = faiss.GpuMultipleClonerOptions()\n", - "# index = faiss.index_cpu_to_all_gpus(index, co)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "No need to train if we use \"Flat\" quantizer and METRIC_INNER_PRODUCT as metric. Some other indices that using quantization might need training." - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "True\n", - "total number of vectors: 10\n" - ] - } - ], - "source": [ - "# check if the index is trained\n", - "print(index.is_trained) \n", - "# index.train(corpus_embeddings)\n", - "\n", - "# add all the vectors to the index\n", - "index.add(corpus_embeddings)\n", - "\n", - "print(f\"total number of vectors: {index.ntotal}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Step 3.5 (Optional): Saving Faiss index" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Once you have your index with the embedding vectors, you can save it locally for future usage." 
- ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [], - "source": [ - "# change the path to where you want to save the index\n", - "path = \"./index.bin\"\n", - "faiss.write_index(index, path)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "If you already have stored index in your local directory, you can load it by:" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [], - "source": [ - "index = faiss.read_index(\"./index.bin\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Step 4: Find answers to the query" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "First, get the embeddings of all the queries:" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [], - "source": [ - "query_embeddings = model.encode_queries(queries)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Then, use the Faiss index to do a knn search in the vector space:" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[[0.6686779 0.37858668 0.3767978 ]\n", - " [0.6062041 0.59364545 0.527691 ]\n", - " [0.5409331 0.5097007 0.42427146]]\n", - "[[9 7 2]\n", - " [3 1 8]\n", - " [5 0 4]]\n" - ] - } - ], - "source": [ - "dists, ids = index.search(query_embeddings, k=3)\n", - "print(dists)\n", - "print(ids)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now let's see the result:" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "query:\tWho is Robert Downey Jr.?\n", - "answer:\tRobert Downey Jr. is an iconic actor best known for playing Iron Man in the Marvel Cinematic Universe.\n", - "\n", - "query:\tAn expert of neural network\n", - "answer:\tGeoffrey Hinton, as a foundational figure in AI, received Turing Award for his contribution in deep learning.\n", - "\n", - "query:\tA famous female singer\n", - "answer:\tTaylor Swift is a Grammy-winning singer-songwriter known for her narrative-driven music.\n", - "\n" - ] - } - ], - "source": [ - "for i, q in enumerate(queries):\n", - " print(f\"query:\\t{q}\\nanswer:\\t{corpus[ids[i][0]]}\\n\")" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "base", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.13" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/docs/source/tutorial/3_Indexing/3.1.2.ipynb b/docs/source/tutorial/3_Indexing/3.1.2.ipynb deleted file mode 100644 index b75cb5ed..00000000 --- a/docs/source/tutorial/3_Indexing/3.1.2.ipynb +++ /dev/null @@ -1,373 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Faiss GPU" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In the last tutorial, we went through the basics of indexing using faiss-cpu. While for the use cases in research and industry. The size of dataset for indexing will be extremely large, the frequency of searching might also be very high. 
In this tutorial we'll see how to combine Faiss and GPU almost seamlessly." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 1. Installation" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Faiss maintain the latest updates on conda. And its gpu version only supports Linux x86_64\n", - "\n", - "create a conda virtual environment and run:\n", - "\n", - "```conda install -c pytorch -c nvidia faiss-gpu=1.8.0```\n", - "\n", - "make sure you select that conda env as the kernel for this notebook. After installation, restart the kernal." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "If your system does not satisfy the requirement, install faiss-cpu and just skip the steps with gpu related codes." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 2. Data Preparation" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "First let's create two datasets with \"fake embeddings\" of corpus and queries:" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import faiss\n", - "import numpy as np\n", - "\n", - "dim = 768\n", - "corpus_size = 1000\n", - "# np.random.seed(111)\n", - "\n", - "corpus = np.random.random((corpus_size, dim)).astype('float32')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 3. Create Index on CPU" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Option 1:" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Faiss provides a great amount of choices of indexes by initializing directly:" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "# first build a flat index (on CPU)\n", - "index = faiss.IndexFlatIP(dim)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Option 2:" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Besides the basic index class, we can also use the index_factory function to produce composite Faiss index." - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "index = faiss.index_factory(dim, \"Flat\", faiss.METRIC_L2)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 4. Build GPU Index and Search" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "All the GPU indexes are built with `StandardGpuResources` object. It contains all the needed resources for each GPU in use. By default it will allocate 18% of the total VRAM as a temporary scratch space." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The `GpuClonerOptions` and `GpuMultipleClonerOptions` objects are optional when creating index from cpu to gpu. They are used to adjust the way the GPUs stores the objects." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Single GPU:" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "# use a single GPU\n", - "rs = faiss.StandardGpuResources()\n", - "co = faiss.GpuClonerOptions()\n", - "\n", - "# then make it to gpu index\n", - "index_gpu = faiss.index_cpu_to_gpu(provider=rs, device=0, index=index, options=co)" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "CPU times: user 5.31 ms, sys: 6.26 ms, total: 11.6 ms\n", - "Wall time: 8.94 ms\n" - ] - } - ], - "source": [ - "%%time\n", - "index_gpu.add(corpus)\n", - "D, I = index_gpu.search(corpus, 4)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### All Available GPUs" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "If your system contains multiple GPUs, Faiss provides the option to deploy al available GPUs. You can control their usages through `GpuMultipleClonerOptions`, e.g. whether to shard or replicate the index acrross GPUs." - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "# cloner options for multiple GPUs\n", - "co = faiss.GpuMultipleClonerOptions()\n", - "\n", - "index_gpu = faiss.index_cpu_to_all_gpus(index=index, co=co)" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "CPU times: user 29.8 ms, sys: 26.8 ms, total: 56.6 ms\n", - "Wall time: 33.9 ms\n" - ] - } - ], - "source": [ - "%%time\n", - "index_gpu.add(corpus)\n", - "D, I = index_gpu.search(corpus, 4)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Multiple GPUs" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "There's also option that use multiple GPUs but not all:" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "ngpu = 4\n", - "resources = [faiss.StandardGpuResources() for _ in range(ngpu)]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Create vectors for the GpuResources and divices, then pass them to the index_cpu_to_gpu_multiple() function." - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "vres = faiss.GpuResourcesVector()\n", - "vdev = faiss.Int32Vector()\n", - "for i, res in zip(range(ngpu), resources):\n", - " vdev.push_back(i)\n", - " vres.push_back(res)\n", - "index_gpu = faiss.index_cpu_to_gpu_multiple(vres, vdev, index)" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "CPU times: user 3.49 ms, sys: 13.4 ms, total: 16.9 ms\n", - "Wall time: 9.03 ms\n" - ] - } - ], - "source": [ - "%%time\n", - "index_gpu.add(corpus)\n", - "D, I = index_gpu.search(corpus, 4)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 5. Results" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "All the three approaches should lead to identical result. 
Now let's do a quick sanity check:" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [], - "source": [ - "# The nearest neighbor of each vector in the corpus is itself\n", - "assert np.all(corpus[:] == corpus[I[:, 0]])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "And the corresponding distance should be 0." - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[[ 0. 111.30057 113.2251 113.342316]\n", - " [ 0. 111.158875 111.742325 112.09038 ]\n", - " [ 0. 116.44429 116.849915 117.30502 ]]\n" - ] - } - ], - "source": [ - "print(D[:3])" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "faiss", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.9" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/docs/source/tutorial/3_Indexing/3.1.3.ipynb b/docs/source/tutorial/3_Indexing/3.1.3.ipynb deleted file mode 100644 index 4444d8fc..00000000 --- a/docs/source/tutorial/3_Indexing/3.1.3.ipynb +++ /dev/null @@ -1,417 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Faiss Indexes" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This tutorial will go through several widely used indexes in Faiss that fits different requirements, and how to use them." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Preparation" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "For CPU usage, use:" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "%pip install faiss-cpu" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "For GPU on Linux x86_64 system, use Conda:\n", - "\n", - "```conda install -c pytorch -c nvidia faiss-gpu=1.8.0```" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "import faiss\n", - "import numpy as np\n", - "\n", - "np.random.seed(768)\n", - "\n", - "data = np.random.random((1000, 128))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 1. `IndexFlat*`" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Flat index is the very fundamental index structure. It does not do any preprocess for the incoming vectors. All the vectors are stored directly without compression or quantization. Thus no training is need for flat indexes.\n", - "\n", - "When searching, Flat index will decode all the vectors sequentially and compute the similarity score to the query vectors. Thus, Flat Index guarantees the global optimum of results." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Flat index family is small: just `IndexFlatL2` and `IndexFlatIP`, which are just different by the similarity metrics of Euclidean distance and inner product." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Usage:" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "d = 128 # dimension of the vector\n", - "k = 3 # number of nearest neighbors to search\n", - "\n", - "# just simply create the index and add all the data\n", - "index = faiss.IndexFlatL2(d)\n", - "index.add(data)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Sanity check:" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "closest elements: [[ 0 471 188]]\n", - "distance: [[ 0. 16.257435 16.658928]]\n" - ] - } - ], - "source": [ - "# search for the k nearest neighbor for the first element in data\n", - "D, I = index.search(data[:1], k)\n", - "\n", - "print(f\"closest elements: {I}\")\n", - "print(f\"distance: {D}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Flat Indexes guarantee the perfect quality but with terrible speed. It works well on small datasets or the cases that speed is not a crucial factor. \n", - "\n", - "But what about the cases that speed is important? There's no way to have it all. So we want some indexes that only sacrifice as small as possible quality to speed up. That's why approximate nearest-neighbors (ANN) algorithms are widely accepted. Now we will go through a few popular ANN methods used in vector searching." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 2. `IndexIVF*`" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Intro\n", - "\n", - "Inverted File Flat (IVF) Index is a widely accepted technique to speed up searching by using k-means or Voronoi diagram to create a number of cells (or say, clusters) in the whole space. Then when given a query, an amount of closest cells will be searched. After that, `k` closest elements to the query will be searched in those cells.\n", - "\n", - "- `quantizer` is another index/quantizer to assign vectors to inverted lists.\n", - "- `nlist` is the number of cells the space to be partitioned.\n", - "- `nprob` is the nuber of closest cells to visit for searching in query time." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Tradeoff\n", - "\n", - "Increasing `nlist` will shrink the size of each cell, which speed up the search process. But the smaller coverage will sacrifice accuracy and increase the possibility of the edge/surface problem discribed above.\n", - "\n", - "Increasing `nprob` will have a greater scope, preferring search quality by the tradeoff of slower speed." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Shortage\n", - "\n", - "There could be a problem when the query vector lands on the edge/surface of the cell. It is possible that the closest element falls into the neighbor cell, which may not be considered due to `nprob` is not large enough." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Example" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "nlist = 5\n", - "nprob = 2\n", - "\n", - "# the quantizer defines how to store and compare the vectors\n", - "quantizer = faiss.IndexFlatL2(d)\n", - "index = faiss.IndexIVFFlat(quantizer, d, nlist)\n", - "\n", - "# note different from flat index, IVF index first needs training to create the cells\n", - "index.train(data)\n", - "index.add(data)" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "closest elements: [[ 0 471 188]]\n", - "distance: [[ 0. 16.257435 16.658928]]\n" - ] - } - ], - "source": [ - "# set nprob before searching\n", - "index.nprobe = 8\n", - "D, I = index.search(data[:1], k)\n", - "\n", - "print(f\"closest elements: {I}\")\n", - "print(f\"distance: {D}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 3. `IndexHNSW*`" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Intro\n", - "\n", - "Hierarchical Navigable Small World (HNSW) indexing is a graph based method, which is an extension of navigable small world (NSW). It builds a multi-layered graph where nodes (vectors) are connected based on their proximity, forming \"small-world\" structures that allow efficient navigation through the space.\n", - "\n", - "- `M` is the number of neighbors each vector has in the graph.\n", - "- `efConstruction` is the number of entry points to explore when building the index.\n", - "- `efSearch` is the number of entry points to explore when searching." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Tradeoff\n", - "\n", - "Increasing `M` or `efSearch` will make greater fidelity with reasonable longer time. Larger `efConstruction` mainly increases the index construction time.\n", - "\n", - "HNSW has great searching quality and speed. But it is memory-consuming due to the graph structure. Scaling up `M` will cause a linear increase of memory usage.\n", - "\n", - "Note that HNSW index does not support vector's removal because removing nodes will distroy graph structure.\n", - "\n", - "Thus HNSW is a great index to choose when RAM is not a limiting factor." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Example" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "M = 32\n", - "ef_search = 16\n", - "ef_construction = 32\n", - "\n", - "index = faiss.IndexHNSWFlat(d, M)\n", - "# set the two parameters before adding data\n", - "index.hnsw.efConstruction = ef_construction\n", - "index.hnsw.efSearch = ef_search\n", - "\n", - "index.add(data)" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "closest elements: [[ 0 471 188]]\n", - "distance: [[ 0. 16.257435 16.658928]]\n" - ] - } - ], - "source": [ - "D, I = index.search(data[:1], k)\n", - "\n", - "print(f\"closest elements: {I}\")\n", - "print(f\"distance: {D}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 4. `IndexLSH`" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Intro\n", - "\n", - "Locality Sensitive Hashing (LSH) is an ANN method that hashing data points into buckets. 
While well known use cases of hash function such as dictionary/hashtabel are trying to avoid hashing collisions, LSH trys to maximize hashing collisions. Similar vectors will be grouped into same hash bucket.\n", - "\n", - "In Faiss, `IndexLSH` is a Flat index with binary codes. Vectors are hashed into binary codes and compared by Hamming distances.\n", - "\n", - "- `nbits` can be seen as the \"resolution\" of hashed vectors." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Tradeoff\n", - "\n", - "Increasing `nbits` can get higher fidelity with the cost of more memory and longer searching time.\n", - "\n", - "LSH suffers the curse of dimensionality when using a larger `d`. In order to get similar search quality, the `nbits` value needs to be scaled up to maintain the search quality." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Shortage\n", - "\n", - "LSH speeds up searching time with a reasonable sacrifice of quality. But that only applies to small dimension `d`. Even 128 is already too large for LSH. Thus for vectors generated by transformer based embedding models, LSH index is not a common choice." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Example" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "nbits = d * 8\n", - "\n", - "index = faiss.IndexLSH(d, nbits)\n", - "index.train(data)\n", - "index.add(data)" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "closest elements: [[ 0 471 392]]\n", - "distance: [[ 0. 197. 199.]]\n" - ] - } - ], - "source": [ - "D, I = index.search(data[:1], k)\n", - "\n", - "print(f\"closest elements: {I}\")\n", - "print(f\"distance: {D}\")" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "faiss", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.9" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/docs/source/tutorial/3_Indexing/3.1.4.ipynb b/docs/source/tutorial/3_Indexing/3.1.4.ipynb deleted file mode 100644 index f45fee2e..00000000 --- a/docs/source/tutorial/3_Indexing/3.1.4.ipynb +++ /dev/null @@ -1,354 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Faiss Quantizers" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In this notebook, we will introduce the quantizer object in Faiss and how to use them." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Preparation" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "For CPU usage, run:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%pip install faiss-cpu" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "For GPU on Linux x86_64 system, use Conda:\n", - "\n", - "```conda install -c pytorch -c nvidia faiss-gpu=1.8.0```" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import faiss\n", - "import numpy as np\n", - "\n", - "np.random.seed(768)\n", - "\n", - "data = np.random.random((1000, 128))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 1. Scalar Quantizer" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Normal data type of vector embeedings is usually 32 bit floats. Scalar quantization is transforming the 32 float representation to, for example, 8 bit interger. Thus with a 4x reduction in size. In this way, it can be seen as we distribute each dimension into 256 buckets." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "| Name | Class | Parameters |\n", - "|:------------:|:--------:|:-----------|\n", - "| `ScalarQuantizer` | Quantizer class | `d`: dimension of vectors
`qtype`: scalar quantization type, e.g. QT_8bit maps each dimension into $2^8$ buckets |\n", - "| `IndexScalarQuantizer` | Flat index class | `d`: dimension of vectors
`qtype`: scalar quantization type, e.g. QT_8bit maps each dimension into $2^8$ buckets
`metric`: similarity metric (L2 or IP) |\n", - "| `IndexIVFScalarQuantizer` | IVF index class | `d`: dimension of vectors
`nlist`: number of cells/clusters to partition the inverted file space
`qtype`: scalar quantization type, e.g. QT_8bit maps each dimension into $2^8$ buckets
`metric`: similarity metric (L2 or IP)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Quantizer class objects are used to compress the data before adding into indexes. Flat index class objects and IVF index class objects can be used direct as and index. Quantization will be done automatically." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Scalar Quantizer" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[156 180 46 226 13 130 41 187 63 251 16 199 205 166 117 122 214 2\n", - " 206 137 71 186 20 131 59 57 68 114 35 45 28 210 27 93 74 245\n", - " 167 5 32 42 44 128 10 189 10 13 42 162 179 221 241 104 205 21\n", - " 70 87 52 219 172 138 193 0 228 175 144 34 59 88 170 1 233 220\n", - " 20 64 245 241 5 161 41 55 30 247 107 8 229 90 201 10 43 158\n", - " 238 184 187 114 232 90 116 205 14 214 135 158 237 192 205 141 232 176\n", - " 124 176 163 68 49 91 125 70 6 170 55 44 215 84 46 48 218 56\n", - " 107 176]\n" - ] - } - ], - "source": [ - "d = 128\n", - "qtype = faiss.ScalarQuantizer.QT_8bit\n", - "\n", - "quantizer = faiss.ScalarQuantizer(d, qtype)\n", - "\n", - "quantizer.train(data)\n", - "new_data = quantizer.compute_codes(data)\n", - "\n", - "print(new_data[0])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Scalar Quantizer Index" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "d = 128\n", - "k = 3\n", - "qtype = faiss.ScalarQuantizer.QT_8bit\n", - "# nlist = 5\n", - "\n", - "index = faiss.IndexScalarQuantizer(d, qtype, faiss.METRIC_L2)\n", - "# index = faiss.IndexIVFScalarQuantizer(d, nlist, faiss.ScalarQuantizer.QT_8bit, faiss.METRIC_L2)\n", - "\n", - "index.train(data)\n", - "index.add(data)" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "closest elements: [[ 0 471 188]]\n", - "distance: [[1.6511828e-04 1.6252808e+01 1.6658131e+01]]\n" - ] - } - ], - "source": [ - "D, I = index.search(data[:1], k)\n", - "\n", - "print(f\"closest elements: {I}\")\n", - "print(f\"distance: {D}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 2. Product Quantizer" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "When speed and memory are crucial factors in searching, product quantizer becomes a top choice. It is one of the effective quantizer on reducing memory size. \n", - "\n", - "The first step of PQ is dividing the original vectors with dimension `d` into smaller, low-dimensional sub-vectors with dimension `d/m`. Here `m` is the number of sub-vectors.\n", - "\n", - "Then clustering algorithms are used to create codebook of a fixed number of centroids.\n", - "\n", - "Next, each sub-vector of a vector is replaced by the index of the closest centroid from its corresponding codebook. Now each vector will be stored with only the indices instead of the full vector.\n", - "\n", - "When comuputing the distance between a query vector. Only the distances to the centroids in the codebooks are calculated, thus enable the quick approximate nearest neighbor searches." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "| Name | Class | Parameters |\n", - "|:------------:|:--------:|:-----------|\n", - "| `ProductQuantizer` | Quantizer class | `d`: dimension of vectors
`M`: number of sub-vectors, such that d % M == 0
`nbits`: number of bits per subquantizer, so each contains $2^\text{nbits}$ centroids |\n", - "| `IndexPQ` | Flat index class | `d`: dimension of vectors
`M`: number of sub-vectors, such that d % M == 0
`nbits`: number of bits per subquantizer, so each contains $2^\text{nbits}$ centroids
`metric`: similarity metric (L2 or IP) |\n", - "| `IndexIVFPQ` | IVF index class | `quantizer`: the coarse quantizer used to assign vectors to inverted lists.
`d`: dimension of vectors
`nlist`: number of cells/clusters to partition the inverted file space
`M`: number of sub-vectors, such that d % M == 0
`nbits`: number of bits per subquantizer, so each contains $2^\text{nbits}$ centroids
`metric`: similarity metric (L2 or IP) |" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Product Quantizer" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "255\n", - "[[ 90 169 226 45]\n", - " [ 33 51 34 15]]\n" - ] - } - ], - "source": [ - "d = 128\n", - "M = 8\n", - "nbits = 4\n", - "\n", - "quantizer = faiss.ProductQuantizer(d, M, nbits)\n", - "\n", - "quantizer.train(data)\n", - "new_data = quantizer.compute_codes(data)\n", - "\n", - "print(new_data.max())\n", - "print(new_data[:2])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Product Quantizer Index" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [], - "source": [ - "index = faiss.IndexPQ(d, M, nbits, faiss.METRIC_L2)\n", - "\n", - "index.train(data)\n", - "index.add(data)" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "closest elements: [[ 0 946 330]]\n", - "distance: [[ 8.823908 11.602461 11.746731]]\n" - ] - } - ], - "source": [ - "D, I = index.search(data[:1], k)\n", - "\n", - "print(f\"closest elements: {I}\")\n", - "print(f\"distance: {D}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Product Quantizer IVF Index" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "nlist = 5\n", - "\n", - "quantizer = faiss.IndexFlat(d, faiss.METRIC_L2)\n", - "index = faiss.IndexIVFPQ(quantizer, d, nlist, M, nbits, faiss.METRIC_L2)\n", - "\n", - "index.train(data)\n", - "index.add(data)" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "closest elements: [[ 0 899 521]]\n", - "distance: [[ 8.911423 12.088312 12.104569]]\n" - ] - } - ], - "source": [ - "D, I = index.search(data[:1], k)\n", - "\n", - "print(f\"closest elements: {I}\")\n", - "print(f\"distance: {D}\")" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "base", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.13" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/docs/source/tutorial/3_Indexing/3.1.5.ipynb b/docs/source/tutorial/3_Indexing/3.1.5.ipynb deleted file mode 100644 index f4b771e2..00000000 --- a/docs/source/tutorial/3_Indexing/3.1.5.ipynb +++ /dev/null @@ -1,624 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Choosing Index" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Give a great amount of indexes and quantizers, how to choose the one in the experiment/application? In this part, we will give a general suggestion on how to choose the one fits your need." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 0. 
Preparation" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Packages" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "For CPU usage, run:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# %pip install -U faiss-cpu numpy h5py" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "For GPU on Linux x86_64 system, use Conda:\n", - "\n", - "```conda install -c pytorch -c nvidia faiss-gpu=1.8.0```" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "from urllib.request import urlretrieve\n", - "import h5py\n", - "import faiss\n", - "import numpy as np" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Dataset" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In this tutorial, we'll use [SIFT1M](http://corpus-texmex.irisa.fr/), a very popular dataset for ANN evaluation, as our dataset to demonstrate the comparison.\n", - "\n", - "Run the following cell to download the dataset or you can also manually download from the repo [ann-benchmarks](https://github.com/erikbern/ann-benchmarks?tab=readme-ov-file#data-sets))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "data_url = \"http://ann-benchmarks.com/sift-128-euclidean.hdf5\"\n", - "destination = \"data.hdf5\"\n", - "urlretrieve(data_url, destination)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Then load the data from the hdf5 file." - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "(1000000, 128) float32\n", - "(10000, 128) float32\n" - ] - } - ], - "source": [ - "with h5py.File('data.hdf5', 'r') as f:\n", - " corpus = f['train'][:]\n", - " query = f['test'][:]\n", - "\n", - "print(corpus.shape, corpus.dtype)\n", - "print(query.shape, corpus.dtype)" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "d = corpus[0].shape[0]\n", - "k = 100" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Helper function" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The following is a helper function for computing recall." - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "# compute recall from the prediction results and ground truth\n", - "def compute_recall(res, truth):\n", - " recall = 0\n", - " for i in range(len(res)):\n", - " intersect = np.intersect1d(res[i], truth[i])\n", - " recall += len(intersect) / len(res[i])\n", - " recall /= len(res)\n", - "\n", - " return recall" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 1. Flat Index" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Flat index use brute force to search neighbors for each query. It guarantees the optimal result with 100% recall. Thus we use the result from it as the ground truth." 
- ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "CPU times: user 69.2 ms, sys: 80.6 ms, total: 150 ms\n", - "Wall time: 149 ms\n" - ] - } - ], - "source": [ - "%%time\n", - "index = faiss.IndexFlatL2(d)\n", - "index.add(corpus)" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "CPU times: user 17min 30s, sys: 1.62 s, total: 17min 31s\n", - "Wall time: 2min 1s\n" - ] - } - ], - "source": [ - "%%time\n", - "D, I_truth = index.search(query, k)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 2. IVF Index" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "CPU times: user 10.6 s, sys: 831 ms, total: 11.4 s\n", - "Wall time: 419 ms\n" - ] - } - ], - "source": [ - "%%time\n", - "nlist = 5\n", - "nprob = 3\n", - "\n", - "quantizer = faiss.IndexFlatL2(d)\n", - "index = faiss.IndexIVFFlat(quantizer, d, nlist)\n", - "index.nprobe = nprob\n", - "\n", - "index.train(corpus)\n", - "index.add(corpus)" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "CPU times: user 9min 15s, sys: 598 ms, total: 9min 16s\n", - "Wall time: 12.5 s\n" - ] - } - ], - "source": [ - "%%time\n", - "D, I = index.search(query, k)" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Recall: 0.9999189999999997\n" - ] - } - ], - "source": [ - "recall = compute_recall(I, I_truth)\n", - "print(f\"Recall: {recall}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "From the test we can see that IVFFlatL2 has a pretty good promotion for the searching speed with a very tiny loss of recall." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 3. 
HNSW Index" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "CPU times: user 11min 21s, sys: 595 ms, total: 11min 22s\n", - "Wall time: 17 s\n" - ] - } - ], - "source": [ - "%%time\n", - "M = 64\n", - "ef_search = 32\n", - "ef_construction = 64\n", - "\n", - "index = faiss.IndexHNSWFlat(d, M)\n", - "# set the two parameters before adding data\n", - "index.hnsw.efConstruction = ef_construction\n", - "index.hnsw.efSearch = ef_search\n", - "\n", - "index.add(corpus)" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "CPU times: user 5.14 s, sys: 3.94 ms, total: 5.14 s\n", - "Wall time: 110 ms\n" - ] - } - ], - "source": [ - "%%time\n", - "D, I = index.search(query, k)" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Recall: 0.8963409999999716\n" - ] - } - ], - "source": [ - "recall = compute_recall(I, I_truth)\n", - "print(f\"Recall: {recall}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "From the searching time of less than 1 second, we can see why HNSW is one of the best choice when looking for an extreme speed during searching phase. The reduction of recall is acceptable. But the longer time during creation of index and large memory footprint need to be considered." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 4. LSH" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "CPU times: user 13.7 s, sys: 660 ms, total: 14.4 s\n", - "Wall time: 12.1 s\n" - ] - } - ], - "source": [ - "%%time\n", - "nbits = d * 8\n", - "\n", - "index = faiss.IndexLSH(d, nbits)\n", - "index.train(corpus)\n", - "index.add(corpus)" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "CPU times: user 3min 20s, sys: 84.2 ms, total: 3min 20s\n", - "Wall time: 5.64 s\n" - ] - } - ], - "source": [ - "%%time\n", - "D, I = index.search(query, k)" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Recall: 0.5856720000000037\n" - ] - } - ], - "source": [ - "recall = compute_recall(I, I_truth)\n", - "print(f\"Recall: {recall}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "As we covered in the last notebook, LSH is not a good choice when the data dimension is large. Here 128 is already burdened for LSH. As we can see, even we choose a relatively small `nbits` of d * 8, the index creating time and search time are still pretty long. And the recall of about 58.6% is not satisfactory." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 5. 
Scalar Quantizer Index" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "CPU times: user 550 ms, sys: 18 ms, total: 568 ms\n", - "Wall time: 87.4 ms\n" - ] - } - ], - "source": [ - "%%time\n", - "qtype = faiss.ScalarQuantizer.QT_8bit\n", - "metric = faiss.METRIC_L2\n", - "\n", - "index = faiss.IndexScalarQuantizer(d, qtype, metric)\n", - "index.train(corpus)\n", - "index.add(corpus)" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "CPU times: user 7min 36s, sys: 169 ms, total: 7min 36s\n", - "Wall time: 12.7 s\n" - ] - } - ], - "source": [ - "%%time\n", - "D, I = index.search(query, k)" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Recall: 0.990444999999872\n" - ] - } - ], - "source": [ - "recall = compute_recall(I, I_truth)\n", - "print(f\"Recall: {recall}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Here scalar quantizer index's performance looks very similar to the Flat index. Because the elements of vectors in the SIFT dataset are integers in the range of [0, 218]. Thus the index does not lose to much information during scalar quantization. For the dataset with more complex distribution in float32. The difference will be more obvious." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 6. Product Quantizer Index" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "CPU times: user 46.7 s, sys: 22.3 ms, total: 46.7 s\n", - "Wall time: 1.36 s\n" - ] - } - ], - "source": [ - "%%time\n", - "M = 16\n", - "nbits = 8\n", - "metric = faiss.METRIC_L2\n", - "\n", - "index = faiss.IndexPQ(d, M, nbits, metric)\n", - "\n", - "index.train(corpus)\n", - "index.add(corpus)" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "CPU times: user 1min 37s, sys: 106 ms, total: 1min 37s\n", - "Wall time: 2.8 s\n" - ] - } - ], - "source": [ - "%%time\n", - "D, I = index.search(query, k)" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Recall: 0.630898999999999\n" - ] - } - ], - "source": [ - "recall = compute_recall(I, I_truth)\n", - "print(f\"Recall: {recall}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Product quantizer index is not standout in any one of the aspect. But it somewhat balance the tradeoffs. It is widely used in real applications with the combination of other indexes such as IVF or HNSW." 
- ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "base", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.13" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/docs/source/tutorial/4_Evaluation.rst b/docs/source/tutorial/4_Evaluation.rst deleted file mode 100644 index 403b804d..00000000 --- a/docs/source/tutorial/4_Evaluation.rst +++ /dev/null @@ -1,12 +0,0 @@ -4. Evaluation -============= - -.. toctree:: - :hidden: - :maxdepth: 1 - :caption: Evaluation - - 4_Evaluation/4.1.1 - 4_Evaluation/4.2.1 - 4_Evaluation/4.2.2 - 4_Evaluation/4.3.1 diff --git a/docs/source/tutorial/4_Evaluation/4.1.1.ipynb b/docs/source/tutorial/4_Evaluation/4.1.1.ipynb deleted file mode 100644 index ad24b17a..00000000 --- a/docs/source/tutorial/4_Evaluation/4.1.1.ipynb +++ /dev/null @@ -1,509 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Evaluation" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Evaluation is a crucial part in all machine learning tasks. In this notebook, we will walk through the whole pipeline of evaluating the performance of an embedding model on [MS Marco](https://microsoft.github.io/msmarco/), and use three metrics to show its performance." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Step 0: Setup" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Install the dependencies in the environment." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%pip install -U FlagEmbedding faiss-cpu" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Step 1: Load Dataset" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "First, download the queries and MS Marco from Huggingface Dataset" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "from datasets import load_dataset\n", - "import numpy as np\n", - "\n", - "data = load_dataset(\"namespace-Pt/msmarco\", split=\"dev\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Considering time cost, we will use the truncated dataset in this tutorial. `queries` contains the first 100 queries from the dataset. `corpus` is formed by the positives of the the first 5,000 queries." 
- ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "queries = np.array(data[:100][\"query\"])\n", - "corpus = sum(data[:5000][\"positive\"], [])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "If you have GPU and would like to try out the full evaluation of MS Marco, uncomment and run the following cell:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# data = load_dataset(\"namespace-Pt/msmarco\", split=\"dev\")\n", - "# queries = np.array(data[\"query\"])\n", - "\n", - "# corpus = load_dataset(\"namespace-PT/msmarco-corpus\", split=\"train\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Step 2: Embedding" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Choose the embedding model that we would like to evaluate, and encode the corpus to embeddings." - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Inference Embeddings: 100%|██████████| 21/21 [02:10<00:00, 6.22s/it]" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "shape of the corpus embeddings: (5331, 768)\n", - "data type of the embeddings: float32\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n" - ] - } - ], - "source": [ - "from FlagEmbedding import FlagModel\n", - "\n", - "# get the BGE embedding model\n", - "model = FlagModel('BAAI/bge-base-en-v1.5',\n", - " query_instruction_for_retrieval=\"Represent this sentence for searching relevant passages:\",\n", - " use_fp16=True)\n", - "\n", - "# get the embedding of the corpus\n", - "corpus_embeddings = model.encode(corpus)\n", - "\n", - "print(\"shape of the corpus embeddings:\", corpus_embeddings.shape)\n", - "print(\"data type of the embeddings: \", corpus_embeddings.dtype)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Step 3: Indexing" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We use the index_factory() functions to create a Faiss index we want:\n", - "\n", - "- The first argument `dim` is the dimension of the vector space, in this case is 768 if you're using bge-base-en-v1.5.\n", - "\n", - "- The second argument `'Flat'` makes the index do exhaustive search.\n", - "\n", - "- The thrid argument `faiss.METRIC_INNER_PRODUCT` tells the index to use inner product as the distance metric." 
- ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "total number of vectors: 5331\n" - ] - } - ], - "source": [ - "import faiss\n", - "\n", - "# get the length of our embedding vectors, vectors by bge-base-en-v1.5 have length 768\n", - "dim = corpus_embeddings.shape[-1]\n", - "\n", - "# create the faiss index and store the corpus embeddings into the vector space\n", - "index = faiss.index_factory(dim, 'Flat', faiss.METRIC_INNER_PRODUCT)\n", - "corpus_embeddings = corpus_embeddings.astype(np.float32)\n", - "# train and add the embeddings to the index\n", - "index.train(corpus_embeddings)\n", - "index.add(corpus_embeddings)\n", - "\n", - "print(f\"total number of vectors: {index.ntotal}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Since the embedding process is time consuming, it's a good choice to save the index for reproduction or other experiments.\n", - "\n", - "Uncomment the following lines to save the index." - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "# path = \"./index.bin\"\n", - "# faiss.write_index(index, path)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "If you already have stored index in your local directory, you can load it by:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# index = faiss.read_index(\"./index.bin\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Step 4: Retrieval" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Get the embeddings of all the queries, and get their corresponding ground truth answers for evaluation." - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "query_embeddings = model.encode_queries(queries)\n", - "ground_truths = [d[\"positive\"] for d in data]\n", - "corpus = np.asarray(corpus)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Use the faiss index to search top $k$ answers of each query." 
- ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Searching: 100%|██████████| 1/1 [00:00<00:00, 20.91it/s]\n" - ] - } - ], - "source": [ - "from tqdm import tqdm\n", - "\n", - "res_scores, res_ids, res_text = [], [], []\n", - "query_size = len(query_embeddings)\n", - "batch_size = 256\n", - "# The cutoffs we will use during evaluation, and set k to be the maximum of the cutoffs.\n", - "cut_offs = [1, 10]\n", - "k = max(cut_offs)\n", - "\n", - "for i in tqdm(range(0, query_size, batch_size), desc=\"Searching\"):\n", - " q_embedding = query_embeddings[i: min(i+batch_size, query_size)].astype(np.float32)\n", - " # search the top k answers for each of the queries\n", - " score, idx = index.search(q_embedding, k=k)\n", - " res_scores += list(score)\n", - " res_ids += list(idx)\n", - " res_text += list(corpus[idx])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Step 5: Evaluate" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 5.1 Recall" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Recall represents the model's capability of correctly predicting positive instances from all the actual positive samples in the dataset.\n", - "\n", - "$$\\textbf{Recall}=\\frac{\\text{True Positives}}{\\text{True Positives}+\\text{False Negatives}}$$" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Recall is useful when the cost of false negatives is high. In other words, we are trying to find all objects of the positive class, even if this results in some false positives. This attribute makes recall a useful metric for text retrieval tasks." - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "recall@1: 0.97\n", - "recall@10: 1.0\n" - ] - } - ], - "source": [ - "def calc_recall(preds, truths, cutoffs):\n", - " recalls = np.zeros(len(cutoffs))\n", - " for text, truth in zip(preds, truths):\n", - " for i, c in enumerate(cutoffs):\n", - " recall = np.intersect1d(truth, text[:c])\n", - " recalls[i] += len(recall) / max(min(c, len(truth)), 1)\n", - " recalls /= len(preds)\n", - " return recalls\n", - "\n", - "recalls = calc_recall(res_text, ground_truths, cut_offs)\n", - "for i, c in enumerate(cut_offs):\n", - " print(f\"recall@{c}: {recalls[i]}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 5.2 MRR" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Mean Reciprocal Rank ([MRR](https://en.wikipedia.org/wiki/Mean_reciprocal_rank)) is a widely used metric in information retrieval to evaluate the effectiveness of a system. It measures the rank position of the first relevant result in a list of search results.\n", - "\n", - "$$MRR=\\frac{1}{|Q|}\\sum_{i=1}^{|Q|}\\frac{1}{rank_i}$$\n", - "\n", - "where \n", - "- $|Q|$ is the total number of queries.\n", - "- $rank_i$ is the rank position of the first relevant document of the i-th query." 
- ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [], - "source": [ - "def MRR(preds, truth, cutoffs):\n", - " mrr = [0 for _ in range(len(cutoffs))]\n", - " for pred, t in zip(preds, truth):\n", - " for i, c in enumerate(cutoffs):\n", - " for j, p in enumerate(pred):\n", - " if j < c and p in t:\n", - " mrr[i] += 1/(j+1)\n", - " break\n", - " mrr = [k/len(preds) for k in mrr]\n", - " return mrr" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "MRR@1: 0.97\n", - "MRR@10: 0.9825\n" - ] - } - ], - "source": [ - "mrr = MRR(res_text, ground_truths, cut_offs)\n", - "for i, c in enumerate(cut_offs):\n", - " print(f\"MRR@{c}: {mrr[i]}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 5.3 nDCG" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Normalized Discounted cumulative gain (nDCG) measures the quality of a ranked list of search results by considering both the position of the relevant documents and their graded relevance scores. The calculation of nDCG involves two main steps:\n", - "\n", - "1. Discounted cumulative gain (DCG) measures the ranking quality in retrieval tasks.\n", - "\n", - "$$DCG_p=\\sum_{i=1}^p\\frac{2^{rel_i}-1}{\\log_2(i+1)}$$\n", - "\n", - "2. Normalized by ideal DCG to make it comparable across queries.\n", - "$$nDCG_p=\\frac{DCG_p}{IDCG_p}$$\n", - "where $IDCG$ is the maximum possible DCG for a given set of documents, assuming they are perfectly ranked in order of relevance." - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [], - "source": [ - "pred_hard_encodings = []\n", - "for pred, label in zip(res_text, ground_truths):\n", - " pred_hard_encoding = list(np.isin(pred, label).astype(int))\n", - " pred_hard_encodings.append(pred_hard_encoding)" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "nDCG@1: 0.97\n", - "nDCG@10: 0.9869253606521631\n" - ] - } - ], - "source": [ - "from sklearn.metrics import ndcg_score\n", - "\n", - "for i, c in enumerate(cut_offs):\n", - " nDCG = ndcg_score(pred_hard_encodings, res_scores, k=c)\n", - " print(f\"nDCG@{c}: {nDCG}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Congrats! You have walked through a full pipeline of evaluating an embedding model. Feel free to play with different datasets and models!" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "base", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.13" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/docs/source/tutorial/4_Evaluation/4.2.1.ipynb b/docs/source/tutorial/4_Evaluation/4.2.1.ipynb deleted file mode 100644 index 3f636f19..00000000 --- a/docs/source/tutorial/4_Evaluation/4.2.1.ipynb +++ /dev/null @@ -1,436 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# MTEB" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "For evaluation of embedding models, MTEB is one of the most well-known benchmark. 
In this tutorial, we'll introduce MTEB, its basic usage, and evaluate how your model performs on the MTEB leaderboard." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 0. Installation" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Install the packages we will use in your environment:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%%capture\n", - "%pip install sentence_transformers mteb" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 1. Intro" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The [Massive Text Embedding Benchmark (MTEB)](https://github.com/embeddings-benchmark/mteb) is a large-scale evaluation framework designed to assess the performance of text embedding models across a wide variety of natural language processing (NLP) tasks. Introduced to standardize and improve the evaluation of text embeddings, MTEB is crucial for assessing how well these models generalize across various real-world applications. It contains a wide range of datasets in eight main NLP tasks and different languages, and provides an easy pipeline for evaluation.\n", - "\n", - "MTEB is also well known for the MTEB leaderboard, which contains a ranking of the latest first-class embedding models. We'll cover that in the next tutorial. Now let's have a look on how to use MTEB to do evaluation easily." - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [], - "source": [ - "import mteb\n", - "from sentence_transformers import SentenceTransformer" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now let's take a look at how to use MTEB to do a quick evaluation.\n", - "\n", - "First we load the model that we would like to evaluate on:" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [], - "source": [ - "model_name = \"BAAI/bge-base-en-v1.5\"\n", - "model = SentenceTransformer(model_name)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Below is the list of datasets of retrieval used by MTEB's English leaderboard.\n", - "\n", - "MTEB directly use the open source benchmark BEIR in its retrieval part, which contains 15 datasets (note there are 12 subsets of CQADupstack)." 
- ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [], - "source": [ - "retrieval_tasks = [\n", - " \"ArguAna\",\n", - " \"ClimateFEVER\",\n", - " \"CQADupstackAndroidRetrieval\",\n", - " \"CQADupstackEnglishRetrieval\",\n", - " \"CQADupstackGamingRetrieval\",\n", - " \"CQADupstackGisRetrieval\",\n", - " \"CQADupstackMathematicaRetrieval\",\n", - " \"CQADupstackPhysicsRetrieval\",\n", - " \"CQADupstackProgrammersRetrieval\",\n", - " \"CQADupstackStatsRetrieval\",\n", - " \"CQADupstackTexRetrieval\",\n", - " \"CQADupstackUnixRetrieval\",\n", - " \"CQADupstackWebmastersRetrieval\",\n", - " \"CQADupstackWordpressRetrieval\",\n", - " \"DBPedia\",\n", - " \"FEVER\",\n", - " \"FiQA2018\",\n", - " \"HotpotQA\",\n", - " \"MSMARCO\",\n", - " \"NFCorpus\",\n", - " \"NQ\",\n", - " \"QuoraRetrieval\",\n", - " \"SCIDOCS\",\n", - " \"SciFact\",\n", - " \"Touche2020\",\n", - " \"TRECCOVID\",\n", - "]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "For demonstration, let's just run the first one, \"ArguAna\".\n", - "\n", - "For a full list of tasks and languages that MTEB supports, check the [page](https://github.com/embeddings-benchmark/mteb/blob/18662380f0f476db3d170d0926892045aa9f74ee/docs/tasks.md)." - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [], - "source": [ - "tasks = mteb.get_tasks(tasks=retrieval_tasks[:1])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Then, create and initialize an MTEB instance with our chosen tasks, and run the evaluation process." - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
───────────────────────────────────────────────── Selected tasks  ─────────────────────────────────────────────────\n",
-       "
\n" - ], - "text/plain": [ - "\u001b[38;5;235m───────────────────────────────────────────────── \u001b[0m\u001b[1mSelected tasks \u001b[0m\u001b[38;5;235m ─────────────────────────────────────────────────\u001b[0m\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
Retrieval\n",
-       "
\n" - ], - "text/plain": [ - "\u001b[1mRetrieval\u001b[0m\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
    - ArguAna, s2p\n",
-       "
\n" - ], - "text/plain": [ - " - ArguAna, \u001b[3;38;5;241ms2p\u001b[0m\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
\n",
-       "\n",
-       "
\n" - ], - "text/plain": [ - "\n", - "\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Batches: 100%|██████████| 44/44 [00:41<00:00, 1.06it/s]\n", - "Batches: 100%|██████████| 272/272 [03:36<00:00, 1.26it/s]\n" - ] - } - ], - "source": [ - "# use the tasks we chose to initialize the MTEB instance\n", - "evaluation = mteb.MTEB(tasks=tasks)\n", - "\n", - "# call run() with the model and output_folder\n", - "results = evaluation.run(model, output_folder=\"results\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The results should be stored in `{output_folder}/{model_name}/{model_revision}/{task_name}.json`.\n", - "\n", - "Openning the json file you should see contents as below, which are the evaluation results on \"ArguAna\" with different metrics on cutoffs from 1 to 1000." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "```python\n", - "{\n", - " \"dataset_revision\": \"c22ab2a51041ffd869aaddef7af8d8215647e41a\",\n", - " \"evaluation_time\": 260.14976954460144,\n", - " \"kg_co2_emissions\": null,\n", - " \"mteb_version\": \"1.14.17\",\n", - " \"scores\": {\n", - " \"test\": [\n", - " {\n", - " \"hf_subset\": \"default\",\n", - " \"languages\": [\n", - " \"eng-Latn\"\n", - " ],\n", - " \"main_score\": 0.63616,\n", - " \"map_at_1\": 0.40754,\n", - " \"map_at_10\": 0.55773,\n", - " \"map_at_100\": 0.56344,\n", - " \"map_at_1000\": 0.56347,\n", - " \"map_at_20\": 0.56202,\n", - " \"map_at_3\": 0.51932,\n", - " \"map_at_5\": 0.54023,\n", - " \"mrr_at_1\": 0.4139402560455192,\n", - " \"mrr_at_10\": 0.5603739077423295,\n", - " \"mrr_at_100\": 0.5660817425350153,\n", - " \"mrr_at_1000\": 0.5661121884705748,\n", - " \"mrr_at_20\": 0.564661930998293,\n", - " \"mrr_at_3\": 0.5208629682313899,\n", - " \"mrr_at_5\": 0.5429113323850182,\n", - " \"nauc_map_at_1000_diff1\": 0.15930478114759905,\n", - " \"nauc_map_at_1000_max\": -0.06396189194646361,\n", - " \"nauc_map_at_1000_std\": -0.13168797291549253,\n", - " \"nauc_map_at_100_diff1\": 0.15934819555197366,\n", - " \"nauc_map_at_100_max\": -0.06389635013430676,\n", - " \"nauc_map_at_100_std\": -0.13164524259533786,\n", - " \"nauc_map_at_10_diff1\": 0.16057318234658585,\n", - " \"nauc_map_at_10_max\": -0.060962623117325254,\n", - " \"nauc_map_at_10_std\": -0.1300413865104607,\n", - " \"nauc_map_at_1_diff1\": 0.17346152653542332,\n", - " \"nauc_map_at_1_max\": -0.09705499215630589,\n", - " \"nauc_map_at_1_std\": -0.14726476953035533,\n", - " \"nauc_map_at_20_diff1\": 0.15956349246366208,\n", - " \"nauc_map_at_20_max\": -0.06259296677860492,\n", - " \"nauc_map_at_20_std\": -0.13097093150054095,\n", - " \"nauc_map_at_3_diff1\": 0.15620049317363813,\n", - " \"nauc_map_at_3_max\": -0.06690213479396273,\n", - " \"nauc_map_at_3_std\": -0.13440904793529648,\n", - " \"nauc_map_at_5_diff1\": 0.1557795701081579,\n", - " \"nauc_map_at_5_max\": -0.06255283252590663,\n", - " \"nauc_map_at_5_std\": -0.1355361594910923,\n", - " \"nauc_mrr_at_1000_diff1\": 0.1378988612808882,\n", - " \"nauc_mrr_at_1000_max\": -0.07507962333910836,\n", - " \"nauc_mrr_at_1000_std\": -0.12969109830101241,\n", - " \"nauc_mrr_at_100_diff1\": 0.13794450668758515,\n", - " \"nauc_mrr_at_100_max\": -0.07501290390362861,\n", - " \"nauc_mrr_at_100_std\": -0.12964855554504057,\n", - " \"nauc_mrr_at_10_diff1\": 0.1396047981645623,\n", - " \"nauc_mrr_at_10_max\": -0.07185174301688693,\n", - " \"nauc_mrr_at_10_std\": -0.12807325096717753,\n", - " 
\"nauc_mrr_at_1_diff1\": 0.15610387932529113,\n", - " \"nauc_mrr_at_1_max\": -0.09824591983546396,\n", - " \"nauc_mrr_at_1_std\": -0.13914318784294258,\n", - " \"nauc_mrr_at_20_diff1\": 0.1382786098284509,\n", - " \"nauc_mrr_at_20_max\": -0.07364476417961506,\n", - " \"nauc_mrr_at_20_std\": -0.12898192060943495,\n", - " \"nauc_mrr_at_3_diff1\": 0.13118224861025093,\n", - " \"nauc_mrr_at_3_max\": -0.08164985279853691,\n", - " \"nauc_mrr_at_3_std\": -0.13241573571401533,\n", - " \"nauc_mrr_at_5_diff1\": 0.1346130730317385,\n", - " \"nauc_mrr_at_5_max\": -0.07404093236468848,\n", - " \"nauc_mrr_at_5_std\": -0.1340775377068567,\n", - " \"nauc_ndcg_at_1000_diff1\": 0.15919987960292029,\n", - " \"nauc_ndcg_at_1000_max\": -0.05457945565481172,\n", - " \"nauc_ndcg_at_1000_std\": -0.12457339152558143,\n", - " \"nauc_ndcg_at_100_diff1\": 0.1604091882521101,\n", - " \"nauc_ndcg_at_100_max\": -0.05281549383775287,\n", - " \"nauc_ndcg_at_100_std\": -0.12347288098914058,\n", - " \"nauc_ndcg_at_10_diff1\": 0.1657018523692905,\n", - " \"nauc_ndcg_at_10_max\": -0.036222943297402846,\n", - " \"nauc_ndcg_at_10_std\": -0.11284619565817842,\n", - " \"nauc_ndcg_at_1_diff1\": 0.17346152653542332,\n", - " \"nauc_ndcg_at_1_max\": -0.09705499215630589,\n", - " \"nauc_ndcg_at_1_std\": -0.14726476953035533,\n", - " \"nauc_ndcg_at_20_diff1\": 0.16231721725673165,\n", - " \"nauc_ndcg_at_20_max\": -0.04147115653921931,\n", - " \"nauc_ndcg_at_20_std\": -0.11598700704312062,\n", - " \"nauc_ndcg_at_3_diff1\": 0.15256475371124711,\n", - " \"nauc_ndcg_at_3_max\": -0.05432154580979357,\n", - " \"nauc_ndcg_at_3_std\": -0.12841084787822227,\n", - " \"nauc_ndcg_at_5_diff1\": 0.15236205846534961,\n", - " \"nauc_ndcg_at_5_max\": -0.04356123278888682,\n", - " \"nauc_ndcg_at_5_std\": -0.12942556865700913,\n", - " \"nauc_precision_at_1000_diff1\": -0.038790629929866066,\n", - " \"nauc_precision_at_1000_max\": 0.3630826341915611,\n", - " \"nauc_precision_at_1000_std\": 0.4772189839676386,\n", - " \"nauc_precision_at_100_diff1\": 0.32118609204433185,\n", - " \"nauc_precision_at_100_max\": 0.4740132817600036,\n", - " \"nauc_precision_at_100_std\": 0.3456396169952022,\n", - " \"nauc_precision_at_10_diff1\": 0.22279659689895104,\n", - " \"nauc_precision_at_10_max\": 0.16823918613191954,\n", - " \"nauc_precision_at_10_std\": 0.0377209694331257,\n", - " \"nauc_precision_at_1_diff1\": 0.17346152653542332,\n", - " \"nauc_precision_at_1_max\": -0.09705499215630589,\n", - " \"nauc_precision_at_1_std\": -0.14726476953035533,\n", - " \"nauc_precision_at_20_diff1\": 0.23025740175221762,\n", - " \"nauc_precision_at_20_max\": 0.2892313928157665,\n", - " \"nauc_precision_at_20_std\": 0.13522755012490692,\n", - " \"nauc_precision_at_3_diff1\": 0.1410889527057097,\n", - " \"nauc_precision_at_3_max\": -0.010771302313530132,\n", - " \"nauc_precision_at_3_std\": -0.10744937823276193,\n", - " \"nauc_precision_at_5_diff1\": 0.14012953903010988,\n", - " \"nauc_precision_at_5_max\": 0.03977485677045894,\n", - " \"nauc_precision_at_5_std\": -0.10292184602358977,\n", - " \"nauc_recall_at_1000_diff1\": -0.03879062992990034,\n", - " \"nauc_recall_at_1000_max\": 0.36308263419153386,\n", - " \"nauc_recall_at_1000_std\": 0.47721898396760526,\n", - " \"nauc_recall_at_100_diff1\": 0.3211860920443005,\n", - " \"nauc_recall_at_100_max\": 0.4740132817599919,\n", - " \"nauc_recall_at_100_std\": 0.345639616995194,\n", - " \"nauc_recall_at_10_diff1\": 0.22279659689895054,\n", - " \"nauc_recall_at_10_max\": 0.16823918613192046,\n", - " \"nauc_recall_at_10_std\": 
0.037720969433127145,\n", - " \"nauc_recall_at_1_diff1\": 0.17346152653542332,\n", - " \"nauc_recall_at_1_max\": -0.09705499215630589,\n", - " \"nauc_recall_at_1_std\": -0.14726476953035533,\n", - " \"nauc_recall_at_20_diff1\": 0.23025740175221865,\n", - " \"nauc_recall_at_20_max\": 0.2892313928157675,\n", - " \"nauc_recall_at_20_std\": 0.13522755012490456,\n", - " \"nauc_recall_at_3_diff1\": 0.14108895270570979,\n", - " \"nauc_recall_at_3_max\": -0.010771302313529425,\n", - " \"nauc_recall_at_3_std\": -0.10744937823276134,\n", - " \"nauc_recall_at_5_diff1\": 0.14012953903010958,\n", - " \"nauc_recall_at_5_max\": 0.039774856770459645,\n", - " \"nauc_recall_at_5_std\": -0.10292184602358935,\n", - " \"ndcg_at_1\": 0.40754,\n", - " \"ndcg_at_10\": 0.63616,\n", - " \"ndcg_at_100\": 0.66063,\n", - " \"ndcg_at_1000\": 0.6613,\n", - " \"ndcg_at_20\": 0.65131,\n", - " \"ndcg_at_3\": 0.55717,\n", - " \"ndcg_at_5\": 0.59461,\n", - " \"precision_at_1\": 0.40754,\n", - " \"precision_at_10\": 0.08841,\n", - " \"precision_at_100\": 0.00991,\n", - " \"precision_at_1000\": 0.001,\n", - " \"precision_at_20\": 0.04716,\n", - " \"precision_at_3\": 0.22238,\n", - " \"precision_at_5\": 0.15149,\n", - " \"recall_at_1\": 0.40754,\n", - " \"recall_at_10\": 0.88407,\n", - " \"recall_at_100\": 0.99147,\n", - " \"recall_at_1000\": 0.99644,\n", - " \"recall_at_20\": 0.9431,\n", - " \"recall_at_3\": 0.66714,\n", - " \"recall_at_5\": 0.75747\n", - " }\n", - " ]\n", - " },\n", - " \"task_name\": \"ArguAna\"\n", - "}\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now we've successfully run the evaluation using mteb! In the next tutorial, we'll show how to evaluate your model on the whole 56 tasks of English MTEB and compete with models on the leaderboard." - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "base", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.13" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/docs/source/tutorial/4_Evaluation/4.2.2.ipynb b/docs/source/tutorial/4_Evaluation/4.2.2.ipynb deleted file mode 100644 index aa71df61..00000000 --- a/docs/source/tutorial/4_Evaluation/4.2.2.ipynb +++ /dev/null @@ -1,302 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# MTEB Leaderboard" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In the last tutorial we show how to evaluate an embedding model on an dataset supported by MTEB. In this tutorial, we will go through how to do a full evaluation and compare the results with MTEB English leaderboard.\n", - "\n", - "Caution: Evaluation on the full Eng MTEB is very time consuming even with GPU. So we encourage you to go through the notebook to have an idea. And run the experiment when you have enough computing resource and time." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 0. 
Installation" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Install the packages we will use in your environment:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%%capture\n", - "%pip install sentence_transformers mteb" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 1. Run the Evaluation" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The MTEB English leaderboard contains 56 datasets on 7 tasks:\n", - "1. **Classification**: Use the embeddings to train a logistic regression on the train set and is scored on the test set. F1 is the main metric.\n", - "2. **Clustering**: Train a mini-batch k-means model with batch size 32 and k equals to the number of different labels. Then score using v-measure.\n", - "3. **Pair Classification**: A pair of text inputs is provided and a label which is a binary variable needs to be assigned. The main metric is average precision score.\n", - "4. **Reranking**: Rank a list of relevant and irrelevant reference texts according to a query. Metrics are mean MRR@k and MAP.\n", - "5. **Retrieval**: Each dataset comprises corpus, queries, and a mapping that links each query to its relevant documents within the corpus. The goal is to retrieve relevant documents for each query. The main metric is nDCG@k. MTEB directly adopts BEIR for the retrieval task.\n", - "6. **Semantic Textual Similarity (STS)**: Determine the similarity between each sentence pair. Spearman correlation based on cosine\n", - "similarity serves as the main metric.\n", - "7. **Summarization**: Only 1 dataset is used in this task. Score the machine-generated summaries to human-written summaries by computing distances of their embeddings. The main metric is also Spearman correlation based on cosine similarity.\n", - "\n", - "The benchmark is widely accepted by researchers and engineers to fairly evaluate and compare the performance of the models they train. Now let's take a look at the whole evaluation pipeline" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Import the `MTEB_MAIN_EN` to check the all 56 datasets." 
- ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "['AmazonCounterfactualClassification', 'AmazonPolarityClassification', 'AmazonReviewsClassification', 'ArguAna', 'ArxivClusteringP2P', 'ArxivClusteringS2S', 'AskUbuntuDupQuestions', 'BIOSSES', 'Banking77Classification', 'BiorxivClusteringP2P', 'BiorxivClusteringS2S', 'CQADupstackAndroidRetrieval', 'CQADupstackEnglishRetrieval', 'CQADupstackGamingRetrieval', 'CQADupstackGisRetrieval', 'CQADupstackMathematicaRetrieval', 'CQADupstackPhysicsRetrieval', 'CQADupstackProgrammersRetrieval', 'CQADupstackStatsRetrieval', 'CQADupstackTexRetrieval', 'CQADupstackUnixRetrieval', 'CQADupstackWebmastersRetrieval', 'CQADupstackWordpressRetrieval', 'ClimateFEVER', 'DBPedia', 'EmotionClassification', 'FEVER', 'FiQA2018', 'HotpotQA', 'ImdbClassification', 'MSMARCO', 'MTOPDomainClassification', 'MTOPIntentClassification', 'MassiveIntentClassification', 'MassiveScenarioClassification', 'MedrxivClusteringP2P', 'MedrxivClusteringS2S', 'MindSmallReranking', 'NFCorpus', 'NQ', 'QuoraRetrieval', 'RedditClustering', 'RedditClusteringP2P', 'SCIDOCS', 'SICK-R', 'STS12', 'STS13', 'STS14', 'STS15', 'STS16', 'STS17', 'STS22', 'STSBenchmark', 'SciDocsRR', 'SciFact', 'SprintDuplicateQuestions', 'StackExchangeClustering', 'StackExchangeClusteringP2P', 'StackOverflowDupQuestions', 'SummEval', 'TRECCOVID', 'Touche2020', 'ToxicConversationsClassification', 'TweetSentimentExtractionClassification', 'TwentyNewsgroupsClustering', 'TwitterSemEval2015', 'TwitterURLCorpus']\n" - ] - } - ], - "source": [ - "import mteb\n", - "from mteb.benchmarks import MTEB_MAIN_EN\n", - "\n", - "print(MTEB_MAIN_EN.tasks)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Load the model we want to evaluate:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from sentence_transformers import SentenceTransformer\n", - "\n", - "model_name = \"BAAI/bge-base-en-v1.5\"\n", - "model = SentenceTransformer(model_name)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Alternatively, MTEB provides popular models on their leaderboard in order to reproduce their results." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "model_name = \"BAAI/bge-base-en-v1.5\"\n", - "model = mteb.get_model(model_name)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Then start to evaluate on each dataset:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "for task in MTEB_MAIN_EN.tasks:\n", - " # get the test set to evaluate on\n", - " eval_splits = [\"dev\"] if task == \"MSMARCO\" else [\"test\"]\n", - " evaluation = mteb.MTEB(\n", - " tasks=[task], task_langs=[\"en\"]\n", - " ) # Remove \"en\" to run all available languages\n", - " evaluation.run(\n", - " model, output_folder=\"results\", eval_splits=eval_splits\n", - " )" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 2. Submit to MTEB Leaderboard" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "After the evaluation is done, all the evaluation results should be stored in `results/{model_name}/{model_revision}`.\n", - "\n", - "Then run the following shell command to create the model_card.md. Change {model_name} and {model_revision} to your path." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!mteb create_meta --results_folder results/{model_name}/{model_revision} --output_path model_card.md" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "For the case that the readme of that model already exists:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# !mteb create_meta --results_folder results/{model_name}/{model_revision} --output_path model_card.md --from_existing your_existing_readme.md " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Copy and paste the contents of model_card.md to the top of README.md of your model on HF Hub. Now relax and wait for the daily refresh of leaderboard. Your model will show up soon!" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 3. Partially Evaluate" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Note that you don't need to finish all the tasks to get on to the leaderboard.\n", - "\n", - "For example you fine-tune a model's ability on clustering. And you only care about how your model performs with respoect to clustering, but not the other tasks. Then you can just test its performance on the clustering tasks of MTEB and submit to the leaderboard." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "TASK_LIST_CLUSTERING = [\n", - " \"ArxivClusteringP2P\",\n", - " \"ArxivClusteringS2S\",\n", - " \"BiorxivClusteringP2P\",\n", - " \"BiorxivClusteringS2S\",\n", - " \"MedrxivClusteringP2P\",\n", - " \"MedrxivClusteringS2S\",\n", - " \"RedditClustering\",\n", - " \"RedditClusteringP2P\",\n", - " \"StackExchangeClustering\",\n", - " \"StackExchangeClusteringP2P\",\n", - " \"TwentyNewsgroupsClustering\",\n", - "]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Run the evaluation with only clustering tasks:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "evaluation = mteb.MTEB(tasks=TASK_LIST_CLUSTERING)\n", - "\n", - "results = evaluation.run(model, output_folder=\"results\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Then repeat Step 2 to submit your model. After the leaderboard refresh, you can find your model in the \"Clustering\" section of the leaderboard." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 4. Future Work" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "MTEB is working on a new version of English benchmark. It contains updated and concise tasks and will make the evaluation process faster.\n", - "\n", - "Please check out their [GitHub](https://github.com/embeddings-benchmark/mteb) page for future updates and releases." 
- ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "base", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.13" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/docs/source/tutorial/4_Evaluation/4.3.1.ipynb b/docs/source/tutorial/4_Evaluation/4.3.1.ipynb deleted file mode 100644 index 5832680f..00000000 --- a/docs/source/tutorial/4_Evaluation/4.3.1.ipynb +++ /dev/null @@ -1,240 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# C-MTEB" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "C-MTEB is the largest benchmark for Chinese text embeddings, similar to MTEB. In this tutorial, we will go through how to evaluate an embedding model's ability on Chinese tasks in C-MTEB." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 0. Installation" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "First install dependent packages:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%pip install FlagEmbedding mteb" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 1. Datasets" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "C-MTEB uses similar task splits and metrics as English MTEB. It contains 35 datasets in 6 different tasks: Classification, Clustering, Pair Classification, Reranking, Retrieval, and Semantic Textual Similarity (STS). \n", - "\n", - "1. **Classification**: Use the embeddings to train a logistic regression on the train set and is scored on the test set. F1 is the main metric.\n", - "2. **Clustering**: Train a mini-batch k-means model with batch size 32 and k equals to the number of different labels. Then score using v-measure.\n", - "3. **Pair Classification**: A pair of text inputs is provided and a label which is a binary variable needs to be assigned. The main metric is average precision score.\n", - "4. **Reranking**: Rank a list of relevant and irrelevant reference texts according to a query. Metrics are mean MRR@k and MAP.\n", - "5. **Retrieval**: Each dataset comprises corpus, queries, and a mapping that links each query to its relevant documents within the corpus. The goal is to retrieve relevant documents for each query. The main metric is nDCG@k. MTEB directly adopts BEIR for the retrieval task.\n", - "6. **Semantic Textual Similarity (STS)**: Determine the similarity between each sentence pair. Spearman correlation based on cosine\n", - "similarity serves as the main metric.\n", - "\n", - "\n", - "Check the [HF page](https://huggingface.co/C-MTEB) for the details of each dataset." 
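Besides the HF page, the task descriptions can also be inspected programmatically. The snippet below is an added sketch, not from the original notebook; it assumes a recent mteb release in which each task object exposes a `metadata` attribute with `name`, `type`, `main_score`, and `description` fields, and it uses two task names taken from the Chinese task list defined in the next cell.

```python
import mteb

# Added sketch (not in the original notebook): print the metadata of a couple of
# C-MTEB tasks. Assumes each task object exposes a `metadata` attribute with
# `name`, `type`, `main_score`, and `description` fields.
tasks = mteb.get_tasks(tasks=["T2Retrieval", "Ocnli"])
for task in tasks:
    meta = task.metadata
    print(f"{meta.name} | {meta.type} | main score: {meta.main_score}")
    print(meta.description)
```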
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "ChineseTaskList = [\n", - " 'TNews', 'IFlyTek', 'MultilingualSentiment', 'JDReview', 'OnlineShopping', 'Waimai',\n", - " 'CLSClusteringS2S.v2', 'CLSClusteringP2P.v2', 'ThuNewsClusteringS2S.v2', 'ThuNewsClusteringP2P.v2',\n", - " 'Ocnli', 'Cmnli',\n", - " 'T2Reranking', 'MMarcoReranking', 'CMedQAv1-reranking', 'CMedQAv2-reranking',\n", - " 'T2Retrieval', 'MMarcoRetrieval', 'DuRetrieval', 'CovidRetrieval', 'CmedqaRetrieval', 'EcomRetrieval', 'MedicalRetrieval', 'VideoRetrieval',\n", - " 'ATEC', 'BQ', 'LCQMC', 'PAWSX', 'STSB', 'AFQMC', 'QBQTC'\n", - "]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 2. Model" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "First, load the model for evaluation. Note that the instruction here is used for retreival tasks." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from ...C_MTEB.flag_dres_model import FlagDRESModel\n", - "\n", - "instruction = \"为这个句子生成表示以用于检索相关文章:\"\n", - "model_name = \"BAAI/bge-base-zh-v1.5\"\n", - "\n", - "model = FlagDRESModel(model_name_or_path=\"BAAI/bge-base-zh-v1.5\",\n", - " query_instruction_for_retrieval=instruction,\n", - " pooling_method=\"cls\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Otherwise, you can load a model using sentence_transformers:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from sentence_transformers import SentenceTransformer\n", - "\n", - "model = SentenceTransformer(\"PATH_TO_MODEL\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Or implement a class following the structure below:\n", - "\n", - "```python\n", - "class MyModel():\n", - " def __init__(self):\n", - " \"\"\"initialize the tokenizer and model\"\"\"\n", - " pass\n", - "\n", - " def encode(self, sentences, batch_size=32, **kwargs):\n", - " \"\"\" Returns a list of embeddings for the given sentences.\n", - " Args:\n", - " sentences (`List[str]`): List of sentences to encode\n", - " batch_size (`int`): Batch size for the encoding\n", - "\n", - " Returns:\n", - " `List[np.ndarray]` or `List[tensor]`: List of embeddings for the given sentences\n", - " \"\"\"\n", - " pass\n", - "\n", - "model = MyModel()\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 3. Evaluate" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "After we've prepared the dataset and model, we can start the evaluation. For time efficiency, we highly recommend to use GPU for evaluation." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import mteb\n", - "from mteb import MTEB\n", - "\n", - "tasks = mteb.get_tasks(ChineseTaskList)\n", - "\n", - "for task in tasks:\n", - " evaluation = MTEB(tasks=[task])\n", - " evaluation.run(model, output_folder=f\"zh_results/{model_name.split('/')[-1]}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 4. Submit to MTEB Leaderboard" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "After the evaluation is done, all the evaluation results should be stored in `zh_results/{model_name}/`.\n", - "\n", - "Then run the following shell command to create the model_card.md. 
Change {model_name} and its following to your path." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!!mteb create_meta --results_folder results/{model_name}/ --output_path model_card.md" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Copy and paste the contents of model_card.md to the top of README.md of your model on HF Hub. Then goto the [MTEB leaderboard](https://huggingface.co/spaces/mteb/leaderboard) and choose the Chinese leaderboard to find your model! It will appear soon after the website's daily refresh." - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.2" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/docs/source/tutorial/5_Reranking.rst b/docs/source/tutorial/5_Reranking.rst deleted file mode 100644 index 9f9a8cbc..00000000 --- a/docs/source/tutorial/5_Reranking.rst +++ /dev/null @@ -1,9 +0,0 @@ -5. Reranking -============ - -.. toctree:: - :hidden: - :maxdepth: 1 - :caption: Reranking - - 5_Reranking/5.1 \ No newline at end of file diff --git a/docs/source/tutorial/5_Reranking/5.1.ipynb b/docs/source/tutorial/5_Reranking/5.1.ipynb deleted file mode 100644 index b87c70ff..00000000 --- a/docs/source/tutorial/5_Reranking/5.1.ipynb +++ /dev/null @@ -1,574 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Reranker" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Reranker is disigned in cross-encoder architecture that takes the query and text at the same time and directly output their score of similarity. It is more capable on scoring the query-text relevance, but with the tradeoff of slower speed. Thus, complete retrieval system usually contains retrievers in the first stage to do a large scope retrieval, and then follows by rerankers to rerank the results more precisely.\n", - "\n", - "In this tutorial, we will go through text retrieval pipeline with reranker and evaluate the results before and after reranking.\n", - "\n", - "Note: Step 1-4 are identical to the tutorial of [evaluation](https://github.com/FlagOpen/FlagEmbedding/tree/master/Tutorials/4_Evaluation). We suggest to first go through that if you are not familiar with retrieval." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 0. Setup" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Install the dependencies in the environment." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%pip install -U FlagEmbedding faiss-cpu" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 1. 
Dataset" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Download and preprocess the MS Marco dataset" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [], - "source": [ - "from datasets import load_dataset\n", - "import numpy as np\n", - "\n", - "data = load_dataset(\"namespace-Pt/msmarco\", split=\"dev\")" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [], - "source": [ - "queries = np.array(data[:100][\"query\"])\n", - "corpus = sum(data[:5000][\"positive\"], [])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 2. Embedding" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Inference Embeddings: 100%|██████████| 21/21 [01:59<00:00, 5.68s/it]" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "shape of the corpus embeddings: (5331, 768)\n", - "data type of the embeddings: float32\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n" - ] - } - ], - "source": [ - "from FlagEmbedding import FlagModel\n", - "\n", - "# get the BGE embedding model\n", - "model = FlagModel('BAAI/bge-base-en-v1.5',\n", - " query_instruction_for_retrieval=\"Represent this sentence for searching relevant passages:\",\n", - " use_fp16=True)\n", - "\n", - "# get the embedding of the corpus\n", - "corpus_embeddings = model.encode(corpus)\n", - "\n", - "print(\"shape of the corpus embeddings:\", corpus_embeddings.shape)\n", - "print(\"data type of the embeddings: \", corpus_embeddings.dtype)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 3. Indexing" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "total number of vectors: 5331\n" - ] - } - ], - "source": [ - "import faiss\n", - "\n", - "# get the length of our embedding vectors, vectors by bge-base-en-v1.5 have length 768\n", - "dim = corpus_embeddings.shape[-1]\n", - "\n", - "# create the faiss index and store the corpus embeddings into the vector space\n", - "index = faiss.index_factory(dim, 'Flat', faiss.METRIC_INNER_PRODUCT)\n", - "corpus_embeddings = corpus_embeddings.astype(np.float32)\n", - "index.train(corpus_embeddings)\n", - "index.add(corpus_embeddings)\n", - "\n", - "print(f\"total number of vectors: {index.ntotal}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 4. 
Retrieval" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": {}, - "outputs": [], - "source": [ - "query_embeddings = model.encode_queries(queries)\n", - "ground_truths = [d[\"positive\"] for d in data]\n", - "corpus = np.asarray(corpus)" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Searching: 100%|██████████| 1/1 [00:00<00:00, 22.35it/s]\n" - ] - } - ], - "source": [ - "from tqdm import tqdm\n", - "\n", - "res_scores, res_ids, res_text = [], [], []\n", - "query_size = len(query_embeddings)\n", - "batch_size = 256\n", - "# The cutoffs we will use during evaluation, and set k to be the maximum of the cutoffs.\n", - "cut_offs = [1, 10]\n", - "k = max(cut_offs)\n", - "\n", - "for i in tqdm(range(0, query_size, batch_size), desc=\"Searching\"):\n", - " q_embedding = query_embeddings[i: min(i+batch_size, query_size)].astype(np.float32)\n", - " # search the top k answers for each of the queries\n", - " score, idx = index.search(q_embedding, k=k)\n", - " res_scores += list(score)\n", - " res_ids += list(idx)\n", - " res_text += list(corpus[idx])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 5. Reranking" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now we will use a reranker to rerank the list of answers we retrieved using our index. Hopefully, this will lead to better results." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The following table lists the available BGE rerankers. Feel free to try out to see their differences!" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "| Model | Language | Parameters | Description | Base Model |\n", - "|:-------|:--------:|:----:|:-----------------:|:--------------------------------------:|\n", - "| [BAAI/bge-reranker-v2-m3](https://huggingface.co/BAAI/bge-reranker-v2-m3) | Multilingual | 568M | a lightweight cross-encoder model, possesses strong multilingual capabilities, easy to deploy, with fast inference. | XLM-RoBERTa-Large |\n", - "| [BAAI/bge-reranker-v2-gemma](https://huggingface.co/BAAI/bge-reranker-v2-gemma) | Multilingual | 2.51B | a cross-encoder model which is suitable for multilingual contexts, performs well in both English proficiency and multilingual capabilities. | Gemma2-2B |\n", - "| [BAAI/bge-reranker-v2-minicpm-layerwise](https://huggingface.co/BAAI/bge-reranker-v2-minicpm-layerwise) | Multilingual | 2.72B | a cross-encoder model which is suitable for multilingual contexts, performs well in both English and Chinese proficiency, allows freedom to select layers for output, facilitating accelerated inference. | MiniCPM |\n", - "| [BAAI/bge-reranker-v2.5-gemma2-lightweight](https://huggingface.co/BAAI/bge-reranker-v2.5-gemma2-lightweight) | Multilingual | 9.24B | a cross-encoder model which is suitable for multilingual contexts, performs well in both English and Chinese proficiency, allows freedom to select layers, compress ratio and compress layers for output, facilitating accelerated inference. 
| Gemma2-9B |\n", - "| [BAAI/bge-reranker-large](https://huggingface.co/BAAI/bge-reranker-large) | Chinese and English | 560M | a cross-encoder model which is more accurate but less efficient | XLM-RoBERTa-Large |\n", - "| [BAAI/bge-reranker-base](https://huggingface.co/BAAI/bge-reranker-base) | Chinese and English | 278M | a cross-encoder model which is more accurate but less efficient | XLM-RoBERTa-Base |" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "First, let's use a small example to see how reranker works:" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[-9.474676132202148, -2.823843240737915, 5.76226806640625]\n" - ] - } - ], - "source": [ - "from FlagEmbedding import FlagReranker\n", - "\n", - "reranker = FlagReranker('BAAI/bge-reranker-large', use_fp16=True) \n", - "# Setting use_fp16 to True speeds up computation with a slight performance degradation\n", - "\n", - "# use the compute_score() function to calculate scores for each input sentence pair\n", - "scores = reranker.compute_score([\n", - " ['what is panda?', 'Today is a sunny day'], \n", - " ['what is panda?', 'The tiger (Panthera tigris) is a member of the genus Panthera and the largest living cat species native to Asia.'],\n", - " ['what is panda?', 'The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China.']\n", - " ])\n", - "print(scores)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now, let's use the reranker to rerank our previously retrieved results:" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "metadata": {}, - "outputs": [], - "source": [ - "new_ids, new_scores, new_text = [], [], []\n", - "for i in range(len(queries)):\n", - " # get the new scores of the previously retrieved results\n", - " new_score = reranker.compute_score([[queries[i], text] for text in res_text[i]])\n", - " # sort the lists of ids and scores by the new scores\n", - " new_id = [tup[1] for tup in sorted(list(zip(new_score, res_ids[i])), reverse=True)]\n", - " new_scores.append(sorted(new_score, reverse=True))\n", - " new_ids.append(new_id)\n", - " new_text.append(corpus[new_id])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 6. Evaluate" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "For details of these metrics, please checkout the tutorial of [evaluation](https://github.com/FlagOpen/FlagEmbedding/tree/master/Tutorials/4_Evaluation)." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 6.1 Recall" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": {}, - "outputs": [], - "source": [ - "def calc_recall(preds, truths, cutoffs):\n", - " recalls = np.zeros(len(cutoffs))\n", - " for text, truth in zip(preds, truths):\n", - " for i, c in enumerate(cutoffs):\n", - " recall = np.intersect1d(truth, text[:c])\n", - " recalls[i] += len(recall) / max(min(len(recall), len(truth)), 1)\n", - " recalls /= len(preds)\n", - " return recalls" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Before reranking:" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "recall@1:\t0.97\n", - "recall@10:\t1.0\n" - ] - } - ], - "source": [ - "recalls_init = calc_recall(res_text, ground_truths, cut_offs)\n", - "for i, c in enumerate(cut_offs):\n", - " print(f\"recall@{c}:\\t{recalls_init[i]}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "After reranking:" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "recall@1:\t0.99\n", - "recall@10:\t1.0\n" - ] - } - ], - "source": [ - "recalls_rerank = calc_recall(new_text, ground_truths, cut_offs)\n", - "for i, c in enumerate(cut_offs):\n", - " print(f\"recall@{c}:\\t{recalls_rerank[i]}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 6.2 MRR" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "metadata": {}, - "outputs": [], - "source": [ - "def MRR(preds, truth, cutoffs):\n", - " mrr = [0 for _ in range(len(cutoffs))]\n", - " for pred, t in zip(preds, truth):\n", - " for i, c in enumerate(cutoffs):\n", - " for j, p in enumerate(pred):\n", - " if j < c and p in t:\n", - " mrr[i] += 1/(j+1)\n", - " break\n", - " mrr = [k/len(preds) for k in mrr]\n", - " return mrr" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Before reranking:" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "MRR@1:\t0.97\n", - "MRR@10:\t0.9825\n" - ] - } - ], - "source": [ - "mrr_init = MRR(res_text, ground_truths, cut_offs)\n", - "for i, c in enumerate(cut_offs):\n", - " print(f\"MRR@{c}:\\t{mrr_init[i]}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "After reranking:" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "MRR@1:\t0.99\n", - "MRR@10:\t0.995\n" - ] - } - ], - "source": [ - "mrr_rerank = MRR(new_text, ground_truths, cut_offs)\n", - "for i, c in enumerate(cut_offs):\n", - " print(f\"MRR@{c}:\\t{mrr_rerank[i]}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 6.3 nDCG" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Before reranking:" - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "nDCG@1: 0.97\n", - "nDCG@10: 0.9869253606521631\n" - ] - } - ], - "source": [ - "from sklearn.metrics import ndcg_score\n", - "\n", - "pred_hard_encodings = []\n", - "for pred, label in zip(res_text, ground_truths):\n", - " pred_hard_encoding = 
list(np.isin(pred, label).astype(int))\n", - " pred_hard_encodings.append(pred_hard_encoding)\n", - "\n", - "for i, c in enumerate(cut_offs):\n", - " nDCG = ndcg_score(pred_hard_encodings, res_scores, k=c)\n", - " print(f\"nDCG@{c}: {nDCG}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "After reranking:" - ] - }, - { - "cell_type": "code", - "execution_count": 33, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "nDCG@1: 0.99\n", - "nDCG@10: 0.9963092975357145\n" - ] - } - ], - "source": [ - "pred_hard_encodings_rerank = []\n", - "for pred, label in zip(new_text, ground_truths):\n", - " pred_hard_encoding = list(np.isin(pred, label).astype(int))\n", - " pred_hard_encodings_rerank.append(pred_hard_encoding)\n", - "\n", - "for i, c in enumerate(cut_offs):\n", - " nDCG = ndcg_score(pred_hard_encodings_rerank, new_scores, k=c)\n", - " print(f\"nDCG@{c}: {nDCG}\")" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "base", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.13" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/docs/source/tutorial/6_RAG.rst b/docs/source/tutorial/6_RAG.rst deleted file mode 100644 index 2f3896df..00000000 --- a/docs/source/tutorial/6_RAG.rst +++ /dev/null @@ -1,11 +0,0 @@ -6. RAG -====== - -.. toctree:: - :hidden: - :maxdepth: 1 - :caption: RAG - - 6_RAG/6.1 - 6_RAG/6.2 - 6_RAG/6.3 \ No newline at end of file diff --git a/docs/source/tutorial/6_RAG/6.1.ipynb b/docs/source/tutorial/6_RAG/6.1.ipynb deleted file mode 100644 index bfa1afea..00000000 --- a/docs/source/tutorial/6_RAG/6.1.ipynb +++ /dev/null @@ -1,327 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Simple RAG From Scratch" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In this tutorial, we will use BGE, Faiss, and OpenAI's GPT-4o-mini to build a simple RAG system from scratch." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 0. Preparation" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Install the required packages in the environment:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%pip install -U numpy faiss-cpu FlagEmbedding openai" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 1. Data" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Suppose I'm a resident of New York Manhattan, and I want the AI bot to provide suggestion on where should I go for dinner. It's not reliable to let it recommend some random restaurant. So let's provide a bunch of our favorate restaurants." - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "corpus = [\n", - " \"Cheli: A downtown Chinese restaurant presents a distinctive dining experience with authentic and sophisticated flavors of Shanghai cuisine. Avg cost: $40-50\",\n", - " \"Masa: Midtown Japanese restaurant with exquisite sushi and omakase experiences crafted by renowned chef Masayoshi Takayama. 
The restaurant offers a luxurious dining atmosphere with a focus on the freshest ingredients and exceptional culinary artistry. Avg cost: $500-600\",\n", - " \"Per Se: A midtown restaurant features daily nine-course tasting menu and a nine-course vegetable tasting menu using classic French technique and the finest quality ingredients available. Avg cost: $300-400\",\n", - " \"Ortomare: A casual, earthy Italian restaurant locates uptown, offering wood-fired pizza, delicious pasta, wine & spirits & outdoor seating. Avg cost: $30-50\",\n", - " \"Banh: Relaxed, narrow restaurant in uptown, offering Vietnamese cuisine & sandwiches, famous for its pho and Vietnam sandwich. Avg cost: $20-30\",\n", - " \"Living Thai: An uptown typical Thai cuisine with different kinds of curry, Tom Yum, fried rice, Thai ice tea, etc. Avg cost: $20-30\",\n", - " \"Chick-fil-A: A Fast food restaurant with great chicken sandwich, fried chicken, fries, and salad, which can be found everywhere in New York. Avg cost: 10-20\",\n", - " \"Joe's Pizza: Most famous New York pizza locates midtown, serving different flavors including classic pepperoni, cheese, spinach, and also innovative pizza. Avg cost: $15-25\",\n", - " \"Red Lobster: In midtown, Red Lobster is a lively chain restaurant serving American seafood standards amid New England-themed decor, with fair price lobsters, shrips and crabs. Avg cost: $30-50\",\n", - " \"Bourbon Steak: It accomplishes all the traditions expected from a steakhouse, offering the finest cuts of premium beef and seafood complimented by wine and spirits program. Avg cost: $100-150\",\n", - " \"Da Long Yi: Locates in downtown, Da Long Yi is a Chinese Szechuan spicy hotpot restaurant that serves good quality meats. Avg cost: $30-50\",\n", - " \"Mitr Thai: An exquisite midtown Thai restaurant with traditional dishes as well as creative dishes, with a wonderful bar serving cocktails. Avg cost: $40-60\",\n", - " \"Yichiran Ramen: Famous Japenese ramen restaurant in both midtown and downtown, serving ramen that can be designed by customers themselves. Avg cost: $20-40\",\n", - " \"BCD Tofu House: Located in midtown, it's famous for its comforting and flavorful soondubu jjigae (soft tofu stew) and a variety of authentic Korean dishes. Avg cost: $30-50\",\n", - "]\n", - "\n", - "user_input = \"I want some Chinese food\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 2. Indexing" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now we need to figure out a fast but powerful enough method to retrieve docs in the corpus that are most closely related to our questions. Indexing is a good choice for us.\n", - "\n", - "The first step is embed each document into a vector. We use bge-base-en-v1.5 as our embedding model." 
- ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [], - "source": [ - "from FlagEmbedding import FlagModel\n", - "\n", - "model = FlagModel('BAAI/bge-base-en-v1.5',\n", - " query_instruction_for_retrieval=\"Represent this sentence for searching relevant passages:\",\n", - " use_fp16=True)\n", - "\n", - "embeddings = model.encode(corpus, convert_to_numpy=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(14, 768)" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "embeddings.shape" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Then, let's create a Faiss index and add all the vectors into it.\n", - "\n", - "If you want to know more about Faiss, refer to the tutorial of [Faiss and indexing](https://github.com/FlagOpen/FlagEmbedding/tree/master/Tutorials/3_Indexing)." - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [], - "source": [ - "import faiss\n", - "import numpy as np\n", - "\n", - "index = faiss.IndexFlatIP(embeddings.shape[1])\n", - "\n", - "index.add(embeddings)" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "14" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "index.ntotal" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 3. Retrieve and Generate" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now we come to the most exciting part. Let's first embed our query and retrieve 3 most relevant document from it:" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([['Cheli: A downtown Chinese restaurant presents a distinctive dining experience with authentic and sophisticated flavors of Shanghai cuisine. Avg cost: $40-50',\n", - " 'Da Long Yi: Locates in downtown, Da Long Yi is a Chinese Szechuan spicy hotpot restaurant that serves good quality meats. Avg cost: $30-50',\n", - " 'Yichiran Ramen: Famous Japenese ramen restaurant in both midtown and downtown, serving ramen that can be designed by customers themselves. Avg cost: $20-40']],\n", - " dtype='\n", - "{context}\n", - "\n", - "\n", - "Question: {input}\n", - "\"\"\"\n", - "\n", - "# Create a prompt template\n", - "prompt = ChatPromptTemplate.from_template(template)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now everything is ready. Assemble them to a chain and let the magic happen!" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [], - "source": [ - "from langchain.chains.combine_documents import create_stuff_documents_chain\n", - "from langchain.chains import create_retrieval_chain\n", - "\n", - "doc_chain = create_stuff_documents_chain(llm, prompt)\n", - "chain = create_retrieval_chain(retriever, doc_chain)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Run the following cell, we can see that the chatbot can answer the question correctly!" 
- ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "M3-Embedding stands for a new embedding model that is distinguished for its versatility in multi-linguality, multi-functionality, and multi-granularity.\n" - ] - } - ], - "source": [ - "response = chain.invoke({\"input\": \"What does M3-Embedding stands for?\"})\n", - "\n", - "# print the answer only\n", - "print(response['answer'])" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "base", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.4" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/docs/source/tutorial/6_RAG/6.3.ipynb b/docs/source/tutorial/6_RAG/6.3.ipynb deleted file mode 100644 index 2defb7c3..00000000 --- a/docs/source/tutorial/6_RAG/6.3.ipynb +++ /dev/null @@ -1,384 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# RAG with LlamaIndex" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "LlamaIndex is a very popular framework to help build connections between data sources and LLMs. It is also a top choice when people would like to build an RAG framework. In this tutorial, we will go through how to use LlamaIndex to aggregate bge-base-en-v1.5 and GPT-4o-mini to an RAG application." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 0. Preparation" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "First install the required packages in the environment." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%pip install llama-index-llms-openai llama-index-embeddings-huggingface llama-index-vector-stores-faiss\n", - "%pip install llama_index " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Then fill the OpenAI API key below:" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "# For openai key\n", - "import os\n", - "os.environ[\"OPENAI_API_KEY\"] = \"YOUR_API_KEY\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "BGE-M3 is a very powerful embedding model, We would like to know what does that 'M3' stands for.\n", - "\n", - "Let's first ask GPT the question:" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "M3-Embedding stands for Multimodal Multiscale Embedding. It is a technique used in machine learning and data analysis to embed high-dimensional data into a lower-dimensional space while preserving the structure and relationships within the data. 
This technique is particularly useful for analyzing complex datasets that contain multiple modalities or scales of information.\n" - ] - } - ], - "source": [ - "from llama_index.llms.openai import OpenAI\n", - "\n", - "# non-streaming\n", - "response = OpenAI().complete(\"What does M3-Embedding stand for?\")\n", - "print(response)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "By checking the description in the GitHub [repo](https://github.com/FlagOpen/FlagEmbedding/tree/master/FlagEmbedding/BGE_M3) of BGE-M3, we are pretty sure that GPT is hallucinating. Let's build an RAG pipeline to solve the problem!" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 1. Data" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "First, download the BGE-M3 [paper](https://arxiv.org/pdf/2402.03216) to a directory, and load it through `SimpleDirectoryReader`. \n", - "\n", - "Note that `SimpleDirectoryReader` can read all the documents under that directory and supports a lot of commonly used [file types](https://docs.llamaindex.ai/en/stable/module_guides/loading/simpledirectoryreader/#supported-file-types)." - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "from llama_index.core import SimpleDirectoryReader\n", - "\n", - "reader = SimpleDirectoryReader(\"data\")\n", - "# reader = SimpleDirectoryReader(\"DIR_TO_FILE\")\n", - "documents = reader.load_data()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The `Settings` object holds the global settings for the RAG pipeline. Its attributes have defaults (OpenAI's GPT and embedding models) and can be modified by users. Large attributes like models are only loaded when they are used.\n", - "\n", - "Here, we set the `node_parser` to `SentenceSplitter()` with our chosen parameters, use the open-source `bge-base-en-v1.5` as our embedding model, and `gpt-4o-mini` as our LLM." - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [], - "source": [ - "from llama_index.core import Settings\n", - "from llama_index.core.node_parser import SentenceSplitter\n", - "from llama_index.embeddings.huggingface import HuggingFaceEmbedding\n", - "from llama_index.llms.openai import OpenAI\n", - "\n", - "# set the parser with parameters\n", - "Settings.node_parser = SentenceSplitter(\n", - " chunk_size=1000, # Maximum size of chunks to return (in tokens)\n", - " chunk_overlap=150, # number of overlapping tokens between chunks\n", - ")\n", - "\n", - "# set the specific embedding model\n", - "Settings.embed_model = HuggingFaceEmbedding(model_name=\"BAAI/bge-base-en-v1.5\")\n", - "\n", - "# set the LLM we want to use\n", - "Settings.llm = OpenAI(model=\"gpt-4o-mini\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 2. Indexing" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Indexing is one of the most important parts of RAG. LlamaIndex integrates with a large number of vector databases. Here we will use Faiss as an example." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "First check the dimension of the embeddings, which we will need to initialize a Faiss index."
- ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "768\n" - ] - } - ], - "source": [ - "embedding = Settings.embed_model.get_text_embedding(\"Hello world\")\n", - "dim = len(embedding)\n", - "print(dim)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Then create the index with Faiss and our documents. Here LlamaIndex help capsulate the Faiss function calls. If you would like to know more about Faiss, refer to the tutorial of [Faiss and indexing](https://github.com/FlagOpen/FlagEmbedding/tree/master/Tutorials/3_Indexing)." - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "import faiss\n", - "from llama_index.vector_stores.faiss import FaissVectorStore\n", - "from llama_index.core import StorageContext, VectorStoreIndex\n", - "\n", - "# init Faiss and create a vector store\n", - "faiss_index = faiss.IndexFlatL2(dim)\n", - "vector_store = FaissVectorStore(faiss_index=faiss_index)\n", - "\n", - "# customize the storage context using our vector store\n", - "storage_context = StorageContext.from_defaults(\n", - " vector_store=vector_store\n", - ")\n", - "\n", - "# use the loaded documents to build the index\n", - "index = VectorStoreIndex.from_documents(\n", - " documents, storage_context=storage_context\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 3. Retrieve and Generate" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "With a well constructed index, we can now build the query engine to accomplish our task:" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "query_engine = index.as_query_engine()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The following cell displays the default prompt template for Q&A in our pipeline:" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Context information is below.\n", - "---------------------\n", - "{context_str}\n", - "---------------------\n", - "Given the context information and not prior knowledge, answer the query.\n", - "Query: {query_str}\n", - "Answer: \n" - ] - } - ], - "source": [ - "# check the default promt template\n", - "prompt_template = query_engine.get_prompts()['response_synthesizer:text_qa_template']\n", - "print(prompt_template.get_template())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "(Optional) You could modify the prompt to match your use cases:" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "You are a Q&A chat bot.\n", - "Use the given context only, answer the question.\n", - "\n", - "\n", - "{context_str}\n", - "\n", - "\n", - "Question: {query_str}\n", - "\n" - ] - } - ], - "source": [ - "from llama_index.core import PromptTemplate\n", - "\n", - "template = \"\"\"\n", - "You are a Q&A chat bot.\n", - "Use the given context only, answer the question.\n", - "\n", - "\n", - "{context_str}\n", - "\n", - "\n", - "Question: {query_str}\n", - "\"\"\"\n", - "\n", - "new_template = PromptTemplate(template)\n", - "query_engine.update_prompts(\n", - " {\"response_synthesizer:text_qa_template\": new_template}\n", - ")\n", - "\n", - 
"prompt_template = query_engine.get_prompts()['response_synthesizer:text_qa_template']\n", - "print(prompt_template.get_template())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Finally, let's see how does the RAG application performs on our query!" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "M3-Embedding stands for Multi-Linguality, Multi-Functionality, and Multi-Granularity.\n" - ] - } - ], - "source": [ - "response = query_engine.query(\"What does M3-Embedding stands for?\")\n", - "print(response)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "test", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.4" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -}