Polars benchmarks methods (#424)

Benchmarks methods for dependencies using alternative dataframe engine polars. --------- Co-authored-by: Christian Geng <cgeng@audeering.com> Co-authored-by: Hagen Wierstorf <hwierstorf@audeering.com>
audeering · Jul 26, 2024 · a821c8c · a821c8c
1 parent 245118e
commit a821c8c
Show file tree

Hide file tree

Showing 6 changed files with 1,396 additions and 0 deletions.
diff --git a/benchmarks/README.md b/benchmarks/README.md
@@ -80,6 +80,72 @@ using `pyarrow` dtypes).
 | Dependencies._update_media()                    |    0.087 |    0.086 |     0.145 |
 | Dependencies._update_media_version(10000 files) |    0.011 |    0.011 |     0.020 |
 
+## audb.Dependencies methods using polars
+
+Handling of the dependency table with `pandas`
+was further compared to handling it with `polars`,
+by reimplementing all methods of `audb.Dependencies` using `polars`.
+
+This benchmark was executed on:
+
+* CPU: 12th Gen Intel Core i7-1255U
+* RAM: 15.66 GB
+* Hard Drive: KBG5AZNT1T02 LA KIOXIA
+* Linux: Ubuntu 22.04.4 LTS
+* Python 3.11.9
+
+To run the benchmark execute:
+
+```bash
+$ python benchmark-dependencies-methods-polars.py
+```
+
+The data were compared to
+the results from `benchmark-dependencies-methods.py` using
+
+```bash
+$ python compare_dependency_methods_polars.py
+```
+Both steps require that `benchmark-dependencies-methods.py`
+has been run previously in order to create the test data
+and results. The values in the `pandas` column are
+based on the `pyarrow` column of the table in the previous
+section.
+
+
+| method                                          |   pandas |   polars | winner   |   factor |
+|-------------------------------------------------|----------|----------|----------|----------|
+| Dependencies.\_\_call__()                         |    0.000 |    0.000 | polars   |    2.667 |
+| Dependencies.\_\_contains__(10000 files)          |    0.003 |    0.002 | polars   |    2.005 |
+| Dependencies.\_\_get_item__(10000 files)          |    0.648 |    0.013 | polars   |   50.382 |
+| Dependencies.\_\_len__()                          |    0.000 |    0.000 | pandas   |    1.300 |
+| Dependencies.\_\_str__()                          |    0.004 |    0.000 | polars   |   24.677 |
+| Dependencies._add_attachment()                  |    0.171 |    0.104 | polars   |    1.645 |
+| Dependencies._add_media(10000 files)            |    0.073 |    0.008 | polars   |    9.589 |
+| Dependencies._add_meta()                        |    0.127 |    0.100 | polars   |    1.260 |
+| Dependencies._drop()                            |    0.118 |    0.021 | polars   |    5.628 |
+| Dependencies._remove()                          |    0.067 |    0.002 | polars   |   39.324 |
+| Dependencies._update_media()                    |    0.142 |    0.066 | polars   |    2.148 |
+| Dependencies._update_media_version(10000 files) |    0.021 |    0.016 | polars   |    1.341 |
+| Dependencies.archive(10000 files)               |    0.045 |    0.014 | polars   |    3.250 |
+| Dependencies.archives                           |    0.145 |    0.151 | pandas   |    1.045 |
+| Dependencies.attachment_ids                     |    0.018 |    0.008 | polars   |    2.375 |
+| Dependencies.attachments                        |    0.017 |    0.008 | polars   |    2.194 |
+| Dependencies.bit_depth(10000 files)             |    0.029 |    0.014 | polars   |    2.031 |
+| Dependencies.channels(10000 files)              |    0.030 |    0.013 | polars   |    2.224 |
+| Dependencies.checksum(10000 files)              |    0.030 |    0.014 | polars   |    2.201 |
+| Dependencies.duration(10000 files)              |    0.028 |    0.014 | polars   |    2.066 |
+| Dependencies.files                              |    0.012 |    0.011 | polars   |    1.040 |
+| Dependencies.format(10000 files)                |    0.033 |    0.014 | polars   |    2.345 |
+| Dependencies.media                              |    0.068 |    0.040 | polars   |    1.702 |
+| Dependencies.removed(10000 files)               |    0.029 |    0.014 | polars   |    2.118 |
+| Dependencies.removed_media                      |    0.068 |    0.038 | polars   |    1.809 |
+| Dependencies.sampling_rate(10000 files)         |    0.029 |    0.014 | polars   |    2.102 |
+| Dependencies.table_ids                          |    0.025 |    0.013 | polars   |    1.927 |
+| Dependencies.tables                             |    0.017 |    0.008 | polars   |    2.166 |
+| Dependencies.type(10000 files)                  |    0.028 |    0.014 | polars   |    2.063 |
+| Dependencies.version(10000 files)               |    0.032 |    0.013 | polars   |    2.372 |
+
 
 ## audb.Dependencies loading/writing to file
 

diff --git a/benchmarks/benchmark-dependencies-methods-polars.py b/benchmarks/benchmark-dependencies-methods-polars.py
@@ -0,0 +1,297 @@
+import random
+import time
+
+import pandas as pd
+import tabulate
+
+import audeer
+
+import audb
+
+
+random.seed(1)  # fixed seed for the random module
+
+cache = audeer.mkdir("./cache")
+
+CACHE_EXT: str = "pkl"  # NOTE(review): immediately overridden by the next line
+CACHE_EXT: str = "parquet"
+PARQUET_SAVE_OPTS: dict = {"engine": "pyarrow"}
+# dtypes : list = ["string", "object", "pyarrow"]
+dtypes: list = [
+    "polars",
+]
+
+
+def set_dependency_module():
+    r"""Monkeypatch audb to use the polars dtypes and the polars Dependencies class."""
+    import polars as pl
+
+    from audb.core import define
+
+    depend_index_colname = "file"
+    depend_index_dtype = pl.datatypes.Object
+    depend_field_dtypes = dict(
+        zip(
+            define.DEPEND_FIELD_DTYPES.keys(),
+            [
+                pl.datatypes.String,
+                pl.datatypes.Int32,
+                pl.datatypes.Int32,
+                pl.datatypes.String,
+                pl.datatypes.Float64,
+                pl.datatypes.String,
+                pl.datatypes.Int32,
+                pl.datatypes.Int32,
+                pl.datatypes.Int32,
+                pl.datatypes.String,
+            ],
+        )
+    )
+
+    audb.core.define.DEPEND_INDEX_COLNAME = depend_index_colname
+    audb.core.define.DEPEND_FIELD_DTYPES_PANDAS = audb.core.define.DEPEND_FIELD_DTYPES
+    audb.core.define.DEPEND_FIELD_DTYPES = depend_field_dtypes
+    audb.core.define.DEPEND_INDEX_DTYPE_PANDAS = audb.core.define.DEPEND_INDEX_DTYPE
+    audb.core.define.DEPEND_INDEX_DTYPE = depend_index_dtype
+
+    import dependencies_polars
+
+    audb.Dependencies = dependencies_polars.Dependencies
+
+
+# === Load dependencies via pickle before monkey-patching ===
+data_cache = audeer.path(cache, "df.pkl")
+deps = audb.Dependencies()
+deps.load(data_cache)
+
+# Save the table in parquet format, as the polars load method depends on it
+parquet_cache = audeer.path(cache, "df.parquet")
+deps.save(parquet_cache)
+
+file = "file-10.wav"
+n_files = 10000
+results = pd.DataFrame(columns=["polars"])
+results.index.name = "method"
+set_dependency_module()  # from here on audb.Dependencies is the polars class
+dtype = "polars"  # NOTE(review): reassigned by the loop over dtypes below
+
+for dtype in dtypes:
+    # Load the dependency table from the parquet cache
+    deps = audb.Dependencies()
+    deps.load(parquet_cache)
+    _files = deps._df["file"][:n_files].to_list()
+
+    # only string meaningful
+    # expected_dtype = pl.String
+
+    # assert deps._df["archive"].dtype == expected_dtype
+
+    method = "Dependencies.__call__()"
+    t0 = time.time()
+    # deps()  NOTE(review): call is disabled, so this timing covers no work
+    t = time.time() - t0
+    results.at[method, dtype] = t
+
+    # Access the index one time.
+    # Further calls will be faster
+    file in deps
+
+    method = f"Dependencies.__contains__({n_files} files)"
+    t0 = time.time()
+    [file in deps for file in _files]
+    t = time.time() - t0
+    results.at[method, dtype] = t
+
+    method = f"Dependencies.__get_item__({n_files} files)"
+    t0 = time.time()
+    [deps[file] for file in _files]
+    t = time.time() - t0
+    results.at[method, dtype] = t
+
+    method = "Dependencies.__len__()"
+    t0 = time.time()
+    len(deps)
+    t = time.time() - t0
+    results.at[method, dtype] = t
+
+    method = "Dependencies.__str__()"
+    t0 = time.time()
+    str(deps)
+    t = time.time() - t0
+    results.at[method, dtype] = t
+
+    method = "Dependencies.archives"
+    t0 = time.time()
+    deps.archives
+    t = time.time() - t0
+    results.at[method, dtype] = t
+
+    method = "Dependencies.attachments"
+    t0 = time.time()
+    deps.attachments
+    t = time.time() - t0
+    results.at[method, dtype] = t
+
+    method = "Dependencies.attachment_ids"
+    t0 = time.time()
+    deps.attachment_ids
+    t = time.time() - t0
+    results.at[method, dtype] = t
+
+    method = "Dependencies.files"
+    t0 = time.time()
+    deps.files
+    t = time.time() - t0
+    results.at[method, dtype] = t
+
+    method = "Dependencies.media"
+    t0 = time.time()
+    deps.media
+    t = time.time() - t0
+    results.at[method, dtype] = t
+
+    method = "Dependencies.removed_media"
+    t0 = time.time()
+    deps.removed_media
+    t = time.time() - t0
+    results.at[method, dtype] = t
+
+    method = "Dependencies.table_ids"
+    t0 = time.time()
+    deps.table_ids
+    t = time.time() - t0
+    results.at[method, dtype] = t
+
+    method = "Dependencies.tables"
+    t0 = time.time()
+    deps.tables
+    t = time.time() - t0
+    results.at[method, dtype] = t
+
+    method = f"Dependencies.archive({n_files} files)"
+    t0 = time.time()
+    [deps.archive(file) for file in _files]
+    t = time.time() - t0
+    results.at[method, dtype] = t
+
+    method = f"Dependencies.bit_depth({n_files} files)"
+    t0 = time.time()
+    [deps.bit_depth(file) for file in _files]
+    t = time.time() - t0
+    results.at[method, dtype] = t
+
+    method = f"Dependencies.channels({n_files} files)"
+    t0 = time.time()
+    [deps.channels(file) for file in _files]
+    t = time.time() - t0
+    results.at[method, dtype] = t
+
+    method = f"Dependencies.checksum({n_files} files)"
+    t0 = time.time()
+    [deps.checksum(file) for file in _files]
+    t = time.time() - t0
+    results.at[method, dtype] = t
+
+    method = f"Dependencies.duration({n_files} files)"
+    t0 = time.time()
+    [deps.duration(file) for file in _files]
+    t = time.time() - t0
+    results.at[method, dtype] = t
+
+    method = f"Dependencies.format({n_files} files)"
+    t0 = time.time()
+    [deps.format(file) for file in _files]
+    t = time.time() - t0
+    results.at[method, dtype] = t
+
+    method = f"Dependencies.removed({n_files} files)"
+    t0 = time.time()
+    [deps.removed(file) for file in _files]
+    t = time.time() - t0
+    results.at[method, dtype] = t
+
+    method = f"Dependencies.sampling_rate({n_files} files)"
+    t0 = time.time()
+    [deps.sampling_rate(file) for file in _files]
+    t = time.time() - t0
+    results.at[method, dtype] = t
+
+    method = f"Dependencies.type({n_files} files)"
+    t0 = time.time()
+    [deps.type(file) for file in _files]
+    t = time.time() - t0
+    results.at[method, dtype] = t
+
+    method = f"Dependencies.version({n_files} files)"
+    t0 = time.time()
+    [deps.version(file) for file in _files]
+    t = time.time() - t0
+    results.at[method, dtype] = t
+
+    # -------------------------------------------------------------------------
+
+    # TODO: Reimplement
+    method = "Dependencies._add_attachment()"
+    t0 = time.time()
+    deps._add_attachment("attachment.txt", "1.0.0", "archive", "checksum")
+    t = time.time() - t0
+    results.at[method, dtype] = t
+
+    method = f"Dependencies._add_media({n_files} files)"
+    values = [
+        (
+            f"file-new-{n}.wav",  # file
+            f"archive-new-{n}",  # archive
+            16,  # bit_depth
+            1,  # channels
+            f"checksum-{n}",  # checksum
+            0.4,  # duration
+            "wav",  # format
+            0,  # removed
+            16000,  # sampling_rate
+            1,  # type
+            "1.0.0",  # version
+        )
+        for n in range(n_files)
+    ]
+    t0 = time.time()
+    deps._add_media(values)
+    t = time.time() - t0
+    results.at[method, dtype] = t
+
+    method = "Dependencies._add_meta()"
+    t0 = time.time()
+    deps._add_meta("db.new-table.csv", "1.0.0", "archive", "checksum")
+    t = time.time() - t0
+    results.at[method, dtype] = t
+
+    method = "Dependencies._drop()"
+    t0 = time.time()
+    deps._drop(["file-90000.wav"])
+    t = time.time() - t0
+    results.at[method, dtype] = t
+
+    method = "Dependencies._remove()"
+    t0 = time.time()
+    deps._remove(file)
+    t = time.time() - t0
+    results.at[method, dtype] = t
+
+    method = "Dependencies._update_media()"
+    t0 = time.time()
+    deps._update_media(values)
+    t = time.time() - t0
+    results.at[method, dtype] = t
+
+    method = f"Dependencies._update_media_version({n_files} files)"
+    t0 = time.time()
+    deps._update_media_version([f"file-{n}.wav" for n in range(n_files)], "version")
+    t = time.time() - t0
+    results.at[method, dtype] = t
+
+# ===== Save and print results =====
+table = tabulate.tabulate(results, headers="keys", tablefmt="github", floatfmt=".3f")
+fp_results = audeer.path(cache, "results_polars.csv")
+results.to_csv(fp_results)
+
+print(table)
diff --git a/benchmarks/benchmark-dependencies-methods.py b/benchmarks/benchmark-dependencies-methods.py
@@ -366,6 +366,11 @@ def astype(df, dtype):
     t = time.time() - t0
     results.at[method, dtype] = t
 
+
+# ===== Save results =====
+fp_results = audeer.path(cache, "results.csv")
+results.to_csv(fp_results)
+
 # ===== Print results =====
 table = tabulate.tabulate(results, headers="keys", tablefmt="github", floatfmt=".3f")
 print(table)