Skip to content

Commit

Permalink
Polars benchmarks methods (#424)
Browse files Browse the repository at this point in the history
Benchmarks methods for dependencies using alternative dataframe engine polars.

---------

Co-authored-by: Christian Geng <cgeng@audeering.com>
Co-authored-by: Hagen Wierstorf <hwierstorf@audeering.com>
  • Loading branch information
3 people authored Jul 26, 2024
1 parent 245118e commit a821c8c
Show file tree
Hide file tree
Showing 6 changed files with 1,396 additions and 0 deletions.
66 changes: 66 additions & 0 deletions benchmarks/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,72 @@ using `pyarrow` dtypes).
| Dependencies._update_media() | 0.087 | 0.086 | 0.145 |
| Dependencies._update_media_version(10000 files) | 0.011 | 0.011 | 0.020 |

## audb.Dependencies methods using polars

Handling of the dependency table with `pandas`
was further compared to handling it with `polars`,
by reimplementing all methods of `audb.Dependencies` using `polars`.

This benchmark was executed on:

* CPU: 12th Gen Intel Core i7-1255U
* RAM: 15.66 GB
* Hard Drive: KBG5AZNT1T02 LA KIOXIA
* Linux: Ubuntu 22.04.4 LTS
* Python 3.11.9

To run the benchmark execute:

```bash
$ python benchmark-dependencies-methods-polars.py
```

The data were compared to
the results from `benchmark-dependencies-methods.py` using

```bash
python compare_dependency_methods_polars.py
```
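
Both steps require that `benchmark-dependencies-methods.py`
has been run beforehand,
as it creates the test data and the `pandas` results.
The `pandas` column corresponds to
the `pyarrow` column of the table in the previous section.
Note that the `polars` timing of `Dependencies.__call__()`
is not meaningful,
as the call is currently disabled
in `benchmark-dependencies-methods-polars.py`.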


| method | pandas | polars | winner | factor |
|-------------------------------------------------|----------|----------|----------|----------|
| Dependencies.\_\_call__() | 0.000 | 0.000 | polars | 2.667 |
| Dependencies.\_\_contains__(10000 files) | 0.003 | 0.002 | polars | 2.005 |
| Dependencies.\_\_get_item__(10000 files) | 0.648 | 0.013 | polars | 50.382 |
| Dependencies.\_\_len__() | 0.000 | 0.000 | pandas | 1.300 |
| Dependencies.\_\_str__() | 0.004 | 0.000 | polars | 24.677 |
| Dependencies._add_attachment() | 0.171 | 0.104 | polars | 1.645 |
| Dependencies._add_media(10000 files) | 0.073 | 0.008 | polars | 9.589 |
| Dependencies._add_meta() | 0.127 | 0.100 | polars | 1.260 |
| Dependencies._drop() | 0.118 | 0.021 | polars | 5.628 |
| Dependencies._remove() | 0.067 | 0.002 | polars | 39.324 |
| Dependencies._update_media() | 0.142 | 0.066 | polars | 2.148 |
| Dependencies._update_media_version(10000 files) | 0.021 | 0.016 | polars | 1.341 |
| Dependencies.archive(10000 files) | 0.045 | 0.014 | polars | 3.250 |
| Dependencies.archives | 0.145 | 0.151 | pandas | 1.045 |
| Dependencies.attachment_ids | 0.018 | 0.008 | polars | 2.375 |
| Dependencies.attachments | 0.017 | 0.008 | polars | 2.194 |
| Dependencies.bit_depth(10000 files) | 0.029 | 0.014 | polars | 2.031 |
| Dependencies.channels(10000 files) | 0.030 | 0.013 | polars | 2.224 |
| Dependencies.checksum(10000 files) | 0.030 | 0.014 | polars | 2.201 |
| Dependencies.duration(10000 files) | 0.028 | 0.014 | polars | 2.066 |
| Dependencies.files | 0.012 | 0.011 | polars | 1.040 |
| Dependencies.format(10000 files) | 0.033 | 0.014 | polars | 2.345 |
| Dependencies.media | 0.068 | 0.040 | polars | 1.702 |
| Dependencies.removed(10000 files) | 0.029 | 0.014 | polars | 2.118 |
| Dependencies.removed_media | 0.068 | 0.038 | polars | 1.809 |
| Dependencies.sampling_rate(10000 files) | 0.029 | 0.014 | polars | 2.102 |
| Dependencies.table_ids | 0.025 | 0.013 | polars | 1.927 |
| Dependencies.tables | 0.017 | 0.008 | polars | 2.166 |
| Dependencies.type(10000 files) | 0.028 | 0.014 | polars | 2.063 |
| Dependencies.version(10000 files) | 0.032 | 0.013 | polars | 2.372 |


## audb.Dependencies loading/writing to file

Expand Down
297 changes: 297 additions & 0 deletions benchmarks/benchmark-dependencies-methods-polars.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,297 @@
import random
import time

import pandas as pd
import tabulate

import audeer

import audb


random.seed(1)

# Folder from which the test data is loaded
# and to which the benchmark results are written
cache = audeer.mkdir("./cache")

# NOTE(review): ``CACHE_EXT`` and ``PARQUET_SAVE_OPTS``
# are not referenced anywhere else in this script.
# ``CACHE_EXT`` used to be assigned twice ("pkl", then "parquet"),
# only the effective value is kept.
CACHE_EXT: str = "parquet"
PARQUET_SAVE_OPTS: dict = {"engine": "pyarrow"}

# Column labels of the results table.
# Only the polars implementation is benchmarked here,
# the previously listed "string", "object", "pyarrow" entries were disabled.
dtypes: list = ["polars"]


def set_dependency_module():
    r"""Monkeypatch audb to use the polars based dependency module.

    The dtype definitions in ``audb.core.define``
    are replaced by polars dtypes,
    while the original values stay available
    as ``DEPEND_FIELD_DTYPES_PANDAS``
    and ``DEPEND_INDEX_DTYPE_PANDAS``.
    Afterwards ``audb.Dependencies`` points to
    ``dependencies_polars.Dependencies``.

    """
    import polars as pl

    from audb.core import define

    # polars dtypes, assigned by position
    # to the keys of ``define.DEPEND_FIELD_DTYPES``.
    # NOTE(review): the field names in the comments below are assumed
    # from the value tuples passed to ``_add_media()`` in this script,
    # confirm against ``define.DEPEND_FIELD_DTYPES``
    polars_dtypes = [
        pl.datatypes.String,  # archive
        pl.datatypes.Int32,  # bit_depth
        pl.datatypes.Int32,  # channels
        pl.datatypes.String,  # checksum
        pl.datatypes.Float64,  # duration
        pl.datatypes.String,  # format
        pl.datatypes.Int32,  # removed
        pl.datatypes.Int32,  # sampling_rate
        pl.datatypes.Int32,  # type
        pl.datatypes.String,  # version
    ]
    field_dtypes = dict(zip(define.DEPEND_FIELD_DTYPES.keys(), polars_dtypes))

    define.DEPEND_INDEX_COLNAME = "file"
    # Keep the pandas definitions before overwriting them
    define.DEPEND_FIELD_DTYPES_PANDAS = define.DEPEND_FIELD_DTYPES
    define.DEPEND_FIELD_DTYPES = field_dtypes
    define.DEPEND_INDEX_DTYPE_PANDAS = define.DEPEND_INDEX_DTYPE
    define.DEPEND_INDEX_DTYPE = pl.datatypes.Object

    # Imported only after the definitions have been patched,
    # presumably because the module reads them
    import dependencies_polars

    audb.Dependencies = dependencies_polars.Dependencies


# === Benchmark settings ===
file = "file-10.wav"
n_files = 10000

# Table collecting one timing per benchmarked method
results = pd.DataFrame(columns=["polars"]).rename_axis("method")

# === Load dependencies from pickle before monkeypatching ===
data_cache = audeer.path(cache, "df.pkl")
deps = audb.Dependencies()
deps.load(data_cache)

# Store the dependencies as parquet file,
# as the polars load method depends on it
parquet_cache = audeer.path(cache, "df.parquet")
deps.save(parquet_cache)

# === Switch to the polars implementation ===
set_dependency_module()
dtype = "polars"

def _record(method, dtype, func):
    r"""Store the execution time of ``func`` in ``results``.

    Args:
        method: row label of the benchmarked method
        dtype: column label under which the time is stored
        func: callable without arguments
            that executes the code to benchmark

    """
    # perf_counter() is monotonic
    # and offers the highest available resolution,
    # which matters as several benchmarked calls
    # finish in well under a millisecond.
    # The result is still a duration in seconds.
    t0 = time.perf_counter()
    func()
    results.at[method, dtype] = time.perf_counter() - t0


for dtype in dtypes:
    # Load the dependency table with the patched ``audb.Dependencies``
    deps = audb.Dependencies()
    deps.load(parquet_cache)
    _files = deps._df["file"][:n_files].to_list()

    # only string meaningful
    # expected_dtype = pl.String
    # assert deps._df["archive"].dtype == expected_dtype

    # NOTE(review): the actual ``deps()`` call is disabled,
    # so this entry only measures the overhead of the timer itself.
    # Re-enable it with ``lambda: deps()``
    # once the polars implementation supports it.
    _record("Dependencies.__call__()", dtype, lambda: None)

    # Access the index one time.
    # Further calls will be faster
    _ = file in deps

    # ----- Lookups for single files ---------------------------------------
    _record(
        f"Dependencies.__contains__({n_files} files)",
        dtype,
        lambda: [file in deps for file in _files],
    )
    _record(
        f"Dependencies.__get_item__({n_files} files)",
        dtype,
        lambda: [deps[file] for file in _files],
    )
    _record("Dependencies.__len__()", dtype, lambda: len(deps))
    _record("Dependencies.__str__()", dtype, lambda: str(deps))

    # ----- Properties -----------------------------------------------------
    _record("Dependencies.archives", dtype, lambda: deps.archives)
    _record("Dependencies.attachments", dtype, lambda: deps.attachments)
    _record("Dependencies.attachment_ids", dtype, lambda: deps.attachment_ids)
    _record("Dependencies.files", dtype, lambda: deps.files)
    _record("Dependencies.media", dtype, lambda: deps.media)
    _record("Dependencies.removed_media", dtype, lambda: deps.removed_media)
    _record("Dependencies.table_ids", dtype, lambda: deps.table_ids)
    _record("Dependencies.tables", dtype, lambda: deps.tables)

    # ----- Per-file getters -----------------------------------------------
    # Written out one by one to keep the timed expressions unchanged
    _record(
        f"Dependencies.archive({n_files} files)",
        dtype,
        lambda: [deps.archive(file) for file in _files],
    )
    _record(
        f"Dependencies.bit_depth({n_files} files)",
        dtype,
        lambda: [deps.bit_depth(file) for file in _files],
    )
    _record(
        f"Dependencies.channels({n_files} files)",
        dtype,
        lambda: [deps.channels(file) for file in _files],
    )
    _record(
        f"Dependencies.checksum({n_files} files)",
        dtype,
        lambda: [deps.checksum(file) for file in _files],
    )
    _record(
        f"Dependencies.duration({n_files} files)",
        dtype,
        lambda: [deps.duration(file) for file in _files],
    )
    _record(
        f"Dependencies.format({n_files} files)",
        dtype,
        lambda: [deps.format(file) for file in _files],
    )
    _record(
        f"Dependencies.removed({n_files} files)",
        dtype,
        lambda: [deps.removed(file) for file in _files],
    )
    _record(
        f"Dependencies.sampling_rate({n_files} files)",
        dtype,
        lambda: [deps.sampling_rate(file) for file in _files],
    )
    _record(
        f"Dependencies.type({n_files} files)",
        dtype,
        lambda: [deps.type(file) for file in _files],
    )
    _record(
        f"Dependencies.version({n_files} files)",
        dtype,
        lambda: [deps.version(file) for file in _files],
    )

    # ----- Methods modifying the dependency table -------------------------
    # The order matters here,
    # as every call works on the table left behind by the previous one

    # TODO: Reimplement
    _record(
        "Dependencies._add_attachment()",
        dtype,
        lambda: deps._add_attachment("attachment.txt", "1.0.0", "archive", "checksum"),
    )

    # Rows are created outside of the timed call
    values = [
        (
            f"file-new-{n}.wav",  # file
            f"archive-new-{n}",  # archive
            16,  # bit_depth
            1,  # channels
            f"checksum-{n}",  # checksum
            0.4,  # duration
            "wav",  # format
            0,  # removed
            16000,  # sampling_rate
            1,  # type
            "1.0.0",  # version
        )
        for n in range(n_files)
    ]
    _record(
        f"Dependencies._add_media({n_files} files)",
        dtype,
        lambda: deps._add_media(values),
    )

    _record(
        "Dependencies._add_meta()",
        dtype,
        lambda: deps._add_meta("db.new-table.csv", "1.0.0", "archive", "checksum"),
    )

    _record("Dependencies._drop()", dtype, lambda: deps._drop(["file-90000.wav"]))

    _record("Dependencies._remove()", dtype, lambda: deps._remove(file))

    _record("Dependencies._update_media()", dtype, lambda: deps._update_media(values))

    # The list of files is created inside the timed call,
    # as it was before
    _record(
        f"Dependencies._update_media_version({n_files} files)",
        dtype,
        lambda: deps._update_media_version(
            [f"file-{n}.wav" for n in range(n_files)],
            "version",
        ),
    )

# ===== Save results =====
# Stored inside the cache folder,
# presumably to be picked up by the comparison script
fp_results = audeer.path(cache, "results_polars.csv")
results.to_csv(fp_results)

# ===== Print results =====
table = tabulate.tabulate(
    results,
    headers="keys",
    tablefmt="github",
    floatfmt=".3f",
)
print(table)
5 changes: 5 additions & 0 deletions benchmarks/benchmark-dependencies-methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -366,6 +366,11 @@ def astype(df, dtype):
t = time.time() - t0
results.at[method, dtype] = t


# ===== Save results =====
# Write the collected timings as CSV file inside ``cache``
fp_results = audeer.path(cache, "results.csv")
results.to_csv(fp_results)

# ===== Print results =====
# Format the timings as a GitHub style table
# with three decimal places and print it
table = tabulate.tabulate(results, headers="keys", tablefmt="github", floatfmt=".3f")
print(table)
Loading

0 comments on commit a821c8c

Please sign in to comment.