optimize memory view using lttb sample (#776)
Summary:
Enhancement for Issue #760.

Hey guys, I've optimized the speed of the memory view using LTTB sampling (Largest-Triangle-Three-Buckets), which downsamples time-series-like data while retaining the overall shape and variability of the data.

I've tested this with a 2 GB PyTorch profiler trace: the memory view page no longer crashes, and the scaling operation is smooth and quite acceptable to me.
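
For illustration, here is a minimal usage sketch (not part of the commit), assuming the plugin is importable as `torch_tb_profiler` and using made-up curve data in the same {device: [[time, allocated, reserved], ...]} shape the memory view builds:

    from torch_tb_profiler.utils import lttb_sample

    # Toy curve: 100,000 [time, allocated, reserved] points for one made-up device.
    memory_curves = {
        'GPU0': [[t, float(t % 97), float(t % 131)] for t in range(100000)],
    }

    # Downsample each device's curve to at most n_out points (default n_out=10240).
    sampled = lttb_sample(memory_curves)
    print(len(sampled['GPU0']))  # 10240; the first and last points are kept as-is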

Pull Request resolved: #776

Reviewed By: chaekit

Differential Revision: D47850048

Pulled By: aaronenyeshi

fbshipit-source-id: 4d32666f972c7f1b5d18817f69c3266bcb619d92
lh-ycx authored and facebook-github-bot committed Jul 31, 2023
1 parent 465ff4c commit 170d45a
Showing 2 changed files with 77 additions and 1 deletion.
4 changes: 3 additions & 1 deletion tb_plugin/torch_tb_profiler/run.py
@@ -9,7 +9,7 @@
from .profiler.memory_parser import MemoryMetrics, MemoryRecord, MemorySnapshot
from .profiler.module_op import Stats
from .profiler.node import OperatorNode
from .utils import Canonicalizer, DisplayRounder
from .utils import Canonicalizer, DisplayRounder, lttb_sample

logger = utils.get_logger()

@@ -294,6 +294,8 @@ def patch_curves_for_step_plot(curves: Dict[str, List]):
                default_device = dev
                break

        curves = lttb_sample(curves)

        return {
            'metadata': {
                'default_device': default_device,
74 changes: 74 additions & 0 deletions tb_plugin/torch_tb_profiler/utils.py
@@ -7,6 +7,7 @@
import time
from contextlib import contextmanager
from math import pow
import numpy as np

from . import consts

@@ -120,3 +121,76 @@ def timing(description: str, force: bool = False) -> None:
        logger.info(f'{description}: {elapsed_time}')
    else:
        yield


def _areas_of_triangles(a, bs, c):
    """Calculate areas of triangles from duples of vertex coordinates.

    Uses implicit numpy broadcasting along first axis of ``bs``.

    Returns
    -------
    numpy.array
        Array of areas of shape (len(bs),)
    """
    bs_minus_a = bs - a
    a_minus_bs = a - bs
    return 0.5 * abs(
        (a[0] - c[0]) * (bs_minus_a[:, 1]) - (a_minus_bs[:, 0]) * (c[1] - a[1])
    )


def lttb_sample(memory_curves, n_out=10240):
    """Sample ``memory_curves`` down to ``n_out`` points using the LTTB algorithm.

    Parameters
    ----------
    memory_curves : dict(str, list(list(time, allocated, reserved)))
        A dict keyed by device (cpu, gpu0, gpu1, ...); each value is a
        list of [time, allocated, reserved] points.
    n_out : int
        Number of data points to downsample to.

    Returns
    -------
    Sampled memory_curves with at most n_out points per device.
    """
    sampled_memory_curves = {}
    for key in memory_curves:
        data = memory_curves[key]
        length = len(data)
        if n_out >= length:
            sampled_memory_curves[key] = memory_curves[key]
            continue

        # Split data into bins
        n_bins = n_out - 2
        data = np.array(data)
        data_bins = np.array_split(data[1 : length - 1], n_bins)

        # Prepare output array
        # First and last points are the same as in the input.
        out = np.zeros((n_out, 3))
        out[0] = data[0]
        out[len(out) - 1] = data[length - 1]

        # Note that we only need to perform LTTB on (time, allocated).
        # Largest Triangle Three Buckets (LTTB):
        # In each bin, find the point that makes the largest triangle
        # with the point saved in the previous bin
        # and the centroid of the points in the next bin.
        for i in range(len(data_bins)):
            this_bin = data_bins[i]
            if i < n_bins - 1:
                next_bin = data_bins[i + 1]
            else:
                next_bin = data[len(data) - 1 :]
            a = out[i]
            bs = this_bin
            c = next_bin.mean(axis=0)
            areas = _areas_of_triangles(a, bs, c)
            out[i + 1] = bs[np.argmax(areas)]

        sampled_memory_curves[key] = out.tolist()
    return sampled_memory_curves
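
For reference, a small illustrative check (not part of the commit) that the vectorized `_areas_of_triangles` helper agrees with the ordinary shoelace triangle-area formula; the points below are made up, and only the first two coordinates (time, allocated) of each point enter the area:

    import numpy as np

    from torch_tb_profiler.utils import _areas_of_triangles  # assumed import path

    a = np.array([0.0, 0.0])                  # point selected in the previous bin
    bs = np.array([[1.0, 0.0], [3.0, 5.0]])   # candidate points in the current bin
    c = np.array([0.0, 1.0])                  # centroid of the next bin

    def shoelace(a, b, c):
        # 0.5 * |x_a (y_b - y_c) + x_b (y_c - y_a) + x_c (y_a - y_b)|
        return 0.5 * abs(a[0] * (b[1] - c[1]) + b[0] * (c[1] - a[1]) + c[0] * (a[1] - b[1]))

    print(_areas_of_triangles(a, bs, c))    # [0.5 1.5]
    print([shoelace(a, b, c) for b in bs])  # [0.5, 1.5]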
