calculating memory metrics for cpu usage (fastai#2411)
* calculating memory metrics for cpu usage
* small fixes
Showing 2 changed files with 62 additions and 0 deletions.
New file: the CpuPeakMemMetric callback (imported in the test below as fastai.callbacks.cpu_mem).

@@ -0,0 +1,42 @@
" Memory profiling callbacks "

import tracemalloc, threading, torch, time
from ..utils.mem import *
from ..basic_train import *
from ..torch_core import *

class CpuPeakMemMetric(LearnerCallback):
    "Callback that measures used and peaked general and CPU memory."

    _order = -20  # Needs to run before the recorder

    def peak_monitor_start(self):
        self.peak_monitoring = True

        # start RAM tracing
        tracemalloc.start()

        # this thread samples RAM usage as long as the current epoch of the fit loop is running
        peak_monitor_thread = threading.Thread(target=self.peak_monitor_func)
        peak_monitor_thread.daemon = True
        peak_monitor_thread.start()

    def peak_monitor_stop(self):
        tracemalloc.stop()
        self.peak_monitoring = False

    def peak_monitor_func(self):
        self.cpu_mem_used_peak = -1
        while True:
            if not self.peak_monitoring: break
            time.sleep(0.001)  # 1msec

    def on_train_begin(self, **kwargs): self.learn.recorder.add_metric_names(['cpu used', 'cpu_peak'])

    def on_epoch_begin(self, **kwargs): self.peak_monitor_start()

    def on_epoch_end(self, last_metrics, **kwargs):
        cpu_used, cpu_peak = list(map(lambda x: float(x / 2 ** 20), tracemalloc.get_traced_memory()))
        self.peak_monitor_stop()
        # The numbers are deltas in MBs (beginning of the epoch and the end)
        return add_metrics(last_metrics, [cpu_used, cpu_peak])
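The per-epoch numbers come straight from Python's standard tracemalloc module: tracemalloc.get_traced_memory() returns a (current, peak) tuple in bytes counted since tracemalloc.start(), and the callback converts both to MB by dividing by 2 ** 20. A minimal standalone sketch of that mechanism (the list allocation is purely illustrative):

import tracemalloc

tracemalloc.start()                                # begin tracing Python heap allocations
data = [0] * 1_000_000                             # illustrative allocation (~8MB of pointers)
del data                                           # current usage drops back, the peak is remembered
current, peak = tracemalloc.get_traced_memory()    # both values are reported in bytes
tracemalloc.stop()
print(f"current: {current / 2 ** 20:.2f}MB  peak: {peak / 2 ** 20:.2f}MB")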
New test file:

@@ -0,0 +1,20 @@
import pytest
from fastai.callbacks.cpu_mem import *
from fastai.gen_doc.doctest import this_tests
from utils.fakes import *
from utils.text import CaptureStdout

@pytest.mark.skip("occasional random failures")
@pytest.mark.cuda
def test_peak_mem_metric():
    learn = fake_learner()
    learn.callbacks.append(CpuPeakMemMetric(learn))
    this_tests(CpuPeakMemMetric)
    with CaptureStdout() as cs:
        learn.fit_one_cycle(3, max_lr=1e-2)
    for s in ['cpu used', 'cpu_peak']:
        assert s in cs.out, f"expecting '{s}' in \n{cs.out}"
    # XXX: needs a better test to assert some numbers here (at least > 0)
    # in epochs 2-3 it shouldn't allocate more general or CPU RAM
    for s in ['0 0']:
        assert s in cs.out, f"expecting '{s}' in \n{cs.out}"
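For reference, outside of the test's fake_learner the callback attaches like other fastai v1 metrics callbacks, via callback_fns. A minimal sketch, assuming this commit's module path and an already-built DataBunch and model (the names data and model are placeholders, not part of this commit):

from fastai.basics import *                            # fastai v1 core: Learner, fit_one_cycle, ...
from fastai.callbacks.cpu_mem import CpuPeakMemMetric

# `data` is assumed to be an existing DataBunch and `model` an nn.Module (placeholders)
learn = Learner(data, model, callback_fns=[CpuPeakMemMetric])
learn.fit_one_cycle(3)  # the metrics table gains 'cpu used' and 'cpu_peak' columns (MB deltas per epoch)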