From 52b75d9154ea6426a9b30ca599a9703d484b126c Mon Sep 17 00:00:00 2001 From: Dmitri Melikyan Date: Wed, 28 Jun 2017 16:34:07 +0200 Subject: [PATCH 01/26] signal handling improvements, more debug logging --- setup.py | 2 +- stackimpact/agent.py | 21 +++++++++++++++----- stackimpact/frame_selector.py | 6 +----- stackimpact/message_queue.py | 2 -- stackimpact/reporters/allocation_reporter.py | 7 +++++++ stackimpact/reporters/block_reporter.py | 10 ++++++---- stackimpact/reporters/cpu_reporter.py | 4 ++++ stackimpact/reporters/error_reporter.py | 2 -- stackimpact/runtime.py | 15 ++++++++++++++ 9 files changed, 50 insertions(+), 19 deletions(-) diff --git a/setup.py b/setup.py index c7c964e..d05bb5a 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ setup( name = 'stackimpact', - version = '1.0.0', + version = '1.0.1', description = 'StackImpact Python Agent', author = 'StackImpact', author_email = 'devops@stackimpact.com', diff --git a/stackimpact/agent.py b/stackimpact/agent.py index 8fd1cce..b32c842 100644 --- a/stackimpact/agent.py +++ b/stackimpact/agent.py @@ -9,8 +9,9 @@ import os import signal import atexit +import platform -from .runtime import min_version, runtime_info +from .runtime import min_version, runtime_info, register_signal from .utils import timestamp, generate_uuid from .config import Config from .config_loader import ConfigLoader @@ -25,7 +26,7 @@ class Agent: - AGENT_VERSION = "1.0.0" + AGENT_VERSION = "1.0.1" SAAS_DASHBOARD_ADDRESS = "https://agent-api.stackimpact.com" def __init__(self, **kwargs): @@ -62,6 +63,9 @@ def start(self, **kwargs): if not min_version(2, 7) and not min_version(3, 4): raise Exception('Supported Python versions 2.6 or highter and 3.4 or higher') + if platform.python_implementation() != 'CPython': + raise Exception('Supported Python interpreter is CPython') + if self.agent_destroyed: self.log('Destroyed agent cannot be started') return @@ -106,10 +110,12 @@ def _signal_handler(signum, frame): except Exception: 
self.exception() - signal.signal(signal.SIGUSR2, _signal_handler) + return True + + register_signal(signal.SIGUSR2, _signal_handler) # destroy agent on exit - def _exit(): + def _exit_handler(*arg): if not self.agent_started or self.agent_destroyed: return @@ -120,7 +126,12 @@ def _exit(): self.exception() - atexit.register(_exit) + atexit.register(_exit_handler) + + register_signal(signal.SIGQUIT, _exit_handler) + register_signal(signal.SIGINT, _exit_handler) + register_signal(signal.SIGTERM, _exit_handler) + register_signal(signal.SIGHUP, _exit_handler) self.agent_started = True self.log('Agent started') diff --git a/stackimpact/frame_selector.py b/stackimpact/frame_selector.py index 521365e..6e9b79f 100644 --- a/stackimpact/frame_selector.py +++ b/stackimpact/frame_selector.py @@ -64,11 +64,7 @@ def start(self): def destroy(self): - self.agent_frame_cache = None - self.system_frame_cache = None - self.http_frame_cache = None - - self.http_frame_regexp = None + pass def is_agent_frame(self, filename): diff --git a/stackimpact/message_queue.py b/stackimpact/message_queue.py index 2cd6698..c36aae2 100644 --- a/stackimpact/message_queue.py +++ b/stackimpact/message_queue.py @@ -25,8 +25,6 @@ def start(self): def destroy(self): - self.queue = [] - if self.flush_timer: self.flush_timer.cancel() self.flush_timer = None diff --git a/stackimpact/reporters/allocation_reporter.py b/stackimpact/reporters/allocation_reporter.py index ea4a2e3..0588c65 100644 --- a/stackimpact/reporters/allocation_reporter.py +++ b/stackimpact/reporters/allocation_reporter.py @@ -63,6 +63,11 @@ def reset(self): def record(self, max_duration): + if self.agent.config.is_profiling_disabled(): + return + + self.agent.log('Activating memory allocation profiler.') + def start(): tracemalloc.start(self.MAX_TRACEBACK_SIZE) self.agent.run_in_main_thread(start) @@ -76,6 +81,8 @@ def start(): if tracemalloc.get_tracemalloc_memory() > self.MAX_MEMORY_OVERHEAD: break + self.agent.log('Deactivating memory 
allocation profiler.') + if tracemalloc.is_tracing(): snapshot = tracemalloc.take_snapshot() self.agent.log('Allocation profiler memory overhead {0} bytes'.format(tracemalloc.get_tracemalloc_memory())) diff --git a/stackimpact/reporters/block_reporter.py b/stackimpact/reporters/block_reporter.py index 917c1b1..75fd661 100644 --- a/stackimpact/reporters/block_reporter.py +++ b/stackimpact/reporters/block_reporter.py @@ -103,10 +103,14 @@ def record(self, duration): if self.agent.config.is_profiling_disabled(): return + self.agent.log('Activating blocking call profiler.') + signal.setitimer(signal.ITIMER_REAL, self.SAMPLING_RATE, self.SAMPLING_RATE) time.sleep(duration) signal.setitimer(signal.ITIMER_REAL, 0) + self.agent.log('Deactivating blocking call profiler.') + self.profile_duration += duration self.agent.log('Time profiler CPU overhead per activity second: {0} seconds'.format(self.block_profile._overhead / self.profile_duration)) @@ -152,7 +156,6 @@ def update_http_profile(self, stack, sample_time): include = False for frame in stack: if self.agent.frame_selector.is_http_frame(frame.filename): - frame._skip = True include = True if include: @@ -160,9 +163,8 @@ def update_http_profile(self, stack, sample_time): current_node.increment(sample_time, 1) for frame in reversed(stack): - if not frame._skip: - current_node = current_node.find_or_add_child(str(frame)) - current_node.increment(sample_time, 1) + current_node = current_node.find_or_add_child(str(frame)) + current_node.increment(sample_time, 1) def report(self): diff --git a/stackimpact/reporters/cpu_reporter.py b/stackimpact/reporters/cpu_reporter.py index 241269d..bdf91a8 100644 --- a/stackimpact/reporters/cpu_reporter.py +++ b/stackimpact/reporters/cpu_reporter.py @@ -86,10 +86,14 @@ def record(self, duration): if self.agent.config.is_profiling_disabled(): return + self.agent.log('Activating CPU profiler.') + signal.setitimer(signal.ITIMER_PROF, self.SAMPLING_RATE, self.SAMPLING_RATE) 
time.sleep(duration) signal.setitimer(signal.ITIMER_PROF, 0) + self.agent.log('Deactivating CPU profiler.') + self.profile_duration += duration self.agent.log('CPU profiler CPU overhead per activity second: {0} seconds'.format(self.profile._overhead / self.profile_duration)) diff --git a/stackimpact/reporters/error_reporter.py b/stackimpact/reporters/error_reporter.py index 39dfba3..47f1bf0 100644 --- a/stackimpact/reporters/error_reporter.py +++ b/stackimpact/reporters/error_reporter.py @@ -53,8 +53,6 @@ def destroy(self): unpatch(sys, 'exc_info') - self.reset_profile() - if self.process_timer: self.process_timer.cancel() self.process_timer = None diff --git a/stackimpact/runtime.py b/stackimpact/runtime.py index 0f040a9..c8ef979 100644 --- a/stackimpact/runtime.py +++ b/stackimpact/runtime.py @@ -4,6 +4,7 @@ import resource import re import os +import signal from functools import wraps @@ -117,3 +118,17 @@ def unpatch(obj, func_name): return setattr(obj, func_name, getattr(wrapper, '__stackimpact_orig__')) + + +def register_signal(signal_number, handler_func): + prev_handler = signal.SIG_IGN + def _handler(signum, frame): + skip_prev = handler_func(signum, frame) + + if not skip_prev and prev_handler not in [signal.SIG_IGN, signal.SIG_DFL] and callable(prev_handler): + prev_handler(signum, frame) + + prev_handler = signal.signal(signal_number, signal.SIG_IGN) + if prev_handler != signal.SIG_IGN: + signal.signal(signal_number, _handler) + From 31341d3e29025b273b4adca43db9bbffea2b3232 Mon Sep 17 00:00:00 2001 From: Dmitri Melikyan Date: Thu, 29 Jun 2017 12:09:34 +0200 Subject: [PATCH 02/26] default signal fix --- setup.py | 2 +- stackimpact/agent.py | 10 +++++----- stackimpact/runtime.py | 19 +++++++++++-------- 3 files changed, 17 insertions(+), 14 deletions(-) diff --git a/setup.py b/setup.py index d05bb5a..8e829f9 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ setup( name = 'stackimpact', - version = '1.0.1', + version = '1.0.2', description = 'StackImpact 
Python Agent', author = 'StackImpact', author_email = 'devops@stackimpact.com', diff --git a/stackimpact/agent.py b/stackimpact/agent.py index b32c842..4f846ac 100644 --- a/stackimpact/agent.py +++ b/stackimpact/agent.py @@ -26,7 +26,7 @@ class Agent: - AGENT_VERSION = "1.0.1" + AGENT_VERSION = "1.0.2" SAAS_DASHBOARD_ADDRESS = "https://agent-api.stackimpact.com" def __init__(self, **kwargs): @@ -128,10 +128,10 @@ def _exit_handler(*arg): atexit.register(_exit_handler) - register_signal(signal.SIGQUIT, _exit_handler) - register_signal(signal.SIGINT, _exit_handler) - register_signal(signal.SIGTERM, _exit_handler) - register_signal(signal.SIGHUP, _exit_handler) + register_signal(signal.SIGQUIT, _exit_handler, ignore_default = False) + register_signal(signal.SIGINT, _exit_handler, ignore_default = False) + register_signal(signal.SIGTERM, _exit_handler, ignore_default = False) + register_signal(signal.SIGHUP, _exit_handler, ignore_default = False) self.agent_started = True self.log('Agent started') diff --git a/stackimpact/runtime.py b/stackimpact/runtime.py index c8ef979..94db1d8 100644 --- a/stackimpact/runtime.py +++ b/stackimpact/runtime.py @@ -120,15 +120,18 @@ def unpatch(obj, func_name): setattr(obj, func_name, getattr(wrapper, '__stackimpact_orig__')) -def register_signal(signal_number, handler_func): - prev_handler = signal.SIG_IGN +def register_signal(signal_number, handler_func, ignore_default = True): + prev_handler = None + def _handler(signum, frame): skip_prev = handler_func(signum, frame) - if not skip_prev and prev_handler not in [signal.SIG_IGN, signal.SIG_DFL] and callable(prev_handler): - prev_handler(signum, frame) - - prev_handler = signal.signal(signal_number, signal.SIG_IGN) - if prev_handler != signal.SIG_IGN: - signal.signal(signal_number, _handler) + if not skip_prev: + if callable(prev_handler): + prev_handler(signum, frame) + elif prev_handler == signal.SIG_DFL and not ignore_default: + signal.signal(signum, signal.SIG_DFL) + 
os.kill(os.getpid(), signum) + prev_handler = signal.signal(signal_number, _handler) + From 68847ac8aba2a71c7cca15c12cd90bb917098fce Mon Sep 17 00:00:00 2001 From: Dmitri Melikyan Date: Fri, 30 Jun 2017 12:24:23 +0200 Subject: [PATCH 03/26] cleanup local variables in signal handler --- setup.py | 3 +- stackimpact/agent.py | 2 +- stackimpact/reporters/block_reporter.py | 49 +++++++++++++++---------- stackimpact/reporters/cpu_reporter.py | 25 +++++++++---- tests/runtime_test.py | 44 ++++++++++++++++++++++ 5 files changed, 94 insertions(+), 29 deletions(-) create mode 100644 tests/runtime_test.py diff --git a/setup.py b/setup.py index 8e829f9..477cc6e 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ setup( name = 'stackimpact', - version = '1.0.2', + version = '1.0.3', description = 'StackImpact Python Agent', author = 'StackImpact', author_email = 'devops@stackimpact.com', @@ -29,6 +29,7 @@ 'Programming Language :: Python :: 3.4', 'Programming Language :: Python :: 3.5', 'Programming Language :: Python :: 3.6', + 'Programming Language :: Python :: Implementation :: CPython', 'Topic :: Software Development', 'Topic :: System :: Monitoring', ], diff --git a/stackimpact/agent.py b/stackimpact/agent.py index 4f846ac..cc2a653 100644 --- a/stackimpact/agent.py +++ b/stackimpact/agent.py @@ -26,7 +26,7 @@ class Agent: - AGENT_VERSION = "1.0.2" + AGENT_VERSION = "1.0.3" SAAS_DASHBOARD_ADDRESS = "https://agent-api.stackimpact.com" def __init__(self, **kwargs): diff --git a/stackimpact/reporters/block_reporter.py b/stackimpact/reporters/block_reporter.py index 75fd661..e3925d2 100644 --- a/stackimpact/reporters/block_reporter.py +++ b/stackimpact/reporters/block_reporter.py @@ -38,7 +38,7 @@ def start(self): return if not runtime_info.OS_LINUX and not runtime_info.OS_DARWIN: - self.agent.log('Time profiler is only supported on Linux and OS X.') + self.agent.log('Block profiler is only supported on Linux and OS X.') return sample_time = self.SAMPLING_RATE * 1000 @@ -56,20 
+56,8 @@ def _sample(signum, signal_frame): with self.profile_lock: try: - if self.block_profile: - start = time.clock() - - current_frames = sys._current_frames() - for thread_id, thread_frame in current_frames.items(): - if thread_id == main_thread_id: - thread_frame = signal_frame - - stack = self.recover_stack(thread_frame) - if stack: - self.update_block_profile(stack, sample_time) - self.update_http_profile(stack, sample_time) - - self.block_profile._overhead += (time.clock() - start) + self.process_sample(signal_frame, sample_time, main_thread_id) + signal_frame = None except Exception: self.agent.exception() @@ -86,7 +74,8 @@ def destroy(self): if self.agent.get_option('block_profiler_disabled'): return - if self.prev_signal_handler: + if self.prev_signal_handler != None: + signal.setitimer(signal.ITIMER_REAL, 0) signal.signal(signal.SIGALRM, self.prev_signal_handler) if self.profiler_scheduler: @@ -103,17 +92,39 @@ def record(self, duration): if self.agent.config.is_profiling_disabled(): return - self.agent.log('Activating blocking call profiler.') + self.agent.log('Activating block profiler.') signal.setitimer(signal.ITIMER_REAL, self.SAMPLING_RATE, self.SAMPLING_RATE) time.sleep(duration) signal.setitimer(signal.ITIMER_REAL, 0) - self.agent.log('Deactivating blocking call profiler.') + self.agent.log('Deactivating block profiler.') self.profile_duration += duration - self.agent.log('Time profiler CPU overhead per activity second: {0} seconds'.format(self.block_profile._overhead / self.profile_duration)) + self.agent.log('Block profiler CPU overhead per activity second: {0} seconds'.format(self.block_profile._overhead / self.profile_duration)) + + + def process_sample(self, signal_frame, sample_time, main_thread_id): + if self.block_profile: + start = time.clock() + + current_frames = sys._current_frames() + for thread_id, thread_frame in current_frames.items(): + if thread_id == main_thread_id: + thread_frame = signal_frame + + stack = 
self.recover_stack(thread_frame) + if stack: + self.update_block_profile(stack, sample_time) + self.update_http_profile(stack, sample_time) + + thread_id, thread_frame, stack = None, None, None + + current_frames = None + + self.block_profile._overhead += (time.clock() - start) + def recover_stack(self, thread_frame): diff --git a/stackimpact/reporters/cpu_reporter.py b/stackimpact/reporters/cpu_reporter.py index bdf91a8..f405e83 100644 --- a/stackimpact/reporters/cpu_reporter.py +++ b/stackimpact/reporters/cpu_reporter.py @@ -45,13 +45,8 @@ def _sample(signum, signal_frame): with self.profile_lock: try: - if self.profile: - start = time.clock() - if signal_frame: - stack = self.recover_stack(signal_frame) - if stack: - self.update_profile(self.profile, stack) - self.profile._overhead += (time.clock() - start) + self.process_sample(signal_frame) + signal_frame = None except Exception: self.agent.exception() @@ -69,7 +64,8 @@ def destroy(self): if self.agent.get_option('cpu_profiler_disabled'): return - if self.prev_signal_handler: + if self.prev_signal_handler != None: + signal.setitimer(signal.ITIMER_PROF, 0) signal.signal(signal.SIGPROF, self.prev_signal_handler) if self.profiler_scheduler: @@ -99,6 +95,19 @@ def record(self, duration): self.agent.log('CPU profiler CPU overhead per activity second: {0} seconds'.format(self.profile._overhead / self.profile_duration)) + def process_sample(self, signal_frame): + if self.profile: + start = time.clock() + if signal_frame: + stack = self.recover_stack(signal_frame) + if stack: + self.update_profile(self.profile, stack) + + stack = None + + self.profile._overhead += (time.clock() - start) + + def recover_stack(self, signal_frame): stack = [] diff --git a/tests/runtime_test.py b/tests/runtime_test.py new file mode 100644 index 0000000..0fc5b94 --- /dev/null +++ b/tests/runtime_test.py @@ -0,0 +1,44 @@ +import unittest +import signal +import os + +import stackimpact +from stackimpact.runtime import register_signal + + 
+class RuntimeTestCase(unittest.TestCase): + + def test_register_signal(self): + result = {'handler': 0} + + def _handler(signum, frame): + result['handler'] += 1 + + register_signal(signal.SIGUSR1, _handler) + + os.kill(os.getpid(), signal.SIGUSR1) + os.kill(os.getpid(), signal.SIGUSR1) + + signal.signal(signal.SIGUSR1, signal.SIG_DFL) + + self.assertEqual(result['handler'], 2) + + + '''def test_register_signal_default(self): + result = {'handler': 0} + + def _handler(signum, frame): + result['handler'] += 1 + + register_signal(signal.SIGUSR1, _handler, ignore_default = False) + + os.kill(os.getpid(), signal.SIGUSR1) + os.kill(os.getpid(), signal.SIGUSR1) + + self.assertEqual(result['handler'], 1)''' + + +if __name__ == '__main__': + unittest.main() + + From 540a8407d9ce3cb440b64a17e7ea7d06a42aab21 Mon Sep 17 00:00:00 2001 From: Dmitri Melikyan Date: Tue, 4 Jul 2017 09:25:41 +0200 Subject: [PATCH 04/26] readme update, resource import fix on windows --- README.md | 5 +++- README.rst | 32 +++++++++++++++++-------- stackimpact/reporters/block_reporter.py | 4 +++- stackimpact/runtime.py | 5 +++- 4 files changed, 33 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index f94f790..2a614b3 100644 --- a/README.md +++ b/README.md @@ -28,8 +28,9 @@ See full [documentation](https://stackimpact.com/docs/) for reference. * Linux, OS X or Windows. Python version 2.7, 3.4 or higher. * Memory allocation profiler and some GC metrics are only available for Python 3. -* CPU and Time profilers only support Linux and OS X. +* Profilers only support Linux and OS X. * Time (blocking call) profiler supports threads and gevent. +* On unix systems the profilers use the following signals: SIGPROF, SIGALRM, SIGUSR2. Only SIGUSR2 is handled transparently, i.e. it should not conflict with previousely registered handlers. ## Getting started @@ -73,6 +74,8 @@ Other initialization options: * `app_environment` (Optional) Used to differentiate applications in different environments. 
* `host_name` (Optional) By default, host name will be the OS hostname. * `debug` (Optional) Enables debug logging. +* `cpu_profiler_disabled`, `allocation_profiler_disabled`, `block_profiler_disabled`, `error_profiler_disabled` (Optional) Disables respective profiler when `True`. +* `include_agent_frames`, `include_system_frames` (Optional) Set to `True` to not exclude agent and/or system stack frames from profiles. diff --git a/README.rst b/README.rst index b3bb759..f7bcec1 100644 --- a/README.rst +++ b/README.rst @@ -37,14 +37,17 @@ Documentation See full `documentation `__ for reference. -Requirements ------------- +Supported environment +--------------------- - Linux, OS X or Windows. Python version 2.7, 3.4 or higher. -- Memorly allocation profiler and some GC metrics are only available - for Python 3. -- CPU and Time profilers only supports Linux and OS X. +- Memory allocation profiler and some GC metrics are only available for + Python 3. +- Profilers only support Linux and OS X. - Time (blocking call) profiler supports threads and gevent. +- On unix systems the profilers use the following signals: SIGPROF, + SIGALRM, SIGUSR2. Only SIGUSR2 is handled transparently, i.e. it + should not conflict with previousely registered handlers. Getting started --------------- @@ -58,7 +61,7 @@ Sign up for a free account at Installing the agent ^^^^^^^^^^^^^^^^^^^^ -Install the Go agent by running +Install the Python agent by running :: @@ -81,7 +84,10 @@ Configuration section. agent = stackimpact.start( agent_key = 'agent key here', - app_name = 'MyPythonApp', + app_name = 'MyPythonApp') + +Add the agent initialization to the worker code, e.g. wsgi.py, if +applicable. Other initialization options: @@ -92,13 +98,19 @@ Other initialization options: - ``host_name`` (Optional) By default, host name will be the OS hostname. - ``debug`` (Optional) Enables debug logging. 
+- ``cpu_profiler_disabled``, ``allocation_profiler_disabled``, + ``block_profiler_disabled``, ``error_profiler_disabled`` (Optional) + Disables respective profiler when ``True``. +- ``include_agent_frames``, ``include_system_frames`` (Optional) Set to + ``True`` to not exclude agent and/or system stack frames from + profiles. Analyzing performance data in the Dashboard ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Once your application is restarted, you can start observing regular and -anomaly-triggered CPU, memory, I/O, and other hot spot profiles, -execution bottlenecks as well as process metrics in the +Once your application is restarted, you can start observing continuous +CPU, memory, I/O, and other hot spot profiles, execution bottlenecks as +well as process metrics in the `Dashboard `__. Troubleshooting diff --git a/stackimpact/reporters/block_reporter.py b/stackimpact/reporters/block_reporter.py index e3925d2..abac140 100644 --- a/stackimpact/reporters/block_reporter.py +++ b/stackimpact/reporters/block_reporter.py @@ -110,7 +110,8 @@ def process_sample(self, signal_frame, sample_time, main_thread_id): start = time.clock() current_frames = sys._current_frames() - for thread_id, thread_frame in current_frames.items(): + items = current_frames.items() + for thread_id, thread_frame in items: if thread_id == main_thread_id: thread_frame = signal_frame @@ -121,6 +122,7 @@ def process_sample(self, signal_frame, sample_time, main_thread_id): thread_id, thread_frame, stack = None, None, None + items = None current_frames = None self.block_profile._overhead += (time.clock() - start) diff --git a/stackimpact/runtime.py b/stackimpact/runtime.py index 94db1d8..54b8124 100644 --- a/stackimpact/runtime.py +++ b/stackimpact/runtime.py @@ -6,7 +6,10 @@ import os import signal from functools import wraps - +try: + import resource +except ImportError: + pass class runtime_info: From 55ffd6746b5301437f5be0e17c782d6b97c185d8 Mon Sep 17 00:00:00 2001 From: Dmitri Melikyan Date: 
Thu, 6 Jul 2017 14:18:10 +0200 Subject: [PATCH 05/26] readme update --- README.md | 2 +- stackimpact/runtime.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/README.md b/README.md index 2a614b3..d1a079e 100644 --- a/README.md +++ b/README.md @@ -38,7 +38,7 @@ See full [documentation](https://stackimpact.com/docs/) for reference. #### Create StackImpact account -Sign up for a free account at [stackimpact.com](https://stackimpact.com/). +[Sign up](https://dashboard.stackimpact.com/#/signup) for a free account (also with GitHub login). #### Installing the agent diff --git a/stackimpact/runtime.py b/stackimpact/runtime.py index 54b8124..4854c5b 100644 --- a/stackimpact/runtime.py +++ b/stackimpact/runtime.py @@ -1,7 +1,6 @@ import time import sys -import resource import re import os import signal From c87a46987fa728c7b2767b049fc34a7d54f458e2 Mon Sep 17 00:00:00 2001 From: Dmitri Melikyan Date: Wed, 12 Jul 2017 12:04:56 +0200 Subject: [PATCH 06/26] readme update --- README.md | 4 +++- README.rst | 10 +++++++--- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index d1a079e..0447635 100644 --- a/README.md +++ b/README.md @@ -68,8 +68,10 @@ agent = stackimpact.start( Add the agent initialization to the worker code, e.g. wsgi.py, if applicable. -Other initialization options: +All initialization options: +* `agent_key` (Required) The access key for communication with the StackImpact servers. +* `app_name` (Required) A name to identify and group application data. Typically, a single codebase corresponds to one application. * `app_version` (Optional) Sets application version, which can be used to associate profiling information with the source code release. * `app_environment` (Optional) Used to differentiate applications in different environments. * `host_name` (Optional) By default, host name will be the OS hostname. 
diff --git a/README.rst b/README.rst index f7bcec1..8248249 100644 --- a/README.rst +++ b/README.rst @@ -55,8 +55,8 @@ Getting started Create StackImpact account ^^^^^^^^^^^^^^^^^^^^^^^^^^ -Sign up for a free account at -`stackimpact.com `__. +`Sign up `__ for a free +account (also with GitHub login). Installing the agent ^^^^^^^^^^^^^^^^^^^^ @@ -89,8 +89,12 @@ Configuration section. Add the agent initialization to the worker code, e.g. wsgi.py, if applicable. -Other initialization options: +All initialization options: +- ``agent_key`` (Required) The access key for communication with + StackImpact servers. +- ``app_name`` (Required) A name to identify and group application + data. Typically, single codebase corresponds to one application. - ``app_version`` (Optional) Sets application version, which can be used to associate profiling information with the source code release. - ``app_environment`` (Optional) Used to differentiate applications in From b32bf14f5c80944e0e34937e47ee4356751042c9 Mon Sep 17 00:00:00 2001 From: Dmitri Melikyan Date: Tue, 18 Jul 2017 09:27:27 +0200 Subject: [PATCH 07/26] call graph filtering --- setup.py | 2 +- stackimpact/agent.py | 2 +- stackimpact/reporters/allocation_reporter.py | 1 + stackimpact/reporters/block_reporter.py | 2 ++ stackimpact/reporters/cpu_reporter.py | 2 ++ 5 files changed, 7 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 477cc6e..ba30c81 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ setup( name = 'stackimpact', - version = '1.0.3', + version = '1.0.4', description = 'StackImpact Python Agent', author = 'StackImpact', author_email = 'devops@stackimpact.com', diff --git a/stackimpact/agent.py b/stackimpact/agent.py index cc2a653..94a1d8a 100644 --- a/stackimpact/agent.py +++ b/stackimpact/agent.py @@ -26,7 +26,7 @@ class Agent: - AGENT_VERSION = "1.0.3" + AGENT_VERSION = "1.0.4" SAAS_DASHBOARD_ADDRESS = "https://agent-api.stackimpact.com" def __init__(self, **kwargs): diff --git 
a/stackimpact/reporters/allocation_reporter.py b/stackimpact/reporters/allocation_reporter.py index 0588c65..cd2734e 100644 --- a/stackimpact/reporters/allocation_reporter.py +++ b/stackimpact/reporters/allocation_reporter.py @@ -130,6 +130,7 @@ def report(self): return self.profile.normalize(self.profile_duration) + self.profile.filter(2, 1000, float("inf")) metric = Metric(self.agent, Metric.TYPE_PROFILE, Metric.CATEGORY_MEMORY_PROFILE, Metric.NAME_UNCOLLECTED_ALLOCATIONS, Metric.UNIT_BYTE) measurement = metric.create_measurement(Metric.TRIGGER_TIMER, self.profile.measurement, 1, self.profile) diff --git a/stackimpact/reporters/block_reporter.py b/stackimpact/reporters/block_reporter.py index abac140..7ea434a 100644 --- a/stackimpact/reporters/block_reporter.py +++ b/stackimpact/reporters/block_reporter.py @@ -189,6 +189,7 @@ def report(self): with self.profile_lock: self.block_profile.normalize(self.profile_duration) + self.block_profile.filter(2, 1, float("inf")) metric = Metric(self.agent, Metric.TYPE_PROFILE, Metric.CATEGORY_BLOCK_PROFILE, Metric.NAME_BLOCKING_CALL_TIMES, Metric.UNIT_MILLISECOND) measurement = metric.create_measurement(Metric.TRIGGER_TIMER, self.block_profile.measurement, 1, self.block_profile) @@ -197,6 +198,7 @@ def report(self): if self.block_profile.num_samples > 0 and self.http_profile.num_samples > 0: self.http_profile.normalize(self.profile_duration) self.http_profile.convert_to_percent(self.block_profile.measurement) + self.block_profile.filter(2, 1, 100) metric = Metric(self.agent, Metric.TYPE_PROFILE, Metric.CATEGORY_HTTP_TRACE, Metric.NAME_HTTP_TRANSACTION_BREAKDOWN, Metric.UNIT_PERCENT) measurement = metric.create_measurement(Metric.TRIGGER_TIMER, self.http_profile.measurement, None, self.http_profile) diff --git a/stackimpact/reporters/cpu_reporter.py b/stackimpact/reporters/cpu_reporter.py index f405e83..2972037 100644 --- a/stackimpact/reporters/cpu_reporter.py +++ b/stackimpact/reporters/cpu_reporter.py @@ -154,6 +154,8 @@ def 
report(self): with self.profile_lock: self.profile.evaluate_percent(self.profile_duration / self.SAMPLING_RATE) + self.profile.filter(2, 1, 100) + metric = Metric(self.agent, Metric.TYPE_PROFILE, Metric.CATEGORY_CPU_PROFILE, Metric.NAME_MAIN_THREAD_CPU_USAGE, Metric.UNIT_PERCENT) measurement = metric.create_measurement(Metric.TRIGGER_TIMER, self.profile.measurement, None, self.profile) self.agent.message_queue.add('metric', metric.to_dict()) From 9b15b18da735c9d2c359610006c8149df8d9a9de Mon Sep 17 00:00:00 2001 From: Dmitri Melikyan Date: Fri, 18 Aug 2017 17:37:52 +0200 Subject: [PATCH 08/26] auto_destroy option --- README.md | 3 +- README.rst | 8 +++-- setup.py | 2 +- stackimpact/agent.py | 35 ++++++++++---------- stackimpact/api_request.py | 5 +-- stackimpact/metric.py | 7 ++++ stackimpact/reporters/allocation_reporter.py | 1 + stackimpact/reporters/block_reporter.py | 1 + stackimpact/reporters/process_reporter.py | 2 +- stackimpact/runtime.py | 6 ++-- tests/agent_test.py | 2 +- tests/runtime_test.py | 2 +- tests/test.py | 29 ---------------- tests/test_server.py | 10 ++++-- 14 files changed, 54 insertions(+), 59 deletions(-) delete mode 100644 tests/test.py diff --git a/README.md b/README.md index 0447635..a646bf6 100644 --- a/README.md +++ b/README.md @@ -71,13 +71,14 @@ Add the agent initialization to the worker code, e.g. wsgi.py, if applicable. All initialization options: * `agent_key` (Required) The access key for communication with the StackImpact servers. -* `app_name` (Required) A name to identify and group application data. Typically, a single codebase corresponds to one application. +* `app_name` (Required) A name to identify and group application data. Typically, a single codebase, deployable unit or executable module corresponds to one application. * `app_version` (Optional) Sets application version, which can be used to associate profiling information with the source code release. 
* `app_environment` (Optional) Used to differentiate applications in different environments. * `host_name` (Optional) By default, host name will be the OS hostname. * `debug` (Optional) Enables debug logging. * `cpu_profiler_disabled`, `allocation_profiler_disabled`, `block_profiler_disabled`, `error_profiler_disabled` (Optional) Disables respective profiler when `True`. * `include_agent_frames`, `include_system_frames` (Optional) Set to `True` to not exclude agent and/or system stack frames from profiles. +* `auto_destroy` (Optional) Set to `False` to disable agent's exit handlers. If necessary, call `destroy()` to gracefully shutdown the agent. diff --git a/README.rst b/README.rst index 8248249..2bb668b 100644 --- a/README.rst +++ b/README.rst @@ -91,10 +91,11 @@ applicable. All initialization options: -- ``agent_key`` (Required) The access key for communication with +- ``agent_key`` (Required) The access key for communication with the StackImpact servers. - ``app_name`` (Required) A name to identify and group application - data. Typically, single codebase corresponds to one application. + data. Typically, a single codebase, deployable unit or executable + module corresponds to one application. - ``app_version`` (Optional) Sets application version, which can be used to associate profiling information with the source code release. - ``app_environment`` (Optional) Used to differentiate applications in @@ -108,6 +109,9 @@ All initialization options: - ``include_agent_frames``, ``include_system_frames`` (Optional) Set to ``True`` to not exclude agent and/or system stack frames from profiles. +- ``auto_destroy`` (Optional) Set to ``False`` to disable agent's exit + handlers. If necessary, call ``destroy()`` to gracefully shutdown the + agent. 
Analyzing performance data in the Dashboard ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/setup.py b/setup.py index ba30c81..a8b5c4a 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ setup( name = 'stackimpact', - version = '1.0.4', + version = '1.0.5', description = 'StackImpact Python Agent', author = 'StackImpact', author_email = 'devops@stackimpact.com', diff --git a/stackimpact/agent.py b/stackimpact/agent.py index 94a1d8a..085b718 100644 --- a/stackimpact/agent.py +++ b/stackimpact/agent.py @@ -26,7 +26,7 @@ class Agent: - AGENT_VERSION = "1.0.4" + AGENT_VERSION = "1.0.5" SAAS_DASHBOARD_ADDRESS = "https://agent-api.stackimpact.com" def __init__(self, **kwargs): @@ -61,7 +61,7 @@ def get_option(self, name, default_val = None): def start(self, **kwargs): if not min_version(2, 7) and not min_version(3, 4): - raise Exception('Supported Python versions 2.6 or highter and 3.4 or higher') + raise Exception('Supported Python versions 2.6 or higher and 3.4 or higher') if platform.python_implementation() != 'CPython': raise Exception('Supported Python interpreter is CPython') @@ -114,24 +114,26 @@ def _signal_handler(signum, frame): register_signal(signal.SIGUSR2, _signal_handler) - # destroy agent on exit - def _exit_handler(*arg): - if not self.agent_started or self.agent_destroyed: - return + if self.get_option('auto_destroy') == False: + # destroy agent on exit + def _exit_handler(*arg): + if not self.agent_started or self.agent_destroyed: + return + + try: + self.message_queue.flush() + self.destroy() + except Exception: + self.exception() - try: - self.message_queue.flush() - self.destroy() - except Exception: - self.exception() + atexit.register(_exit_handler) - atexit.register(_exit_handler) + register_signal(signal.SIGQUIT, _exit_handler, once = True) + register_signal(signal.SIGINT, _exit_handler, once = True) + register_signal(signal.SIGTERM, _exit_handler, once = True) + register_signal(signal.SIGHUP, _exit_handler, once = True) - 
register_signal(signal.SIGQUIT, _exit_handler, ignore_default = False) - register_signal(signal.SIGINT, _exit_handler, ignore_default = False) - register_signal(signal.SIGTERM, _exit_handler, ignore_default = False) - register_signal(signal.SIGHUP, _exit_handler, ignore_default = False) self.agent_started = True self.log('Agent started') @@ -178,7 +180,6 @@ def error(self, message): def exception(self): if self.get_option('debug'): - self.print_err(sys.exc_info()[0]) traceback.print_exc() diff --git a/stackimpact/api_request.py b/stackimpact/api_request.py index 2fbf8de..c5d0192 100644 --- a/stackimpact/api_request.py +++ b/stackimpact/api_request.py @@ -33,11 +33,11 @@ def post(self, endpoint, payload): 'Content-Encoding': 'gzip' } - host_name = 'host' + host_name = 'undefined' try: host_name = socket.gethostname() except Exception: - pass + self.agent.exception() req_body = { 'runtime_type': 'python', @@ -71,6 +71,7 @@ def post(self, endpoint, payload): headers = headers) response = urlopen(request, timeout = 20) + result_data = response.read() if response.info(): diff --git a/stackimpact/metric.py b/stackimpact/metric.py index c5bd9ca..82721aa 100644 --- a/stackimpact/metric.py +++ b/stackimpact/metric.py @@ -285,6 +285,13 @@ def normalize(self, factor): child.normalize(factor) + def floor(self): + self.measurement = int(self.measurement) + + for name, child in self.children.items(): + child.floor() + + def to_dict(self): children_map = [] for name, child in self.children.items(): diff --git a/stackimpact/reporters/allocation_reporter.py b/stackimpact/reporters/allocation_reporter.py index cd2734e..2b1a213 100644 --- a/stackimpact/reporters/allocation_reporter.py +++ b/stackimpact/reporters/allocation_reporter.py @@ -130,6 +130,7 @@ def report(self): return self.profile.normalize(self.profile_duration) + self.profile.floor() self.profile.filter(2, 1000, float("inf")) metric = Metric(self.agent, Metric.TYPE_PROFILE, Metric.CATEGORY_MEMORY_PROFILE, 
Metric.NAME_UNCOLLECTED_ALLOCATIONS, Metric.UNIT_BYTE) diff --git a/stackimpact/reporters/block_reporter.py b/stackimpact/reporters/block_reporter.py index 7ea434a..3761fe1 100644 --- a/stackimpact/reporters/block_reporter.py +++ b/stackimpact/reporters/block_reporter.py @@ -189,6 +189,7 @@ def report(self): with self.profile_lock: self.block_profile.normalize(self.profile_duration) + self.block_profile.floor() self.block_profile.filter(2, 1, float("inf")) metric = Metric(self.agent, Metric.TYPE_PROFILE, Metric.CATEGORY_BLOCK_PROFILE, Metric.NAME_BLOCKING_CALL_TIMES, Metric.UNIT_MILLISECOND) diff --git a/stackimpact/reporters/process_reporter.py b/stackimpact/reporters/process_reporter.py index a44c03c..54ec5a5 100644 --- a/stackimpact/reporters/process_reporter.py +++ b/stackimpact/reporters/process_reporter.py @@ -19,7 +19,7 @@ def __init__(self, agent): def start(self): - self.report_timer = self.agent.schedule(5, 60, self.report) + self.report_timer = self.agent.schedule(60, 60, self.report) def destroy(self): diff --git a/stackimpact/runtime.py b/stackimpact/runtime.py index 4854c5b..3094383 100644 --- a/stackimpact/runtime.py +++ b/stackimpact/runtime.py @@ -122,7 +122,7 @@ def unpatch(obj, func_name): setattr(obj, func_name, getattr(wrapper, '__stackimpact_orig__')) -def register_signal(signal_number, handler_func, ignore_default = True): +def register_signal(signal_number, handler_func, once = False): prev_handler = None def _handler(signum, frame): @@ -130,8 +130,10 @@ def _handler(signum, frame): if not skip_prev: if callable(prev_handler): + if once: + signal.signal(signum, prev_handler) prev_handler(signum, frame) - elif prev_handler == signal.SIG_DFL and not ignore_default: + elif prev_handler == signal.SIG_DFL and once: signal.signal(signum, signal.SIG_DFL) os.kill(os.getpid(), signum) diff --git a/tests/agent_test.py b/tests/agent_test.py index 4a28c2c..12754ee 100644 --- a/tests/agent_test.py +++ b/tests/agent_test.py @@ -5,7 +5,7 @@ import 
stackimpact -# python -m unittest discover -s tests -p *_test.py +# python3 -m unittest discover -s tests -p *_test.py class AgentTestCase(unittest.TestCase): diff --git a/tests/runtime_test.py b/tests/runtime_test.py index 0fc5b94..f3d5903 100644 --- a/tests/runtime_test.py +++ b/tests/runtime_test.py @@ -30,7 +30,7 @@ def _handler(signum, frame): def _handler(signum, frame): result['handler'] += 1 - register_signal(signal.SIGUSR1, _handler, ignore_default = False) + register_signal(signal.SIGUSR1, _handler, once = True) os.kill(os.getpid(), signal.SIGUSR1) os.kill(os.getpid(), signal.SIGUSR1) diff --git a/tests/test.py b/tests/test.py deleted file mode 100644 index e071622..0000000 --- a/tests/test.py +++ /dev/null @@ -1,29 +0,0 @@ - -import threading -import collections -import signal - - - -class Sampler(object): - def __init__(self, interval=0.001): - self.stack_counts = collections.defaultdict(int) - self.interval = 0.001 - - def _sample(self, signum, frame): - stack = [] - while frame is not None: - formatted_frame = '{}({})'.format(frame.f_code.co_name, - frame.f_globals.get('__name__')) - stack.append(formatted_frame) - frame = frame.f_back - - formatted_stack = ';'.join(reversed(stack)) - self.stack_counts[formatted_stack] += 1 - signal.setitimer(signal.ITIMER_PROF, self.interval, 0) - - def start(self): - signal.signal(signal.SIGPROF, self._sample) - - - \ No newline at end of file diff --git a/tests/test_server.py b/tests/test_server.py index 5bd13dc..c9e260f 100644 --- a/tests/test_server.py +++ b/tests/test_server.py @@ -21,6 +21,8 @@ def __init__(self, port, delay = None, handler_func = None): self.port = port RequestHandler.delay = delay RequestHandler.handler_func = [handler_func] + RequestHandler.response_data = '{}' + RequestHandler.response_code = 200 threading.Thread.__init__(self) self.server = HTTPServer(('localhost', self.port), RequestHandler) @@ -30,6 +32,9 @@ def get_request_data(self): def set_response_data(self, response_data): 
RequestHandler.response_data = response_data + def set_response_code(self, response_code): + RequestHandler.response_code = response_code + def run(self): self.server.handle_request() @@ -39,6 +44,7 @@ class RequestHandler(BaseHTTPRequestHandler): handler_func = None request_data = None response_data = '{}' + response_code = 200 def do_GET(self): @@ -48,7 +54,7 @@ def do_GET(self): if RequestHandler.handler_func: RequestHandler.handler_func[0]() - self.send_response(200) + self.send_response(RequestHandler.response_code) self.send_header('Content-Type', 'application/json') self.end_headers() self.wfile.write(RequestHandler.response_data.encode('utf-8')) @@ -64,7 +70,7 @@ def do_POST(self): decompressed_data = gzip.GzipFile(fileobj=BytesIO(self.rfile.read(content_len))).read() RequestHandler.request_data = decompressed_data.decode('utf-8') - self.send_response(200) + self.send_response(RequestHandler.response_code) self.send_header('Content-Type', 'application/json') self.end_headers() self.wfile.write(RequestHandler.response_data.encode('utf-8')) From f9dfe76069601f522642679a97b1c1130d821a88 Mon Sep 17 00:00:00 2001 From: Dmitri Melikyan Date: Tue, 26 Sep 2017 08:53:22 +0200 Subject: [PATCH 09/26] agent enable config --- README.md | 9 +- README.rst | 12 +- examples/flask_app.py | 7 +- setup.py | 2 +- stackimpact/agent.py | 31 ++--- stackimpact/config.py | 11 ++ stackimpact/config_loader.py | 28 ++++- .../{frame_selector.py => frame_cache.py} | 52 +------- stackimpact/message_queue.py | 2 +- stackimpact/metric.py | 9 +- stackimpact/profiler_scheduler.py | 2 +- stackimpact/reporters/allocation_reporter.py | 39 ++---- stackimpact/reporters/block_reporter.py | 112 ++++++++---------- stackimpact/reporters/cpu_reporter.py | 47 ++++++-- stackimpact/reporters/error_reporter.py | 24 ++-- stackimpact/reporters/process_reporter.py | 16 ++- ...e_selector_test.py => frame_cache_test.py} | 9 +- tests/profiler_scheduler_test.py | 2 +- tests/reporters/allocation_reporter_test.py | 
1 + tests/reporters/block_reporter_test.py | 12 +- tests/reporters/cpu_reporter_test.py | 1 + tests/reporters/error_reporter_test.py | 3 +- tests/reporters/process_reporter_test.py | 1 + 23 files changed, 214 insertions(+), 218 deletions(-) rename stackimpact/{frame_selector.py => frame_cache.py} (57%) rename tests/{frame_selector_test.py => frame_cache_test.py} (58%) diff --git a/README.md b/README.md index a646bf6..8c8ae72 100644 --- a/README.md +++ b/README.md @@ -8,11 +8,10 @@ StackImpact is a performance profiler for production applications. It gives deve #### Features -* Automatic hot spot profiling for CPU, memory allocations, blocking calls -* Automatic bottleneck tracing for HTTP handlers and other libraries -* Exception monitoring +* Continuous hot spot profiling for CPU, memory allocations, blocking calls +* Error monitoring * Health monitoring including CPU, memory, garbage collection and other runtime metrics -* Anomaly alerts on most important metrics +* Anomaly detection * Multiple account users for team collaboration Learn more on the [features](https://stackimpact.com/features/) page (with screenshots). @@ -77,7 +76,7 @@ All initialization options: * `host_name` (Optional) By default, host name will be the OS hostname. * `debug` (Optional) Enables debug logging. * `cpu_profiler_disabled`, `allocation_profiler_disabled`, `block_profiler_disabled`, `error_profiler_disabled` (Optional) Disables respective profiler when `True`. -* `include_agent_frames`, `include_system_frames` (Optional) Set to `True` to not exclude agent and/or system stack frames from profiles. +* `include_agent_frames` (Optional) Set to `True` to not exclude agent stack frames from profile call graphs. * `auto_destroy` (Optional) Set to `False` to disable agent's exit handlers. If necessary, call `destroy()` to gracefully shutdown the agent. 
diff --git a/README.rst b/README.rst index 2bb668b..cbfadf0 100644 --- a/README.rst +++ b/README.rst @@ -19,13 +19,12 @@ errors and runtime metrics. Learn more at Features ^^^^^^^^ -- Automatic hot spot profiling for CPU, memory allocations, blocking +- Continuous hot spot profiling for CPU, memory allocations, blocking calls -- Automatic bottleneck tracing for HTTP handlers and other libraries -- Exception monitoring +- Error monitoring - Health monitoring including CPU, memory, garbage collection and other runtime metrics -- Anomaly alerts on most important metrics +- Anomaly detection - Multiple account users for team collaboration Learn more on the `features `__ page @@ -106,9 +105,8 @@ All initialization options: - ``cpu_profiler_disabled``, ``allocation_profiler_disabled``, ``block_profiler_disabled``, ``error_profiler_disabled`` (Optional) Disables respective profiler when ``True``. -- ``include_agent_frames``, ``include_system_frames`` (Optional) Set to - ``True`` to not exclude agent and/or system stack frames from - profiles. +- ``include_agent_frames`` (Optional) Set to ``True`` to not exclude + agent stack frames from profile call graphs. - ``auto_destroy`` (Optional) Set to ``False`` to disable agent's exit handlers. If necessary, call ``destroy()`` to gracefully shutdown the agent. 
diff --git a/examples/flask_app.py b/examples/flask_app.py index 48ef704..b3851af 100644 --- a/examples/flask_app.py +++ b/examples/flask_app.py @@ -1,6 +1,4 @@ -# export AGENT_KEY=agent_key_here -# gunicorn --workers 2 --bind 127.0.0.1:5010 flask_app:app -# gunicorn --workers 2 -k gevent --bind 127.0.0.1:5010 flask_app:app +#env AGENT_KEY=agnetkeyhere FLASK_APP=examples/flask_app.py flask run -p 5010 from __future__ import print_function @@ -22,7 +20,7 @@ # python 3 from urllib.request import urlopen - +sys.path.append(".") import stackimpact @@ -30,6 +28,7 @@ # StackImpact agent initialization agent = stackimpact.start( agent_key = os.environ['AGENT_KEY'], + dashboard_address = os.environ['DASHBOARD_ADDRESS'], app_name = 'ExamplePythonFlaskApp', app_version = '1.0.0', debug = True) diff --git a/setup.py b/setup.py index a8b5c4a..792dafa 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ setup( name = 'stackimpact', - version = '1.0.5', + version = '1.1.0', description = 'StackImpact Python Agent', author = 'StackImpact', author_email = 'devops@stackimpact.com', diff --git a/stackimpact/agent.py b/stackimpact/agent.py index 085b718..fdbda27 100644 --- a/stackimpact/agent.py +++ b/stackimpact/agent.py @@ -16,7 +16,7 @@ from .config import Config from .config_loader import ConfigLoader from .message_queue import MessageQueue -from .frame_selector import FrameSelector +from .frame_cache import FrameCache from .reporters.process_reporter import ProcessReporter from .reporters.cpu_reporter import CPUReporter from .reporters.allocation_reporter import AllocationReporter @@ -26,7 +26,7 @@ class Agent: - AGENT_VERSION = "1.0.5" + AGENT_VERSION = "1.1.0" SAAS_DASHBOARD_ADDRESS = "https://agent-api.stackimpact.com" def __init__(self, **kwargs): @@ -42,7 +42,7 @@ def __init__(self, **kwargs): self.config = Config(self) self.config_loader = ConfigLoader(self) self.message_queue = MessageQueue(self) - self.frame_selector = FrameSelector(self) + self.frame_cache = 
FrameCache(self) self.process_reporter = ProcessReporter(self) self.cpu_reporter = CPUReporter(self) self.allocation_reporter = AllocationReporter(self) @@ -93,12 +93,10 @@ def start(self, **kwargs): self.config_loader.start() self.message_queue.start() - self.frame_selector.start() - self.process_reporter.start() - self.cpu_reporter.start() - self.allocation_reporter.start() - self.block_reporter.start() - self.error_reporter.start() + self.frame_cache.start() + + self.cpu_reporter.setup() + self.block_reporter.setup() # execute main_thread_func in main thread on signal def _signal_handler(signum, frame): @@ -147,14 +145,17 @@ def destroy(self): if self.agent_destroyed: return - self.config_loader.destroy() - self.message_queue.destroy() - self.frame_selector.destroy() - self.process_reporter.destroy() + self.config_loader.stop() + self.message_queue.stop() + self.frame_cache.stop() + self.cpu_reporter.stop() + self.allocation_reporter.stop() + self.block_reporter.stop() + self.error_reporter.stop() + self.process_reporter.stop() + self.cpu_reporter.destroy() - self.allocation_reporter.destroy() self.block_reporter.destroy() - self.error_reporter.destroy() self.agent_destroyed = True self.log('Agent destroyed') diff --git a/stackimpact/config.py b/stackimpact/config.py index 50d6426..2a8181b 100644 --- a/stackimpact/config.py +++ b/stackimpact/config.py @@ -3,9 +3,20 @@ class Config: def __init__(self, agent): self.agent = agent + self.agent_enabled = False self.profiling_disabled = False self.config_lock = threading.Lock() + + def set_agent_enabled(self, val): + with self.config_lock: + self.agent_enabled = val + + def is_agent_enabled(self): + with self.config_lock: + val = self.agent_enabled + return val + def set_profiling_disabled(self, val): with self.config_lock: self.profiling_disabled = val diff --git a/stackimpact/config_loader.py b/stackimpact/config_loader.py index 0467435..6ffac88 100644 --- a/stackimpact/config_loader.py +++ 
b/stackimpact/config_loader.py @@ -11,7 +11,7 @@ def start(self): self.load_timer = self.agent.schedule(2, 120, self.load) - def destroy(self): + def stop(self): if self.load_timer: self.load_timer.cancel() self.load_timer = None @@ -22,12 +22,36 @@ def load(self): api_request = APIRequest(self.agent) config = api_request.post('config', {}) - # profiling_enabled yes|no + # agent_enabled yes|no + if 'agent_enabled' in config: + self.agent.config.set_agent_enabled(config['agent_enabled'] == 'yes') + else: + self.agent.config.set_agent_enabled(False) + + # profiling_disabled yes|no if 'profiling_disabled' in config: self.agent.config.set_profiling_disabled(config['profiling_disabled'] == 'yes') else: self.agent.config.set_profiling_disabled(False) + + if self.agent.config.is_agent_enabled() and not self.agent.config.is_profiling_disabled(): + self.agent.cpu_reporter.start() + self.agent.allocation_reporter.start() + self.agent.block_reporter.start() + else: + self.agent.cpu_reporter.stop() + self.agent.allocation_reporter.stop() + self.agent.block_reporter.stop() + + if self.agent.config.is_agent_enabled(): + self.agent.error_reporter.start() + self.agent.process_reporter.start() + else: + self.agent.error_reporter.stop() + self.agent.process_reporter.stop() + + except Exception: self.agent.log('Error loading config') self.agent.exception() diff --git a/stackimpact/frame_selector.py b/stackimpact/frame_cache.py similarity index 57% rename from stackimpact/frame_selector.py rename to stackimpact/frame_cache.py index 6e9b79f..408a480 100644 --- a/stackimpact/frame_selector.py +++ b/stackimpact/frame_cache.py @@ -9,61 +9,31 @@ if runtime_info.GEVENT: import gevent -class FrameSelector: +class FrameCache: MAX_CACHE_SIZE = 2500 def __init__(self, agent): self.agent = agent self.agent_frame_cache = None self.system_frame_cache = None - self.http_frame_cache = None self.include_agent_frames = None self.include_system_frames = None - self.http_frame_regexp = None - 
self.agent_dir = os.path.dirname(os.path.realpath(__file__)) self.system_dir = os.path.dirname(os.path.realpath(threading.__file__)) if runtime_info.GEVENT: self.gevent_dir = os.path.dirname(os.path.realpath(gevent.__file__)) - def add_http_package(self, name): - try: - m = importlib.import_module(name) - if m and m.__file__: - self.add_http_frame_regexp(os.path.dirname(os.path.realpath(m.__file__))) - except Exception: - pass - - - def add_http_frame_regexp(self, regexp_str): - self.http_frame_regexp.append(re.compile(regexp_str, re.IGNORECASE)) - - def start(self): self.agent_frame_cache = dict() self.system_frame_cache = dict() - self.http_frame_cache = dict() self.include_agent_frames = self.agent.get_option('include_agent_frames') self.include_system_frames = self.agent.get_option('include_system_frames') - self.http_frame_regexp = [] - self.add_http_package('gunicorn') - self.add_http_package('waitress') - self.add_http_package('werkzeug') - self.add_http_package('flask') - self.add_http_package('django') - self.add_http_package('pyramid') - self.add_http_package('cherrypy') - self.add_http_package('tornado') - self.add_http_package('web2py') - self.add_http_package('bottle') - - - def destroy(self): + def stop(self): pass @@ -98,21 +68,3 @@ def is_system_frame(self, filename): self.system_frame_cache[filename] = system_frame return system_frame - - - def is_http_frame(self, filename): - if filename in self.http_frame_cache: - return self.http_frame_cache[filename] - - http_frame = False - - for r in self.http_frame_regexp: - if r.search(filename): - http_frame = True - break - - if len(self.http_frame_cache) < self.MAX_CACHE_SIZE: - self.http_frame_cache[filename] = http_frame - - return http_frame - diff --git a/stackimpact/message_queue.py b/stackimpact/message_queue.py index c36aae2..ce904dd 100644 --- a/stackimpact/message_queue.py +++ b/stackimpact/message_queue.py @@ -24,7 +24,7 @@ def start(self): self.flush_timer = self.agent.schedule(5, 5, 
self.flush) - def destroy(self): + def stop(self): if self.flush_timer: self.flush_timer.cancel() self.flush_timer = None diff --git a/stackimpact/metric.py b/stackimpact/metric.py index 82721aa..d7e5f7b 100644 --- a/stackimpact/metric.py +++ b/stackimpact/metric.py @@ -20,7 +20,6 @@ class Metric: CATEGORY_CPU_PROFILE = 'cpu-profile' CATEGORY_MEMORY_PROFILE = 'memory-profile' CATEGORY_BLOCK_PROFILE = 'block-profile' - CATEGORY_HTTP_TRACE = 'http-trace' CATEGORY_ERROR_PROFILE = 'error-profile' NAME_CPU_TIME = 'CPU time' @@ -36,7 +35,6 @@ class Metric: NAME_THREAD_COUNT = 'Active threads' NAME_UNCOLLECTED_ALLOCATIONS = 'Uncollected allocations' NAME_BLOCKING_CALL_TIMES = 'Blocking call times' - NAME_HTTP_TRANSACTION_BREAKDOWN = 'HTTP transaction breakdown' NAME_HANDLED_EXCEPTIONS = 'Handled exceptions' UNIT_NONE = '' @@ -285,6 +283,13 @@ def normalize(self, factor): child.normalize(factor) + def round(self): + self.measurement = round(self.measurement) + + for name, child in self.children.items(): + child.round() + + def floor(self): self.measurement = int(self.measurement) diff --git a/stackimpact/profiler_scheduler.py b/stackimpact/profiler_scheduler.py index 0606f65..0b32984 100644 --- a/stackimpact/profiler_scheduler.py +++ b/stackimpact/profiler_scheduler.py @@ -31,7 +31,7 @@ def random_delay(): self.report_timer = self.agent.schedule(self.report_interval, self.report_interval, self.execute_report) - def destroy(self): + def stop(self): if self.random_timer: self.random_timer.cancel() self.random_timer = None diff --git a/stackimpact/reporters/allocation_reporter.py b/stackimpact/reporters/allocation_reporter.py index 2b1a213..225d5af 100644 --- a/stackimpact/reporters/allocation_reporter.py +++ b/stackimpact/reporters/allocation_reporter.py @@ -21,6 +21,7 @@ class AllocationReporter: def __init__(self, agent): self.agent = agent + self.started = False self.profiler_scheduler = None self.profile = None self.profile_duration = 0 @@ -34,27 +35,22 @@ def 
start(self): self.agent.log('Memory allocation profiling is available for Python 3.4 or higher') return + if self.started: + return + self.started = True + self.reset() self.profiler_scheduler = ProfilerScheduler(self.agent, 20, 5, 120, self.record, self.report) self.profiler_scheduler.start() - def destroy(self): - if self.agent.get_option('allocation_profiler_disabled'): + def stop(self): + if not self.started: return + self.started = False - if self.profiler_scheduler: - self.profiler_scheduler.destroy() - - - def metrics(self): - if runtime_info.OS_LINUX: - return { - 'vm-size': read_vm_size() - } - - return None + self.profiler_scheduler.stop() def reset(self): @@ -63,9 +59,6 @@ def reset(self): def record(self, max_duration): - if self.agent.config.is_profiling_disabled(): - return - self.agent.log('Activating memory allocation profiler.') def start(): @@ -100,36 +93,28 @@ def process_snapshot(self, snapshot, duration): if stat.traceback: skip_stack = False for frame in stat.traceback: - if self.agent.frame_selector.is_agent_frame(frame.filename): + if self.agent.frame_cache.is_agent_frame(frame.filename): skip_stack = True break if skip_stack: continue current_node = self.profile - current_node.increment(stat.size, stat.count) - for frame in reversed(stat.traceback): if frame.filename == '': continue - if self.agent.frame_selector.is_system_frame(frame.filename): - continue - frame_name = '{0}:{1}'.format(frame.filename, frame.lineno) - current_node = current_node.find_or_add_child(frame_name) - current_node.increment(stat.size, stat.count) + current_node.increment(stat.size, stat.count) def report(self): - if self.agent.config.is_profiling_disabled(): - return - if self.profile_duration == 0: return self.profile.normalize(self.profile_duration) + self.propagate() self.profile.floor() self.profile.filter(2, 1000, float("inf")) diff --git a/stackimpact/reporters/block_reporter.py b/stackimpact/reporters/block_reporter.py index 3761fe1..431afb6 100644 --- 
a/stackimpact/reporters/block_reporter.py +++ b/stackimpact/reporters/block_reporter.py @@ -24,16 +24,17 @@ class BlockReporter: def __init__(self, agent): self.agent = agent + self.setup_done = False + self.started = False self.profiler_scheduler = None self.profile_lock = threading.Lock() - self.block_profile = None - self.http_profile = None + self.profile = None self.profile_duration = 0 self.prev_signal_handler = None self.handler_active = False - def start(self): + def setup(self): if self.agent.get_option('block_profiler_disabled'): return @@ -64,34 +65,46 @@ def _sample(signum, signal_frame): self.handler_active = False self.prev_signal_handler = signal.signal(signal.SIGALRM, _sample) + + self.setup_done = True + + + def start(self): + if not self.setup_done: + return + + if self.started: + return + self.started = True + self.reset() self.profiler_scheduler = ProfilerScheduler(self.agent, 10, 2, 120, self.record, self.report) self.profiler_scheduler.start() + def stop(self): + if not self.started: + return; + self.started = False + + self.profiler_scheduler.stop() + + def destroy(self): - if self.agent.get_option('block_profiler_disabled'): + if not self.setup_done: return - if self.prev_signal_handler != None: - signal.setitimer(signal.ITIMER_REAL, 0) - signal.signal(signal.SIGALRM, self.prev_signal_handler) - - if self.profiler_scheduler: - self.profiler_scheduler.destroy() + signal.setitimer(signal.ITIMER_REAL, 0) + signal.signal(signal.SIGALRM, self.prev_signal_handler) def reset(self): - self.block_profile = Breakdown('root') - self.http_profile = Breakdown('root') + self.profile = Breakdown('root') self.profile_duration = 0 def record(self, duration): - if self.agent.config.is_profiling_disabled(): - return - self.agent.log('Activating block profiler.') signal.setitimer(signal.ITIMER_REAL, self.SAMPLING_RATE, self.SAMPLING_RATE) @@ -102,11 +115,11 @@ def record(self, duration): self.profile_duration += duration - self.agent.log('Block profiler CPU 
overhead per activity second: {0} seconds'.format(self.block_profile._overhead / self.profile_duration)) + self.agent.log('Block profiler CPU overhead per activity second: {0} seconds'.format(self.profile._overhead / self.profile_duration)) def process_sample(self, signal_frame, sample_time, main_thread_id): - if self.block_profile: + if self.profile: start = time.clock() current_frames = sys._current_frames() @@ -117,21 +130,23 @@ def process_sample(self, signal_frame, sample_time, main_thread_id): stack = self.recover_stack(thread_frame) if stack: - self.update_block_profile(stack, sample_time) - self.update_http_profile(stack, sample_time) + current_node = self.profile + for frame in reversed(stack): + current_node = current_node.find_or_add_child(str(frame)) + current_node.increment(sample_time, 1) thread_id, thread_frame, stack = None, None, None items = None current_frames = None - self.block_profile._overhead += (time.clock() - start) - + self.profile._overhead += (time.clock() - start) def recover_stack(self, thread_frame): stack = [] + system_only = True depth = 0 while thread_frame is not None and depth <= self.MAX_TRACEBACK_SIZE: if thread_frame.f_code and thread_frame.f_code.co_name and thread_frame.f_code.co_filename: @@ -139,47 +154,28 @@ def recover_stack(self, thread_frame): filename = thread_frame.f_code.co_filename lineno = thread_frame.f_lineno - if self.agent.frame_selector.is_agent_frame(filename): + if self.agent.frame_cache.is_agent_frame(filename): return None - if not self.agent.frame_selector.is_system_frame(filename): - frame = Frame(func_name, filename, lineno) - stack.append(frame) + if not self.agent.frame_cache.is_system_frame(filename): + system_only = False + + frame = Frame(func_name, filename, lineno) + stack.append(frame) thread_frame = thread_frame.f_back depth += 1 + if system_only: + return None + if len(stack) == 0: return None else: return stack - def update_block_profile(self, stack, sample_time): - current_node = 
self.block_profile - current_node.increment(sample_time, 1) - - for frame in reversed(stack): - current_node = current_node.find_or_add_child(str(frame)) - current_node.increment(sample_time, 1) - - - def update_http_profile(self, stack, sample_time): - include = False - for frame in stack: - if self.agent.frame_selector.is_http_frame(frame.filename): - include = True - - if include: - current_node = self.http_profile - current_node.increment(sample_time, 1) - - for frame in reversed(stack): - current_node = current_node.find_or_add_child(str(frame)) - current_node.increment(sample_time, 1) - - def report(self): if self.agent.config.is_profiling_disabled(): return @@ -188,21 +184,13 @@ def report(self): return with self.profile_lock: - self.block_profile.normalize(self.profile_duration) - self.block_profile.floor() - self.block_profile.filter(2, 1, float("inf")) + self.profile.normalize(self.profile_duration) + self.profile.propagate() + self.profile.floor() + self.profile.filter(2, 1, float("inf")) metric = Metric(self.agent, Metric.TYPE_PROFILE, Metric.CATEGORY_BLOCK_PROFILE, Metric.NAME_BLOCKING_CALL_TIMES, Metric.UNIT_MILLISECOND) - measurement = metric.create_measurement(Metric.TRIGGER_TIMER, self.block_profile.measurement, 1, self.block_profile) + measurement = metric.create_measurement(Metric.TRIGGER_TIMER, self.profile.measurement, 1, self.profile) self.agent.message_queue.add('metric', metric.to_dict()) - if self.block_profile.num_samples > 0 and self.http_profile.num_samples > 0: - self.http_profile.normalize(self.profile_duration) - self.http_profile.convert_to_percent(self.block_profile.measurement) - self.block_profile.filter(2, 1, 100) - - metric = Metric(self.agent, Metric.TYPE_PROFILE, Metric.CATEGORY_HTTP_TRACE, Metric.NAME_HTTP_TRANSACTION_BREAKDOWN, Metric.UNIT_PERCENT) - measurement = metric.create_measurement(Metric.TRIGGER_TIMER, self.http_profile.measurement, None, self.http_profile) - self.agent.message_queue.add('metric', metric.to_dict()) 
- self.reset() diff --git a/stackimpact/reporters/cpu_reporter.py b/stackimpact/reporters/cpu_reporter.py index 2972037..04bfe52 100644 --- a/stackimpact/reporters/cpu_reporter.py +++ b/stackimpact/reporters/cpu_reporter.py @@ -22,6 +22,8 @@ class CPUReporter: def __init__(self, agent): self.agent = agent + self.setup_done = False + self.started = False self.profiler_scheduler = None self.profile = None self.profile_lock = threading.Lock() @@ -30,7 +32,7 @@ def __init__(self, agent): self.handler_active = False - def start(self): + def setup(self): if self.agent.get_option('cpu_profiler_disabled'): return @@ -54,22 +56,44 @@ def _sample(signum, signal_frame): self.prev_signal_handler = signal.signal(signal.SIGPROF, _sample) + self.setup_done = True + + + def start(self): + if self.agent.get_option('cpu_profiler_disabled'): + return + + if not runtime_info.OS_LINUX and not runtime_info.OS_DARWIN: + self.agent.log('CPU profiler is only supported on Linux and OS X.') + return + + if not self.setup_done: + return + + if self.started: + return; + self.started = True + self.reset() self.profiler_scheduler = ProfilerScheduler(self.agent, 10, 2, 120, self.record, self.report) self.profiler_scheduler.start() - def destroy(self): - if self.agent.get_option('cpu_profiler_disabled'): + def stop(self): + if not self.started: return + self.started = False - if self.prev_signal_handler != None: - signal.setitimer(signal.ITIMER_PROF, 0) - signal.signal(signal.SIGPROF, self.prev_signal_handler) + self.profiler_scheduler.stop() - if self.profiler_scheduler: - self.profiler_scheduler.destroy() + + def destroy(self): + if not self.setup_done: + return + + signal.setitimer(signal.ITIMER_PROF, 0) + signal.signal(signal.SIGPROF, self.prev_signal_handler) def reset(self): @@ -118,12 +142,11 @@ def recover_stack(self, signal_frame): filename = signal_frame.f_code.co_filename lineno = signal_frame.f_lineno - if self.agent.frame_selector.is_agent_frame(filename): + if 
self.agent.frame_cache.is_agent_frame(filename): return None - if not self.agent.frame_selector.is_system_frame(filename): - frame = Frame(func_name, filename, lineno) - stack.append(frame) + frame = Frame(func_name, filename, lineno) + stack.append(frame) signal_frame = signal_frame.f_back diff --git a/stackimpact/reporters/error_reporter.py b/stackimpact/reporters/error_reporter.py index 47f1bf0..c459373 100644 --- a/stackimpact/reporters/error_reporter.py +++ b/stackimpact/reporters/error_reporter.py @@ -16,6 +16,7 @@ class ErrorReporter: def __init__(self, agent): self.agent = agent + self.started = False self.process_timer = None self.report_timer = None self.exc_queue = collections.deque() @@ -28,6 +29,10 @@ def start(self): if self.agent.get_option('error_profiler_disabled'): return + if self.started: + return + self.started = True + self.reset_profile() self.process_timer = self.agent.schedule(1, 1, self.process) @@ -47,19 +52,18 @@ def _exc_info(ret): patch(sys, 'exc_info', None, _exc_info) - def destroy(self): - if self.agent.get_option('error_profiler_disabled'): + def stop(self): + if not self.started: return + self.started = False unpatch(sys, 'exc_info') - if self.process_timer: - self.process_timer.cancel() - self.process_timer = None + self.process_timer.cancel() + self.process_timer = None - if self.report_timer: - self.report_timer.cancel() - self.report_timer = None + self.report_timer.cancel() + self.report_timer = None def reset_profile(self): @@ -97,10 +101,10 @@ def recover_stack(self, exc): filename = tb_frame[0] lineno = tb_frame[1] - if self.agent.frame_selector.is_agent_frame(filename): + if self.agent.frame_cache.is_agent_frame(filename): return None - if not self.agent.frame_selector.is_system_frame(filename): + if not self.agent.frame_cache.is_system_frame(filename): frame = Frame(func_name, filename, lineno) stack.append(frame) diff --git a/stackimpact/reporters/process_reporter.py b/stackimpact/reporters/process_reporter.py index 
54ec5a5..78be764 100644 --- a/stackimpact/reporters/process_reporter.py +++ b/stackimpact/reporters/process_reporter.py @@ -14,18 +14,26 @@ class ProcessReporter: def __init__(self, agent): self.agent = agent + self.started = False self.metrics = {} self.report_timer = None def start(self): + if self.started: + return + self.started = True + self.report_timer = self.agent.schedule(60, 60, self.report) - def destroy(self): - if self.report_timer: - self.report_timer.cancel() - self.report_timer = None + def stop(self): + if not self.started: + return + self.started = False + + self.report_timer.cancel() + self.report_timer = None def report(self): diff --git a/tests/frame_selector_test.py b/tests/frame_cache_test.py similarity index 58% rename from tests/frame_selector_test.py rename to tests/frame_cache_test.py index 8d0ca04..92ab785 100644 --- a/tests/frame_selector_test.py +++ b/tests/frame_cache_test.py @@ -6,7 +6,7 @@ import stackimpact -class FrameSelectorTestCase(unittest.TestCase): +class FrameCacheTestCase(unittest.TestCase): def test_skip_stack(self): stackimpact._agent = None @@ -18,13 +18,10 @@ def test_skip_stack(self): ) test_agent_file = os.path.realpath(stackimpact.__file__) - self.assertTrue(agent.frame_selector.is_agent_frame(test_agent_file)) + self.assertTrue(agent.frame_cache.is_agent_frame(test_agent_file)) test_system_file = os.path.realpath(threading.__file__) - self.assertTrue(agent.frame_selector.is_system_frame(test_system_file)) - - agent.frame_selector.add_http_frame_regexp(os.path.join('a', 'b', 'c')) - self.assertTrue(agent.frame_selector.is_http_frame(os.path.join('a', 'b', 'c', 'd'))) + self.assertTrue(agent.frame_cache.is_system_frame(test_system_file)) agent.destroy() diff --git a/tests/profiler_scheduler_test.py b/tests/profiler_scheduler_test.py index 686cd37..5997331 100644 --- a/tests/profiler_scheduler_test.py +++ b/tests/profiler_scheduler_test.py @@ -38,7 +38,7 @@ def report_func(): self.assertFalse(stats["records"] < 10) 
self.assertFalse(stats["reports"] < 2) - ps.destroy() + ps.stop() agent.destroy() diff --git a/tests/reporters/allocation_reporter_test.py b/tests/reporters/allocation_reporter_test.py index 96f9e60..3dccfe5 100644 --- a/tests/reporters/allocation_reporter_test.py +++ b/tests/reporters/allocation_reporter_test.py @@ -21,6 +21,7 @@ def test_record_allocation_profile(self): app_name = 'TestPythonApp', debug = True ) + agent.allocation_reporter.start() mem1 = [] def mem_leak(n = 100000): diff --git a/tests/reporters/block_reporter_test.py b/tests/reporters/block_reporter_test.py index e348a73..e7cc7d7 100644 --- a/tests/reporters/block_reporter_test.py +++ b/tests/reporters/block_reporter_test.py @@ -29,6 +29,7 @@ def test_record_block_profile(self): app_name = 'TestPythonApp', debug = True ) + agent.block_reporter.start() lock = threading.Lock() event = threading.Event() @@ -64,7 +65,6 @@ def url_wait(): result = {} def record(): - agent.frame_selector.add_http_frame_regexp(os.path.join('tests', 'test_server.py')) agent.block_reporter.record(2) record_t = threading.Thread(target=record) @@ -95,13 +95,11 @@ def record(): record_t.join() - #print(agent.block_reporter.block_profile) - #print(agent.block_reporter.http_profile) + #print(agent.block_reporter.profile) - self.assertTrue('lock_wait' in str(agent.block_reporter.block_profile)) - self.assertTrue('event_wait' in str(agent.block_reporter.block_profile)) - self.assertTrue('url_wait' in str(agent.block_reporter.block_profile)) - self.assertTrue('handler' in str(agent.block_reporter.http_profile)) + self.assertTrue('lock_wait' in str(agent.block_reporter.profile)) + self.assertTrue('event_wait' in str(agent.block_reporter.profile)) + self.assertTrue('url_wait' in str(agent.block_reporter.profile)) agent.destroy() diff --git a/tests/reporters/cpu_reporter_test.py b/tests/reporters/cpu_reporter_test.py index 67f411b..0b3c9f4 100644 --- a/tests/reporters/cpu_reporter_test.py +++ b/tests/reporters/cpu_reporter_test.py 
@@ -23,6 +23,7 @@ def test_record_profile(self): app_name = 'TestPythonApp', debug = True ) + agent.cpu_reporter.start() def record(): agent.cpu_reporter.record(2) diff --git a/tests/reporters/error_reporter_test.py b/tests/reporters/error_reporter_test.py index 6818d10..f8ec719 100644 --- a/tests/reporters/error_reporter_test.py +++ b/tests/reporters/error_reporter_test.py @@ -20,6 +20,7 @@ def test_add_exception(self): app_name = 'TestPythonApp', debug = True ) + agent.error_reporter.start() try: raise ValueError('test_exc_1') @@ -29,7 +30,7 @@ def test_add_exception(self): time.sleep(1.1) profile_handled_exc = agent.error_reporter.profile - print(profile_handled_exc) + #print(profile_handled_exc) self.assertTrue('ValueError: test_exc_1' in str(profile_handled_exc)) self.assertTrue('test_add_exception' in str(profile_handled_exc)) diff --git a/tests/reporters/process_reporter_test.py b/tests/reporters/process_reporter_test.py index a15fc72..d196434 100644 --- a/tests/reporters/process_reporter_test.py +++ b/tests/reporters/process_reporter_test.py @@ -18,6 +18,7 @@ def test_report(self): app_name = 'TestPythonApp', debug = True ) + agent.process_reporter.start() agent.process_reporter.report() time.sleep(0.1) From e4da4ecc2e0c29918616361e1257a6d8a7e5b2dd Mon Sep 17 00:00:00 2001 From: Dmitri Melikyan Date: Tue, 26 Sep 2017 10:23:06 +0200 Subject: [PATCH 10/26] allocation profiler fix --- examples/flask_app.py | 1 - setup.py | 2 +- stackimpact/agent.py | 2 +- stackimpact/reporters/allocation_reporter.py | 2 +- 4 files changed, 3 insertions(+), 4 deletions(-) diff --git a/examples/flask_app.py b/examples/flask_app.py index b3851af..af3d439 100644 --- a/examples/flask_app.py +++ b/examples/flask_app.py @@ -28,7 +28,6 @@ # StackImpact agent initialization agent = stackimpact.start( agent_key = os.environ['AGENT_KEY'], - dashboard_address = os.environ['DASHBOARD_ADDRESS'], app_name = 'ExamplePythonFlaskApp', app_version = '1.0.0', debug = True) diff --git a/setup.py 
b/setup.py index 792dafa..6448f59 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ setup( name = 'stackimpact', - version = '1.1.0', + version = '1.1.1', description = 'StackImpact Python Agent', author = 'StackImpact', author_email = 'devops@stackimpact.com', diff --git a/stackimpact/agent.py b/stackimpact/agent.py index fdbda27..2191a05 100644 --- a/stackimpact/agent.py +++ b/stackimpact/agent.py @@ -26,7 +26,7 @@ class Agent: - AGENT_VERSION = "1.1.0" + AGENT_VERSION = "1.1.1" SAAS_DASHBOARD_ADDRESS = "https://agent-api.stackimpact.com" def __init__(self, **kwargs): diff --git a/stackimpact/reporters/allocation_reporter.py b/stackimpact/reporters/allocation_reporter.py index 225d5af..ef2105d 100644 --- a/stackimpact/reporters/allocation_reporter.py +++ b/stackimpact/reporters/allocation_reporter.py @@ -114,7 +114,7 @@ def report(self): return self.profile.normalize(self.profile_duration) - self.propagate() + self.profile.propagate() self.profile.floor() self.profile.filter(2, 1000, float("inf")) From 586c7674d951eb33a0127177dd1a4edf29d56b57 Mon Sep 17 00:00:00 2001 From: Dmitri Melikyan Date: Tue, 26 Sep 2017 13:43:04 +0200 Subject: [PATCH 11/26] overhead optimization --- examples/flask_app.py | 4 ++-- setup.py | 2 +- stackimpact/agent.py | 2 +- stackimpact/reporters/allocation_reporter.py | 2 +- stackimpact/reporters/process_reporter.py | 8 +++++++- 5 files changed, 12 insertions(+), 6 deletions(-) diff --git a/examples/flask_app.py b/examples/flask_app.py index af3d439..999695b 100644 --- a/examples/flask_app.py +++ b/examples/flask_app.py @@ -37,11 +37,11 @@ # Simulate CPU intensive work def simulate_cpu(): duration = 10 * 60 * 60 - usage = 20 + usage = 10 while True: for j in range(0, duration): - for i in range(0, usage * 20000): + for i in range(0, usage * 15000): text = "text1" + str(i) text = text + "text2" diff --git a/setup.py b/setup.py index 6448f59..1766216 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ setup( name = 'stackimpact', - 
version = '1.1.1', + version = '1.1.2', description = 'StackImpact Python Agent', author = 'StackImpact', author_email = 'devops@stackimpact.com', diff --git a/stackimpact/agent.py b/stackimpact/agent.py index 2191a05..2bbec4e 100644 --- a/stackimpact/agent.py +++ b/stackimpact/agent.py @@ -26,7 +26,7 @@ class Agent: - AGENT_VERSION = "1.1.1" + AGENT_VERSION = "1.1.2" SAAS_DASHBOARD_ADDRESS = "https://agent-api.stackimpact.com" def __init__(self, **kwargs): diff --git a/stackimpact/reporters/allocation_reporter.py b/stackimpact/reporters/allocation_reporter.py index ef2105d..354ed48 100644 --- a/stackimpact/reporters/allocation_reporter.py +++ b/stackimpact/reporters/allocation_reporter.py @@ -41,7 +41,7 @@ def start(self): self.reset() - self.profiler_scheduler = ProfilerScheduler(self.agent, 20, 5, 120, self.record, self.report) + self.profiler_scheduler = ProfilerScheduler(self.agent, 30, 4, 120, self.record, self.report) self.profiler_scheduler.start() diff --git a/stackimpact/reporters/process_reporter.py b/stackimpact/reporters/process_reporter.py index 78be764..0ab5bee 100644 --- a/stackimpact/reporters/process_reporter.py +++ b/stackimpact/reporters/process_reporter.py @@ -15,7 +15,7 @@ class ProcessReporter: def __init__(self, agent): self.agent = agent self.started = False - self.metrics = {} + self.metrics = None self.report_timer = None @@ -24,6 +24,8 @@ def start(self): return self.started = True + self.reset() + self.report_timer = self.agent.schedule(60, 60, self.report) @@ -36,6 +38,10 @@ def stop(self): self.report_timer = None + def reset(self): + self.metrics = {} + + def report(self): # CPU if runtime_info.OS_LINUX or runtime_info.OS_DARWIN: From 832ee43521997d8b937dd991c989d338eeba5a1a Mon Sep 17 00:00:00 2001 From: Dmitri Melikyan Date: Fri, 29 Sep 2017 09:14:14 +0200 Subject: [PATCH 12/26] readme update --- README.md | 2 +- README.rst | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 
8c8ae72..fed5cf5 100644 --- a/README.md +++ b/README.md @@ -93,4 +93,4 @@ To enable debug logging, add `debug = True` to startup options. If the debug log ## Overhead -The agent overhead is measured to be less than 1% for applications under high load. +The agent overhead is measured to be less than 1% for applications under high load. For applications that are horizontally scaled to multiple processes, StackImpact agents are only active on a small subset (adjustable) of the processes at any point of time, therefore the total overhead is much lower. diff --git a/README.rst b/README.rst index cbfadf0..df585f8 100644 --- a/README.rst +++ b/README.rst @@ -130,4 +130,7 @@ Overhead -------- The agent overhead is measured to be less than 1% for applications under -high load. +high load. For applications that are horizontally scaled to multiple +processes, StackImpact agents are only active on a small subset +(adjustable) of the processes at any point of time, therefore the total +overhead is much lower. 
From ca08a038a5a28a39b1828f6395877253e6d75dd3 Mon Sep 17 00:00:00 2001 From: Dmitri Melikyan Date: Mon, 2 Oct 2017 12:22:06 +0200 Subject: [PATCH 13/26] OS check in allocation reporter --- setup.py | 2 +- stackimpact/agent.py | 2 +- stackimpact/config_loader.py | 2 ++ stackimpact/reporters/allocation_reporter.py | 2 +- tests/reporters/allocation_reporter_test.py | 4 ++-- 5 files changed, 7 insertions(+), 5 deletions(-) diff --git a/setup.py b/setup.py index 1766216..3ab6782 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ setup( name = 'stackimpact', - version = '1.1.2', + version = '1.1.3', description = 'StackImpact Python Agent', author = 'StackImpact', author_email = 'devops@stackimpact.com', diff --git a/stackimpact/agent.py b/stackimpact/agent.py index 2bbec4e..d4a359b 100644 --- a/stackimpact/agent.py +++ b/stackimpact/agent.py @@ -26,7 +26,7 @@ class Agent: - AGENT_VERSION = "1.1.2" + AGENT_VERSION = "1.1.3" SAAS_DASHBOARD_ADDRESS = "https://agent-api.stackimpact.com" def __init__(self, **kwargs): diff --git a/stackimpact/config_loader.py b/stackimpact/config_loader.py index 6ffac88..c0752e5 100644 --- a/stackimpact/config_loader.py +++ b/stackimpact/config_loader.py @@ -47,9 +47,11 @@ def load(self): if self.agent.config.is_agent_enabled(): self.agent.error_reporter.start() self.agent.process_reporter.start() + self.agent.log('Agent activated') else: self.agent.error_reporter.stop() self.agent.process_reporter.stop() + self.agent.log('Agent deactivated') except Exception: diff --git a/stackimpact/reporters/allocation_reporter.py b/stackimpact/reporters/allocation_reporter.py index 354ed48..4dd88e5 100644 --- a/stackimpact/reporters/allocation_reporter.py +++ b/stackimpact/reporters/allocation_reporter.py @@ -31,7 +31,7 @@ def start(self): if self.agent.get_option('allocation_profiler_disabled'): return - if not min_version(3, 4): + if not (runtime_info.OS_LINUX or runtime_info.OS_DARWIN) or not min_version(3, 4): self.agent.log('Memory allocation 
profiling is available for Python 3.4 or higher') return diff --git a/tests/reporters/allocation_reporter_test.py b/tests/reporters/allocation_reporter_test.py index 3dccfe5..51610c0 100644 --- a/tests/reporters/allocation_reporter_test.py +++ b/tests/reporters/allocation_reporter_test.py @@ -5,13 +5,13 @@ import threading import stackimpact -from stackimpact.runtime import min_version +from stackimpact.runtime import min_version, runtime_info class AllocationReporterTestCase(unittest.TestCase): def test_record_allocation_profile(self): - if not min_version(3, 4): + if not (runtime_info.OS_LINUX or runtime_info.OS_DARWIN) or not min_version(3, 4): return stackimpact._agent = None From c578dbf2b232989a8436ebd98eb82b46af3c3f79 Mon Sep 17 00:00:00 2001 From: Dmitri Melikyan Date: Mon, 2 Oct 2017 16:58:04 +0200 Subject: [PATCH 14/26] windows support fixes --- setup.py | 2 +- stackimpact/agent.py | 14 ++++---- stackimpact/reporters/allocation_reporter.py | 6 +++- stackimpact/reporters/block_reporter.py | 4 +-- stackimpact/reporters/cpu_reporter.py | 4 +-- stackimpact/reporters/process_reporter.py | 4 +-- tests/agent_test.py | 4 +++ tests/reporters/allocation_reporter_test.py | 2 +- tests/reporters/block_reporter_test.py | 4 ++- tests/reporters/cpu_reporter_test.py | 2 +- tests/reporters/process_reporter_test.py | 9 +++-- tests/runtime_test.py | 35 +++++++++++--------- 12 files changed, 54 insertions(+), 36 deletions(-) diff --git a/setup.py b/setup.py index 3ab6782..1544370 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ setup( name = 'stackimpact', - version = '1.1.3', + version = '1.1.4', description = 'StackImpact Python Agent', author = 'StackImpact', author_email = 'devops@stackimpact.com', diff --git a/stackimpact/agent.py b/stackimpact/agent.py index d4a359b..b544d79 100644 --- a/stackimpact/agent.py +++ b/stackimpact/agent.py @@ -26,7 +26,7 @@ class Agent: - AGENT_VERSION = "1.1.3" + AGENT_VERSION = "1.1.4" SAAS_DASHBOARD_ADDRESS = 
"https://agent-api.stackimpact.com" def __init__(self, **kwargs): @@ -110,7 +110,8 @@ def _signal_handler(signum, frame): return True - register_signal(signal.SIGUSR2, _signal_handler) + if not runtime_info.OS_WIN: + register_signal(signal.SIGUSR2, _signal_handler) if self.get_option('auto_destroy') == False: # destroy agent on exit @@ -127,10 +128,11 @@ def _exit_handler(*arg): atexit.register(_exit_handler) - register_signal(signal.SIGQUIT, _exit_handler, once = True) - register_signal(signal.SIGINT, _exit_handler, once = True) - register_signal(signal.SIGTERM, _exit_handler, once = True) - register_signal(signal.SIGHUP, _exit_handler, once = True) + if not runtime_info.OS_WIN: + register_signal(signal.SIGQUIT, _exit_handler, once = True) + register_signal(signal.SIGINT, _exit_handler, once = True) + register_signal(signal.SIGTERM, _exit_handler, once = True) + register_signal(signal.SIGHUP, _exit_handler, once = True) self.agent_started = True diff --git a/stackimpact/reporters/allocation_reporter.py b/stackimpact/reporters/allocation_reporter.py index 4dd88e5..98fa3fe 100644 --- a/stackimpact/reporters/allocation_reporter.py +++ b/stackimpact/reporters/allocation_reporter.py @@ -31,7 +31,11 @@ def start(self): if self.agent.get_option('allocation_profiler_disabled'): return - if not (runtime_info.OS_LINUX or runtime_info.OS_DARWIN) or not min_version(3, 4): + if runtime_info.OS_WIN: + self.agent.log('Memory allocation profiler is not available on Windows.') + return + + if not min_version(3, 4): self.agent.log('Memory allocation profiling is available for Python 3.4 or higher') return diff --git a/stackimpact/reporters/block_reporter.py b/stackimpact/reporters/block_reporter.py index 431afb6..4ef013d 100644 --- a/stackimpact/reporters/block_reporter.py +++ b/stackimpact/reporters/block_reporter.py @@ -38,8 +38,8 @@ def setup(self): if self.agent.get_option('block_profiler_disabled'): return - if not runtime_info.OS_LINUX and not runtime_info.OS_DARWIN: - 
self.agent.log('Block profiler is only supported on Linux and OS X.') + if runtime_info.OS_WIN: + self.agent.log('Block profiler is not available on Windows.') return sample_time = self.SAMPLING_RATE * 1000 diff --git a/stackimpact/reporters/cpu_reporter.py b/stackimpact/reporters/cpu_reporter.py index 04bfe52..43461cd 100644 --- a/stackimpact/reporters/cpu_reporter.py +++ b/stackimpact/reporters/cpu_reporter.py @@ -36,8 +36,8 @@ def setup(self): if self.agent.get_option('cpu_profiler_disabled'): return - if not runtime_info.OS_LINUX and not runtime_info.OS_DARWIN: - self.agent.log('CPU profiler is only supported on Linux and OS X.') + if runtime_info.OS_WIN: + self.agent.log('CPU profiler is not available on Windows.') return def _sample(signum, signal_frame): diff --git a/stackimpact/reporters/process_reporter.py b/stackimpact/reporters/process_reporter.py index 0ab5bee..3bb4af8 100644 --- a/stackimpact/reporters/process_reporter.py +++ b/stackimpact/reporters/process_reporter.py @@ -44,7 +44,7 @@ def reset(self): def report(self): # CPU - if runtime_info.OS_LINUX or runtime_info.OS_DARWIN: + if not runtime_info.OS_WIN: cpu_time = read_cpu_time() if cpu_time != None: cpu_time_metric = self.report_metric(Metric.TYPE_COUNTER, Metric.CATEGORY_CPU, Metric.NAME_CPU_TIME, Metric.UNIT_NANOSECOND, cpu_time) @@ -59,7 +59,7 @@ def report(self): # Memory - if runtime_info.OS_LINUX or runtime_info.OS_DARWIN: + if not runtime_info.OS_WIN: max_rss = read_max_rss() if max_rss != None: self.report_metric(Metric.TYPE_STATE, Metric.CATEGORY_MEMORY, Metric.NAME_MAX_RSS, Metric.UNIT_KILOBYTE, max_rss) diff --git a/tests/agent_test.py b/tests/agent_test.py index 12754ee..8cdd69f 100644 --- a/tests/agent_test.py +++ b/tests/agent_test.py @@ -3,6 +3,7 @@ import threading import stackimpact +from stackimpact.runtime import runtime_info # python3 -m unittest discover -s tests -p *_test.py @@ -10,6 +11,9 @@ class AgentTestCase(unittest.TestCase): def test_run_in_main_thread(self): + if 
runtime_info.OS_WIN: + return + stackimpact._agent = None agent = stackimpact.start( dashboard_address = 'http://localhost:5001', diff --git a/tests/reporters/allocation_reporter_test.py b/tests/reporters/allocation_reporter_test.py index 51610c0..7b722d1 100644 --- a/tests/reporters/allocation_reporter_test.py +++ b/tests/reporters/allocation_reporter_test.py @@ -11,7 +11,7 @@ class AllocationReporterTestCase(unittest.TestCase): def test_record_allocation_profile(self): - if not (runtime_info.OS_LINUX or runtime_info.OS_DARWIN) or not min_version(3, 4): + if runtime_info.OS_WIN or not min_version(3, 4): return stackimpact._agent = None diff --git a/tests/reporters/block_reporter_test.py b/tests/reporters/block_reporter_test.py index e7cc7d7..2db295f 100644 --- a/tests/reporters/block_reporter_test.py +++ b/tests/reporters/block_reporter_test.py @@ -20,8 +20,10 @@ class BlockReporterTestCase(unittest.TestCase): - def test_record_block_profile(self): + if runtime_info.OS_WIN: + return + stackimpact._agent = None agent = stackimpact.start( dashboard_address = 'http://localhost:5001', diff --git a/tests/reporters/cpu_reporter_test.py b/tests/reporters/cpu_reporter_test.py index 0b3c9f4..bbf15ea 100644 --- a/tests/reporters/cpu_reporter_test.py +++ b/tests/reporters/cpu_reporter_test.py @@ -13,7 +13,7 @@ class CPUReporterTestCase(unittest.TestCase): def test_record_profile(self): - if not runtime_info.OS_LINUX and not runtime_info.OS_DARWIN: + if runtime_info.OS_WIN: return stackimpact._agent = None diff --git a/tests/reporters/process_reporter_test.py b/tests/reporters/process_reporter_test.py index d196434..4c1280d 100644 --- a/tests/reporters/process_reporter_test.py +++ b/tests/reporters/process_reporter_test.py @@ -26,10 +26,13 @@ def test_report(self): metrics = agent.process_reporter.metrics - self.is_valid(metrics, Metric.TYPE_COUNTER, Metric.CATEGORY_CPU, Metric.NAME_CPU_TIME, 0, float("inf")) - self.is_valid(metrics, Metric.TYPE_STATE, Metric.CATEGORY_CPU, 
Metric.NAME_CPU_USAGE, 0, float("inf")) + if not runtime_info.OS_WIN: + self.is_valid(metrics, Metric.TYPE_COUNTER, Metric.CATEGORY_CPU, Metric.NAME_CPU_TIME, 0, float("inf")) + self.is_valid(metrics, Metric.TYPE_STATE, Metric.CATEGORY_CPU, Metric.NAME_CPU_USAGE, 0, float("inf")) + + if not runtime_info.OS_WIN: + self.is_valid(metrics, Metric.TYPE_STATE, Metric.CATEGORY_MEMORY, Metric.NAME_MAX_RSS, 0, float("inf")) - self.is_valid(metrics, Metric.TYPE_STATE, Metric.CATEGORY_MEMORY, Metric.NAME_MAX_RSS, 0, float("inf")) if runtime_info.OS_LINUX: self.is_valid(metrics, Metric.TYPE_STATE, Metric.CATEGORY_MEMORY, Metric.NAME_CURRENT_RSS, 0, float("inf")) self.is_valid(metrics, Metric.TYPE_STATE, Metric.CATEGORY_MEMORY, Metric.NAME_VM_SIZE, 0, float("inf")) diff --git a/tests/runtime_test.py b/tests/runtime_test.py index f3d5903..e97ee11 100644 --- a/tests/runtime_test.py +++ b/tests/runtime_test.py @@ -3,39 +3,42 @@ import os import stackimpact -from stackimpact.runtime import register_signal +from stackimpact.runtime import runtime_info, register_signal class RuntimeTestCase(unittest.TestCase): def test_register_signal(self): - result = {'handler': 0} + if runtime_info.OS_WIN: + return - def _handler(signum, frame): - result['handler'] += 1 + result = {'handler': 0} - register_signal(signal.SIGUSR1, _handler) + def _handler(signum, frame): + result['handler'] += 1 - os.kill(os.getpid(), signal.SIGUSR1) - os.kill(os.getpid(), signal.SIGUSR1) + register_signal(signal.SIGUSR1, _handler) - signal.signal(signal.SIGUSR1, signal.SIG_DFL) + os.kill(os.getpid(), signal.SIGUSR1) + os.kill(os.getpid(), signal.SIGUSR1) - self.assertEqual(result['handler'], 2) + signal.signal(signal.SIGUSR1, signal.SIG_DFL) + + self.assertEqual(result['handler'], 2) '''def test_register_signal_default(self): - result = {'handler': 0} + result = {'handler': 0} - def _handler(signum, frame): - result['handler'] += 1 + def _handler(signum, frame): + result['handler'] += 1 - 
register_signal(signal.SIGUSR1, _handler, once = True) + register_signal(signal.SIGUSR1, _handler, once = True) - os.kill(os.getpid(), signal.SIGUSR1) - os.kill(os.getpid(), signal.SIGUSR1) + os.kill(os.getpid(), signal.SIGUSR1) + os.kill(os.getpid(), signal.SIGUSR1) - self.assertEqual(result['handler'], 1)''' + self.assertEqual(result['handler'], 1)''' if __name__ == '__main__': From 2520dde888e1d148de527e69a71197c874a034ee Mon Sep 17 00:00:00 2001 From: Dmitri Melikyan Date: Sun, 5 Nov 2017 20:56:06 +0000 Subject: [PATCH 15/26] auto_destroy option fix --- setup.py | 2 +- stackimpact/agent.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index 1544370..5785aa3 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ setup( name = 'stackimpact', - version = '1.1.4', + version = '1.1.5', description = 'StackImpact Python Agent', author = 'StackImpact', author_email = 'devops@stackimpact.com', diff --git a/stackimpact/agent.py b/stackimpact/agent.py index b544d79..6a9c38f 100644 --- a/stackimpact/agent.py +++ b/stackimpact/agent.py @@ -26,7 +26,7 @@ class Agent: - AGENT_VERSION = "1.1.4" + AGENT_VERSION = "1.1.5" SAAS_DASHBOARD_ADDRESS = "https://agent-api.stackimpact.com" def __init__(self, **kwargs): @@ -113,7 +113,7 @@ def _signal_handler(signum, frame): if not runtime_info.OS_WIN: register_signal(signal.SIGUSR2, _signal_handler) - if self.get_option('auto_destroy') == False: + if self.get_option('auto_destroy') is None or self.get_option('auto_destroy') is True: # destroy agent on exit def _exit_handler(*arg): if not self.agent_started or self.agent_destroyed: From 60f81748f085cecb3f47729f537c99b2362e02dc Mon Sep 17 00:00:00 2001 From: Dmitri Melikyan Date: Sun, 28 Jan 2018 08:44:36 +0000 Subject: [PATCH 16/26] readme update --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index fed5cf5..aab095e 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ ## Overview 
-StackImpact is a performance profiler for production applications. It gives developers continuous and historical view of application performance with line-of-code precision, which includes CPU, memory allocation and blocking call hot spots as well as execution bottlenecks, errors and runtime metrics. Learn more at [stackimpact.com](https://stackimpact.com/). +StackImpact is a production-grade performance profiler built for both production and development environments. It gives developers continuous and historical view of application performance with line-of-code precision that is essential for locating CPU, memory allocation and blocking call hot spots as well as latency bottlenecks. Included runtime metric and error monitoring complement profiles for extensive performance analysis. Learn more at [stackimpact.com](https://stackimpact.com/). ![dashboard](https://stackimpact.com/wp-content/uploads/2017/06/hotspots-cpu-1.4-python.png) @@ -37,7 +37,7 @@ See full [documentation](https://stackimpact.com/docs/) for reference. #### Create StackImpact account -[Sign up](https://dashboard.stackimpact.com/#/signup) for a free account (also with GitHub login). +[Sign up](https://dashboard.stackimpact.com/#/signup) for a free trial account (also with GitHub login). #### Installing the agent @@ -93,4 +93,4 @@ To enable debug logging, add `debug = True` to startup options. If the debug log ## Overhead -The agent overhead is measured to be less than 1% for applications under high load. For applications that are horizontally scaled to multiple processes, StackImpact agents are only active on a small subset (adjustable) of the processes at any point of time, therefore the total overhead is much lower. +The agent overhead is measured to be less than 1% for applications under high load. 
For applications that are horizontally scaled to multiple processes, StackImpact agents are only active on a small subset of the processes at any point of time, therefore the total overhead is much lower. From 93597682f0205cf86e3650a9bca0b183c0a6c2b0 Mon Sep 17 00:00:00 2001 From: Dmitri Melikyan Date: Sun, 28 Jan 2018 08:53:17 +0000 Subject: [PATCH 17/26] readme update --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index aab095e..4d7e87a 100644 --- a/README.md +++ b/README.md @@ -37,7 +37,7 @@ See full [documentation](https://stackimpact.com/docs/) for reference. #### Create StackImpact account -[Sign up](https://dashboard.stackimpact.com/#/signup) for a free trial account (also with GitHub login). +Sign up for a free trial account at [stackimpact.com](https://stackimpact.com) (also with GitHub login). #### Installing the agent From 1f944af11fa5c0db05987626b9f4cc4be038f50a Mon Sep 17 00:00:00 2001 From: Dmitri Melikyan Date: Fri, 23 Feb 2018 20:47:01 +0000 Subject: [PATCH 18/26] readme updates --- README.md | 2 +- README.rst | 24 +++++++++++++----------- 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index 4d7e87a..550f378 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ StackImpact is a production-grade performance profiler built for both production and development environments. It gives developers continuous and historical view of application performance with line-of-code precision that is essential for locating CPU, memory allocation and blocking call hot spots as well as latency bottlenecks. Included runtime metric and error monitoring complement profiles for extensive performance analysis. Learn more at [stackimpact.com](https://stackimpact.com/). 
-![dashboard](https://stackimpact.com/wp-content/uploads/2017/06/hotspots-cpu-1.4-python.png) +![dashboard](https://stackimpact.com/img/readme/hotspots-cpu-1.4-python.png) #### Features diff --git a/README.rst b/README.rst index df585f8..79ae98e 100644 --- a/README.rst +++ b/README.rst @@ -4,14 +4,16 @@ StackImpact Python Agent Overview -------- -StackImpact is a performance profiler for production applications. It -gives developers continuous and historical view of application -performance with line-of-code precision, which includes CPU, memory -allocation and blocking call hot spots as well as execution bottlenecks, -errors and runtime metrics. Learn more at +StackImpact is a production-grade performance profiler built for both +production and development environments. It gives developers continuous +and historical view of application performance with line-of-code +precision that is essential for locating CPU, memory allocation and +blocking call hot spots as well as latency bottlenecks. Included runtime +metric and error monitoring complement profiles for extensive +performance analysis. Learn more at `stackimpact.com `__. -.. figure:: https://stackimpact.com/wp-content/uploads/2017/06/hotspots-cpu-1.4-python.png +.. figure:: https://stackimpact.com/img/readme/hotspots-cpu-1.4-python.png :alt: dashboard dashboard @@ -54,8 +56,8 @@ Getting started Create StackImpact account ^^^^^^^^^^^^^^^^^^^^^^^^^^ -`Sign up `__ for a free -account (also with GitHub login). +Sign up for a free trial account at +`stackimpact.com `__ (also with GitHub login). Installing the agent ^^^^^^^^^^^^^^^^^^^^ @@ -131,6 +133,6 @@ Overhead The agent overhead is measured to be less than 1% for applications under high load. For applications that are horizontally scaled to multiple -processes, StackImpact agents are only active on a small subset -(adjustable) of the processes at any point of time, therefore the total -overhead is much lower. 
+processes, StackImpact agents are only active on a small subset of the +processes at any point of time, therefore the total overhead is much +lower. From 394db3d6950b80402185ee17508ccbda83c87ac7 Mon Sep 17 00:00:00 2001 From: Dmitri Melikyan Date: Sun, 1 Apr 2018 17:09:38 +0000 Subject: [PATCH 19/26] update readme --- README.md | 19 ++++++++++++------- README.rst | 36 +++++++++++++++++++++++------------- 2 files changed, 35 insertions(+), 20 deletions(-) diff --git a/README.md b/README.md index 550f378..49f0628 100644 --- a/README.md +++ b/README.md @@ -1,22 +1,27 @@ -# StackImpact Python Agent +# StackImpact Python Profiler ## Overview -StackImpact is a production-grade performance profiler built for both production and development environments. It gives developers continuous and historical view of application performance with line-of-code precision that is essential for locating CPU, memory allocation and blocking call hot spots as well as latency bottlenecks. Included runtime metric and error monitoring complement profiles for extensive performance analysis. Learn more at [stackimpact.com](https://stackimpact.com/). +StackImpact is a production-grade performance profiler built for both production and development environments. It gives developers continuous and historical code-level view of application performance that is essential for locating CPU, memory allocation and I/O hot spots as well as latency bottlenecks. Included runtime metrics and error monitoring complement profiles for extensive performance analysis. Learn more at [stackimpact.com](https://stackimpact.com/). 
![dashboard](https://stackimpact.com/img/readme/hotspots-cpu-1.4-python.png) #### Features -* Continuous hot spot profiling for CPU, memory allocations, blocking calls -* Error monitoring -* Health monitoring including CPU, memory, garbage collection and other runtime metrics -* Anomaly detection -* Multiple account users for team collaboration +* Continuous hot spot profiling for CPU usage, memory allocation, blocking calls. +* Error and exception monitoring. +* Health monitoring including CPU, memory, garbage collection and other runtime metrics. +* Alerts on profile anomalies. +* Team access. Learn more on the [features](https://stackimpact.com/features/) page (with screenshots). +#### How it works + +The StackImpact profiler agent is imported into a program and used as a normal package. When the program runs, various sampling profilers are started and stopped automatically by the agent and/or programmatically using the agent methods. The agent periodically reports recorded profiles and metrics to the StackImpact Dashboard. If an application has multiple processes, also referred to as workers, instances or nodes, only one or two processes will have active agents at any point of time. + + #### Documentation See full [documentation](https://stackimpact.com/docs/) for reference. diff --git a/README.rst b/README.rst index 79ae98e..6f2d092 100644 --- a/README.rst +++ b/README.rst @@ -1,16 +1,15 @@ -StackImpact Python Agent -======================== +StackImpact Python Profiler +=========================== Overview -------- StackImpact is a production-grade performance profiler built for both production and development environments. It gives developers continuous -and historical view of application performance with line-of-code -precision that is essential for locating CPU, memory allocation and -blocking call hot spots as well as latency bottlenecks. Included runtime -metric and error monitoring complement profiles for extensive -performance analysis. 
Learn more at +and historical code-level view of application performance that is +essential for locating CPU, memory allocation and I/O hot spots as well +as latency bottlenecks. Included runtime metric and error monitoring +complements profiles for extensive performance analysis. Learn more at `stackimpact.com `__. .. figure:: https://stackimpact.com/img/readme/hotspots-cpu-1.4-python.png @@ -21,17 +20,28 @@ performance analysis. Learn more at Features ^^^^^^^^ -- Continuous hot spot profiling for CPU, memory allocations, blocking - calls -- Error monitoring +- Continuous hot spot profiling for CPU usage, memory allocation, + blocking calls. +- Error and exception monitoring. - Health monitoring including CPU, memory, garbage collection and other - runtime metrics -- Anomaly detection -- Multiple account users for team collaboration + runtime metrics. +- Alerts on profile anomalies. +- Team access. Learn more on the `features `__ page (with screenshots). +How it works +^^^^^^^^^^^^ + +The StackImpact profiler agent is imported into a program and used as a +normal package. When the program runs, various sampling profilers are +started and stopped automatically by the agent and/or programmatically +using the agent methods. The agent periodically reports recorded +profiles and metrics to the StackImpact Dashboard. If an application has +multiple processes, also referred to as workers, instances or nodes, +only one or two processes will have active agents at any point of time. 
+ Documentation ^^^^^^^^^^^^^ From 48535f22bf641f8ab0a488b38a07c5d965d06622 Mon Sep 17 00:00:00 2001 From: Dmitri Melikyan Date: Thu, 19 Apr 2018 14:21:25 +0000 Subject: [PATCH 20/26] add programmatic and manual profiling --- README.md | 57 +++++ README.rst | 73 ++++++- examples/aws-lambda/aws_lambda.py | 40 ++++ examples/{ => demo}/flask_app.py | 0 examples/manual/app.py | 45 ++++ examples/programmatic/app.py | 38 ++++ setup.py | 9 +- stackimpact/agent.py | 203 ++++++++++++++++-- stackimpact/config_loader.py | 20 +- stackimpact/message_queue.py | 32 ++- stackimpact/metric.py | 3 +- stackimpact/profiler_scheduler.py | 62 ------ stackimpact/profilers/__init__.py | 0 stackimpact/profilers/allocation_profiler.py | 123 +++++++++++ .../block_profiler.py} | 92 +++----- .../cpu_profiler.py} | 109 +++------- stackimpact/reporters/allocation_reporter.py | 130 ----------- stackimpact/reporters/error_reporter.py | 55 +++-- stackimpact/reporters/process_reporter.py | 15 ++ stackimpact/reporters/profile_reporter.py | 187 ++++++++++++++++ stackimpact/reporters/span_reporter.py | 82 +++++++ tests/agent_test.py | 152 ++++++++++++- tests/message_queue_test.py | 41 +--- tests/profiler_scheduler_test.py | 47 ---- tests/profilers/__init__.py | 0 .../allocation_profiler_test.py} | 15 +- .../block_profiler_test.py} | 19 +- .../cpu_profiler_test.py} | 16 +- tests/reporters/profile_reporter_test.py | 0 tests/reporters/span_reporter_test.py | 41 ++++ 30 files changed, 1213 insertions(+), 493 deletions(-) create mode 100644 examples/aws-lambda/aws_lambda.py rename examples/{ => demo}/flask_app.py (100%) create mode 100644 examples/manual/app.py create mode 100644 examples/programmatic/app.py delete mode 100644 stackimpact/profiler_scheduler.py create mode 100644 stackimpact/profilers/__init__.py create mode 100644 stackimpact/profilers/allocation_profiler.py rename stackimpact/{reporters/block_reporter.py => profilers/block_profiler.py} (66%) rename stackimpact/{reporters/cpu_reporter.py => 
profilers/cpu_profiler.py} (55%) delete mode 100644 stackimpact/reporters/allocation_reporter.py create mode 100644 stackimpact/reporters/profile_reporter.py create mode 100644 stackimpact/reporters/span_reporter.py delete mode 100644 tests/profiler_scheduler_test.py create mode 100644 tests/profilers/__init__.py rename tests/{reporters/allocation_reporter_test.py => profilers/allocation_profiler_test.py} (70%) rename tests/{reporters/block_reporter_test.py => profilers/block_profiler_test.py} (79%) rename tests/{reporters/cpu_reporter_test.py => profilers/cpu_profiler_test.py} (66%) create mode 100644 tests/reporters/profile_reporter_test.py create mode 100644 tests/reporters/span_reporter_test.py diff --git a/README.md b/README.md index 49f0628..a9f521b 100644 --- a/README.md +++ b/README.md @@ -79,12 +79,69 @@ All initialization options: * `app_version` (Optional) Sets application version, which can be used to associate profiling information with the source code release. * `app_environment` (Optional) Used to differentiate applications in different environments. * `host_name` (Optional) By default, host name will be the OS hostname. +* `auto_profiling` (Optional) If set to `False`, disables automatic profiling and reporting. Programmatic or manual profiling should be used instead. Useful for environments without support for timers or background tasks. * `debug` (Optional) Enables debug logging. * `cpu_profiler_disabled`, `allocation_profiler_disabled`, `block_profiler_disabled`, `error_profiler_disabled` (Optional) Disables respective profiler when `True`. * `include_agent_frames` (Optional) Set to `True` to not exclude agent stack frames from profile call graphs. * `auto_destroy` (Optional) Set to `False` to disable agent's exit handlers. If necessary, call `destroy()` to gracefully shutdown the agent. +#### Programmatic profiling + +Use `agent.profile(name)` to instruct the agent when to start and stop profiling. 
The agent decides if and which profiler is activated. Normally, this method should be used in repeating code, such as request or event handlers. In addition to more precise profiling, timing information will also be reported for the profiled spans. Usage example: + +```python +span = agent.profile('span1'); + +# your code here + +span.stop(); +``` + +Alternatively, a `with` statement can be used: + +```python +with agent.profile('span1'): + # your code here +``` + + +#### Manual profiling + +*Manual profiling should not be used in production!* + +By default, the agent starts and stops profiling automatically. Manual profiling allows starting and stopping profilers directly. It is suitable for profiling short-lived programs and should not be used for long-running production applications. Automatic profiling should be disabled with `auto_profiling: False`. + +```python +# Start CPU profiler. +agent.start_cpu_profiler(); +``` + +```python +# Stop CPU profiler and report the recorded profile to the Dashboard. +agent.stop_cpu_profiler(); +``` + +```python +# Start blocking call profiler. +agent.start_block_profiler(); +``` + +```python +# Stop blocking call profiler and report the recorded profile to the Dashboard. +agent.stop_block_profiler(); +``` + +```python +# Start heap allocation profiler. +agent.start_allocation_profiler(); +``` + +```python +# Stop heap allocation profiler and report the recorded profile to the Dashboard. +agent.stop_allocation_profiler(); +``` + #### Analyzing performance data in the Dashboard diff --git a/README.rst b/README.rst index 6f2d092..f94df70 100644 --- a/README.rst +++ b/README.rst @@ -8,8 +8,8 @@ StackImpact is a production-grade performance profiler built for both production and development environments. It gives developers continuous and historical code-level view of application performance that is essential for locating CPU, memory allocation and I/O hot spots as well -as latency bottlenecks. 
Included runtime metric and error monitoring -complements profiles for extensive performance analysis. Learn more at +as latency bottlenecks. Included runtime metrics and error monitoring +complement profiles for extensive performance analysis. Learn more at `stackimpact.com `__. .. figure:: https://stackimpact.com/img/readme/hotspots-cpu-1.4-python.png @@ -113,6 +113,10 @@ All initialization options: different environments. - ``host_name`` (Optional) By default, host name will be the OS hostname. +- ``auto_profiling`` (Optional) If set to ``False``, disables automatic + profiling and reporting. Programmatic or manual profiling should be + used instead. Useful for environments without support for timers or + background tasks. - ``debug`` (Optional) Enables debug logging. - ``cpu_profiler_disabled``, ``allocation_profiler_disabled``, ``block_profiler_disabled``, ``error_profiler_disabled`` (Optional) @@ -123,6 +127,71 @@ All initialization options: handlers. If necessary, call ``destroy()`` to gracefully shutdown the agent. +Programmatic profiling +^^^^^^^^^^^^^^^^^^^^^^ + +Use ``agent.profile(name)`` to instruct the agent when to start and stop +profiling. The agent decides if and which profiler is activated. +Normally, this method should be used in repeating code, such as request +or event handlers. In addition to more precise profiling, timing +information will also be reported for the profiled spans. Usage example: + +.. code:: python + + span = agent.profile('span1'); + + # your code here + + span.stop(); + +Alternatively, a ``with`` statement can be used: + +.. code:: python + + with agent.profile('span1'): + # your code here + +Manual profiling +^^^^^^^^^^^^^^^^ + +*Manual profiling should not be used in production!* + +By default, the agent starts and stops profiling automatically. Manual +profiling allows starting and stopping profilers directly. 
It is suitable +for profiling short-lived programs and should not be used for +long-running production applications. Automatic profiling should be +disabled with ``auto_profiling: False``. + +.. code:: python + + # Start CPU profiler. + agent.start_cpu_profiler(); + +.. code:: python + + # Stop CPU profiler and report the recorded profile to the Dashboard. + agent.stop_cpu_profiler(); + +.. code:: python + + # Start blocking call profiler. + agent.start_block_profiler(); + +.. code:: python + + # Stop blocking call profiler and report the recorded profile to the Dashboard. + agent.stop_block_profiler(); + +.. code:: python + + # Start heap allocation profiler. + agent.start_allocation_profiler(); + +.. code:: python + + # Stop heap allocation profiler and report the recorded profile to the Dashboard. + agent.stop_allocation_profiler(); + Analyzing performance data in the Dashboard ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/examples/aws-lambda/aws_lambda.py b/examples/aws-lambda/aws_lambda.py new file mode 100644 index 0000000..4fda51b --- /dev/null +++ b/examples/aws-lambda/aws_lambda.py @@ -0,0 +1,40 @@ +from __future__ import print_function +import stackimpact +import random +import threading +import time +import signal + +agent = stackimpact.start( + agent_key = 'agent key here', + app_name = 'LambdaDemoPython', + app_environment = 'prod', + block_profiler_disabled = True) + + +def simulate_cpu_work(): + for j in range(0, 100000): + random.randint(1, 1000000) + +mem = [] +def simulate_mem_leak(): + for i in range(0, 1000): + obj = {'v': random.randint(0, 1000000)} + mem.append(obj) + +def handler(event, context): + span = agent.profile() + + simulate_cpu_work() + simulate_mem_leak() + + span.stop() + + response = { + "statusCode": 200, + "body": 'Done' + } + + return response + + diff --git a/examples/flask_app.py b/examples/demo/flask_app.py similarity index 100% rename from examples/flask_app.py rename to examples/demo/flask_app.py diff --git 
a/examples/manual/app.py b/examples/manual/app.py new file mode 100644 index 0000000..af34096 --- /dev/null +++ b/examples/manual/app.py @@ -0,0 +1,45 @@ +from __future__ import print_function +import random +import time +import sys +import threading +sys.path.append(".") +import stackimpact + +agent = stackimpact.start( + agent_key = 'agent key here', + app_name = 'MyPythonApp', + auto_profiling = False) + + +agent.start_cpu_profiler() + +for j in range(0, 1000000): + random.randint(1, 1000000) + +agent.stop_cpu_profiler() + + +''' +agent.start_allocation_profiler() + +mem1 = [] +for i in range(0, 1000): + obj1 = {'v': random.randint(0, 1000000)} + mem1.append(obj1) + +agent.stop_allocation_profiler() +''' + + +''' +agent.start_block_profiler() + +def blocking_call(): + time.sleep(0.1) + +for i in range(5): + blocking_call() + +agent.stop_block_profiler() +''' \ No newline at end of file diff --git a/examples/programmatic/app.py b/examples/programmatic/app.py new file mode 100644 index 0000000..b8566e9 --- /dev/null +++ b/examples/programmatic/app.py @@ -0,0 +1,38 @@ +from __future__ import print_function +import random +import time +import sys +import threading +sys.path.append(".") +import stackimpact + +agent = stackimpact.start( + agent_key = 'agent key here', + app_name = 'MyPythonApp') + + +def simulate_cpu_work(): + for j in range(0, 100000): + random.randint(1, 1000000) + + +def handle_some_event(): + span = agent.profile('some event') + + simulate_cpu_work() + + span.stop() + + response = { + "statusCode": 200, + "body": 'Done' + } + + return response + + +# Simulate events +while True: + handle_some_event() + time.sleep(2) + diff --git a/setup.py b/setup.py index 5785aa3..f58712d 100644 --- a/setup.py +++ b/setup.py @@ -1,9 +1,14 @@ +import os from setuptools import setup, find_packages +def read(fname): + return open(os.path.join(os.path.dirname(__file__), fname)).read() + setup( name = 'stackimpact', - version = '1.1.5', - description = 'StackImpact 
Python Agent', + version = '1.2.2', + description = 'StackImpact Python Profiler', + long_description = read('README.rst'), author = 'StackImpact', author_email = 'devops@stackimpact.com', url = 'https://stackimpact.com', diff --git a/stackimpact/agent.py b/stackimpact/agent.py index 6a9c38f..e71211f 100644 --- a/stackimpact/agent.py +++ b/stackimpact/agent.py @@ -10,6 +10,8 @@ import signal import atexit import platform +import random +import math from .runtime import min_version, runtime_info, register_signal from .utils import timestamp, generate_uuid @@ -18,22 +20,45 @@ from .message_queue import MessageQueue from .frame_cache import FrameCache from .reporters.process_reporter import ProcessReporter -from .reporters.cpu_reporter import CPUReporter -from .reporters.allocation_reporter import AllocationReporter -from .reporters.block_reporter import BlockReporter +from .reporters.profile_reporter import ProfileReporter, ProfilerConfig from .reporters.error_reporter import ErrorReporter +from .reporters.span_reporter import SpanReporter +from .profilers.cpu_profiler import CPUProfiler +from .profilers.allocation_profiler import AllocationProfiler +from .profilers.block_profiler import BlockProfiler + + +class Span: + + def __init__(self, stop_func = None): + if stop_func: + self.stop_func = stop_func + + + def stop(self): + if self.stop_func: + self.stop_func() + + + def __enter__(self): + pass + + + def __exit__(self, exc_type, exc_value, traceback): + self.stop() class Agent: - AGENT_VERSION = "1.1.5" + AGENT_VERSION = "1.2.2" SAAS_DASHBOARD_ADDRESS = "https://agent-api.stackimpact.com" def __init__(self, **kwargs): self.agent_started = False self.agent_destroyed = False - self.profiler_lock = threading.Lock() + self.profiler_active = False + self.span_active = False self.main_thread_func = None @@ -44,10 +69,36 @@ def __init__(self, **kwargs): self.message_queue = MessageQueue(self) self.frame_cache = FrameCache(self) self.process_reporter = 
ProcessReporter(self) - self.cpu_reporter = CPUReporter(self) - self.allocation_reporter = AllocationReporter(self) - self.block_reporter = BlockReporter(self) self.error_reporter = ErrorReporter(self) + self.span_reporter = SpanReporter(self) + + config = ProfilerConfig() + config.log_prefix = 'CPU profiler' + config.max_profile_duration = 20 + config.max_span_duration = 5 + config.max_span_count = 30 + config.span_interval = 20 + config.report_interval = 120 + self.cpu_reporter = ProfileReporter(self, CPUProfiler(self), config) + + config = ProfilerConfig() + config.log_prefix = 'Allocation profiler' + config.max_profile_duration = 20 + config.max_span_duration = 5 + config.max_span_count = 30 + config.span_interval = 20 + config.report_interval = 120 + self.allocation_reporter = ProfileReporter(self, AllocationProfiler(self), config) + + config = ProfilerConfig() + config.log_prefix = 'Block profiler' + config.max_profile_duration = 20 + config.max_span_duration = 5 + config.max_span_count = 30 + config.span_interval = 20 + config.report_interval = 120 + self.block_reporter = ProfileReporter(self, BlockProfiler(self), config) + self.options = None @@ -75,6 +126,9 @@ def start(self, **kwargs): self.options = kwargs + if 'auto_profiling' not in self.options: + self.options['auto_profiling'] = True + if 'dashboard_address' not in self.options: self.options['dashboard_address'] = self.SAAS_DASHBOARD_ADDRESS @@ -87,7 +141,6 @@ def start(self, **kwargs): if 'host_name' not in self.options: self.options['host_name'] = socket.gethostname() - self.run_id = generate_uuid() self.run_ts = timestamp() @@ -96,7 +149,11 @@ def start(self, **kwargs): self.frame_cache.start() self.cpu_reporter.setup() + self.allocation_reporter.setup() self.block_reporter.setup() + self.span_reporter.setup() + self.error_reporter.setup() + self.process_reporter.setup() # execute main_thread_func in main thread on signal def _signal_handler(signum, frame): @@ -139,6 +196,121 @@ def 
_exit_handler(*arg): self.log('Agent started') + def enable(self): + if not self.config.is_agent_enabled(): + self.cpu_reporter.start() + self.allocation_reporter.start() + self.block_reporter.start() + self.span_reporter.start() + self.error_reporter.start() + self.process_reporter.start() + self.config.set_agent_enabled(True) + + + def disable(self): + if self.config.is_agent_enabled(): + self.cpu_reporter.stop() + self.allocation_reporter.stop() + self.block_reporter.stop() + self.span_reporter.stop() + self.error_reporter.stop() + self.process_reporter.stop() + self.config.set_agent_enabled(False) + + + def profile(self, name = 'Default'): + if not self.agent_started or self.span_active: + return Span(None) + + self.span_active = True + + selected_reporter = None + active_reporters = [] + if self.cpu_reporter.started: + active_reporters.append(self.cpu_reporter) + if self.allocation_reporter.started: + active_reporters.append(self.allocation_reporter) + if self.block_reporter.started: + active_reporters.append(self.block_reporter) + + if len(active_reporters) > 0: + selected_reporter = active_reporters[int(math.floor(random.random() * len(active_reporters)))] + if not selected_reporter.start_profiling(True, True): + selected_reporter = None + + start_timestamp = timestamp() + + def stop_func(): + if selected_reporter: + selected_reporter.stop_profiling() + + duration = timestamp() - start_timestamp + self.span_reporter.record_span(name, duration) + + self.span_active = False + + return Span(stop_func) + + + def _start_profiler(self, reporter): + if not self.agent_started or self.get_option('auto_profiling'): + return + + self.span_active = True + + reporter.start() + reporter.start_profiling(True, False) + + + def _stop_profiler(self, reporter): + if not self.agent_started or self.get_option('auto_profiling'): + return + + reporter.stop_profiling() + reporter.report(False) + reporter.stop() + self.message_queue.flush(False) + + self.span_active = False + + + 
def start_cpu_profiler(self): + self._start_profiler(self.cpu_reporter) + + + def stop_cpu_profiler(self): + self._stop_profiler(self.cpu_reporter) + + + def start_allocation_profiler(self): + self._start_profiler(self.allocation_reporter) + + + def stop_allocation_profiler(self): + self._stop_profiler(self.allocation_reporter) + + + def start_block_profiler(self): + self._start_profiler(self.block_reporter) + + + def stop_block_profiler(self): + self._stop_profiler(self.block_reporter) + + + def report(self): + if not self.agent_started or self.get_option('auto_profiling'): + return + + self.configLoader.load(True) + + self.cpuReporter.report(True); + self.allocationReporter.report(True); + self.asyncReporter.report(True); + + self.messageQueue.flush(True) + + def destroy(self): if not self.agent_started: self.log('Agent has not been started') @@ -154,10 +326,15 @@ def destroy(self): self.allocation_reporter.stop() self.block_reporter.stop() self.error_reporter.stop() + self.span_reporter.stop() self.process_reporter.stop() self.cpu_reporter.destroy() + self.allocation_reporter.destroy() self.block_reporter.destroy() + self.error_reporter.destroy() + self.span_reporter.destroy() + self.process_reporter.destroy() self.agent_destroyed = True self.log('Agent destroyed') @@ -186,10 +363,10 @@ def exception(self): traceback.print_exc() - def delay(self, timeout, func): + def delay(self, timeout, func, *args): def func_wrapper(): try: - func() + func(*args) except Exception: self.exception() @@ -199,14 +376,14 @@ def func_wrapper(): return t - def schedule(self, timeout, interval, func): + def schedule(self, timeout, interval, func, *args): tw = TimerWraper() def func_wrapper(): start = time.time() try: - func() + func(*args) except Exception: self.exception() diff --git a/stackimpact/config_loader.py b/stackimpact/config_loader.py index c0752e5..1879c1f 100644 --- a/stackimpact/config_loader.py +++ b/stackimpact/config_loader.py @@ -1,14 +1,21 @@ from .api_request 
import APIRequest +from .utils import timestamp class ConfigLoader: + LOAD_DELAY = 2 + LOAD_INTERVAL = 120 + + def __init__(self, agent): self.agent = agent self.load_timer = None + self.last_load_ts = 0 def start(self): - self.load_timer = self.agent.schedule(2, 120, self.load) + if self.agent.get_option('auto_profiling'): + self.load_timer = self.agent.schedule(self.LOAD_DELAY, self.LOAD_INTERVAL, self.load) def stop(self): @@ -17,7 +24,14 @@ def stop(self): self.load_timer = None - def load(self): + def load(self, with_interval = False): + now = timestamp() + if with_interval and self.last_load_ts > now - self.LOAD_INTERVAL: + return + + self.last_load_ts = now; + + try: api_request = APIRequest(self.agent) config = api_request.post('config', {}) @@ -46,10 +60,12 @@ def load(self): if self.agent.config.is_agent_enabled(): self.agent.error_reporter.start() + self.agent.span_reporter.start() self.agent.process_reporter.start() self.agent.log('Agent activated') else: self.agent.error_reporter.stop() + self.agent.span_reporter.stop() self.agent.process_reporter.stop() self.agent.log('Agent deactivated') diff --git a/stackimpact/message_queue.py b/stackimpact/message_queue.py index ce904dd..d9ed49b 100644 --- a/stackimpact/message_queue.py +++ b/stackimpact/message_queue.py @@ -7,32 +7,29 @@ class MessageQueue: + FLUSH_INTERVAL = 5; MESSAGE_TTL = 10 * 60 + def __init__(self, agent): self.agent = agent self.queue = [] self.queue_lock = threading.Lock() self.flush_timer = None - self.expire_timer = None self.backoff_seconds = 0 self.last_flush_ts = 0 def start(self): - self.expire_timer = self.agent.schedule(60, 60, self.expire) - self.flush_timer = self.agent.schedule(5, 5, self.flush) + if self.agent.get_option('auto_profiling'): + self.flush_timer = self.agent.schedule(self.FLUSH_INTERVAL, self.FLUSH_INTERVAL, self.flush) def stop(self): if self.flush_timer: self.flush_timer.cancel() self.flush_timer = None - - if self.expire_timer: - self.expire_timer.cancel() - 
self.expire_timer = None - + def add(self, topic, message): m = { @@ -48,27 +45,22 @@ def add(self, topic, message): self.agent.log(message) - def expire(self): + def flush(self, with_interval = False): if len(self.queue) == 0: return now = timestamp() - - # expire and copy messages - with self.queue_lock: - self.queue = [m for m in self.queue if m['added_at'] >= now - self.MESSAGE_TTL] - - - def flush(self): - if len(self.queue) == 0: + if with_interval and self.last_flush_ts > now - self.FLUSH_INTERVAL: return - now = timestamp() - # flush only if backoff time is elapsed if self.last_flush_ts + self.backoff_seconds > now: return + # expire old messages + with self.queue_lock: + self.queue = [m for m in self.queue if m['added_at'] >= now - self.MESSAGE_TTL] + # read queue outgoing = None with self.queue_lock: @@ -84,7 +76,7 @@ def flush(self): 'messages': outgoing_copy } - self.last_flush_ts = timestamp() + self.last_flush_ts = now try: api_request = APIRequest(self.agent) diff --git a/stackimpact/metric.py b/stackimpact/metric.py index d7e5f7b..8587615 100644 --- a/stackimpact/metric.py +++ b/stackimpact/metric.py @@ -17,6 +17,7 @@ class Metric: CATEGORY_MEMORY = 'memory' CATEGORY_GC = 'gc' CATEGORY_RUNTIME = 'runtime' + CATEGORY_SPAN = 'span' CATEGORY_CPU_PROFILE = 'cpu-profile' CATEGORY_MEMORY_PROFILE = 'memory-profile' CATEGORY_BLOCK_PROFILE = 'block-profile' @@ -46,7 +47,7 @@ class Metric: UNIT_PERCENT = 'percent' TRIGGER_TIMER = 'timer' - TRIGGER_ANOMALY = 'anomaly' + TRIGGER_API = 'api' def __init__(self, agent, typ, category, name, unit): diff --git a/stackimpact/profiler_scheduler.py b/stackimpact/profiler_scheduler.py deleted file mode 100644 index 0b32984..0000000 --- a/stackimpact/profiler_scheduler.py +++ /dev/null @@ -1,62 +0,0 @@ -from __future__ import division - -import random -import math - -from .utils import timestamp, base64_encode -from .metric import Metric - -class ProfilerScheduler: - - def __init__(self, agent, record_interval, 
record_duration, report_interval, record_func, report_func): - self.agent = agent - self.record_interval = record_interval - self.record_duration = record_duration - self.report_interval = report_interval - self.record_func = record_func - self.report_func = report_func - self.random_timer = None - self.record_timer = None - self.report_timer = None - - - def start(self): - if self.record_func: - def random_delay(): - timeout = random.randint(0, round(self.record_interval - self.record_duration)) - self.random_timer = self.agent.delay(timeout, self.execute_record) - - self.record_timer = self.agent.schedule(self.record_interval, self.record_interval, random_delay) - - self.report_timer = self.agent.schedule(self.report_interval, self.report_interval, self.execute_report) - - - def stop(self): - if self.random_timer: - self.random_timer.cancel() - self.random_timer = None - - if self.record_timer: - self.record_timer.cancel() - self.record_timer = None - - if self.report_timer: - self.report_timer.cancel() - self.report_timer = None - - - def execute_record(self): - with self.agent.profiler_lock: - try: - self.record_func(self.record_duration) - except Exception: - self.agent.exception() - - - def execute_report(self): - with self.agent.profiler_lock: - try: - self.report_func() - except Exception: - self.agent.exception() - diff --git a/stackimpact/profilers/__init__.py b/stackimpact/profilers/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/stackimpact/profilers/allocation_profiler.py b/stackimpact/profilers/allocation_profiler.py new file mode 100644 index 0000000..56448cd --- /dev/null +++ b/stackimpact/profilers/allocation_profiler.py @@ -0,0 +1,123 @@ +from __future__ import division + +import sys +import time +import re +import threading + +from ..runtime import min_version, runtime_info, read_vm_size +from ..utils import timestamp +from ..metric import Metric +from ..metric import Breakdown + +if min_version(3, 4): + import tracemalloc + 
+ +class AllocationProfiler: + MAX_TRACEBACK_SIZE = 25 # number of frames + MAX_MEMORY_OVERHEAD = 10 * 1e6 # 10MB + MAX_PROFILED_ALLOCATIONS = 25 + + + def __init__(self, agent): + self.agent = agent + self.ready = False + self.profile = None + self.profile_lock = threading.Lock() + self.overhead_monitor = None + self.start_ts = None + + + def setup(self): + if self.agent.get_option('allocation_profiler_disabled'): + return + + if not runtime_info.OS_LINUX and not runtime_info.OS_DARWIN: + self.agent.log('CPU profiler is only supported on Linux and OS X.') + return + + if not min_version(3, 4): + self.agent.log('Memory allocation profiling is available for Python 3.4 or higher') + return + + self.ready = True + + + def reset(self): + self.profile = Breakdown('root') + + + def start_profiler(self): + self.agent.log('Activating memory allocation profiler.') + + def start(): + tracemalloc.start(self.MAX_TRACEBACK_SIZE) + self.agent.run_in_main_thread(start) + + self.start_ts = time.time() + + def monitor_overhead(): + if tracemalloc.is_tracing() and tracemalloc.get_tracemalloc_memory() > self.MAX_MEMORY_OVERHEAD: + self.agent.log('Allocation profiler memory overhead limit exceeded: {0} bytes'.format(tracemalloc.get_tracemalloc_memory())) + self.stop_profiler() + + self.overhead_monitor = self.agent.schedule(0.5, 0.5, monitor_overhead) + + + def stop_profiler(self): + self.agent.log('Deactivating memory allocation profiler.') + + with self.profile_lock: + if self.overhead_monitor: + self.overhead_monitor.cancel() + self.overhead_monitor = None + + if tracemalloc.is_tracing(): + snapshot = tracemalloc.take_snapshot() + self.agent.log('Allocation profiler memory overhead {0} bytes'.format(tracemalloc.get_tracemalloc_memory())) + tracemalloc.stop() + self.process_snapshot(snapshot, time.time() - self.start_ts) + + + def build_profile(self, duration): + with self.profile_lock: + self.profile.normalize(duration) + self.profile.propagate() + self.profile.floor() + 
self.profile.filter(2, 1000, float("inf")) + + return [{ + 'category': Metric.CATEGORY_MEMORY_PROFILE, + 'name': Metric.NAME_UNCOLLECTED_ALLOCATIONS, + 'unit': Metric.UNIT_BYTE, + 'unit_interval': None, + 'profile': self.profile + }] + + + def destroy(self): + pass + + + def process_snapshot(self, snapshot, duration): + stats = snapshot.statistics('traceback') + + for stat in stats[:self.MAX_PROFILED_ALLOCATIONS]: + if stat.traceback: + skip_stack = False + for frame in stat.traceback: + if self.agent.frame_cache.is_agent_frame(frame.filename): + skip_stack = True + break + if skip_stack: + continue + + current_node = self.profile + for frame in reversed(stat.traceback): + if frame.filename == '': + continue + + frame_name = '{0}:{1}'.format(frame.filename, frame.lineno) + current_node = current_node.find_or_add_child(frame_name) + current_node.increment(stat.size, stat.count) diff --git a/stackimpact/reporters/block_reporter.py b/stackimpact/profilers/block_profiler.py similarity index 66% rename from stackimpact/reporters/block_reporter.py rename to stackimpact/profilers/block_profiler.py index 4ef013d..d876001 100644 --- a/stackimpact/reporters/block_reporter.py +++ b/stackimpact/profilers/block_profiler.py @@ -8,7 +8,6 @@ import signal from ..runtime import min_version, runtime_info -from ..profiler_scheduler import ProfilerScheduler from ..metric import Metric from ..metric import Breakdown from ..frame import Frame @@ -17,29 +16,26 @@ import gevent -class BlockReporter: +class BlockProfiler: SAMPLING_RATE = 0.05 MAX_TRACEBACK_SIZE = 25 # number of frames def __init__(self, agent): self.agent = agent - self.setup_done = False - self.started = False - self.profiler_scheduler = None - self.profile_lock = threading.Lock() + self.ready = False self.profile = None - self.profile_duration = 0 + self.profile_lock = threading.Lock() self.prev_signal_handler = None - self.handler_active = False + self.sampler_active = False def setup(self): if 
self.agent.get_option('block_profiler_disabled'): return - if runtime_info.OS_WIN: - self.agent.log('Block profiler is not available on Windows.') + if not runtime_info.OS_LINUX and not runtime_info.OS_DARWIN: + self.agent.log('CPU profiler is only supported on Linux and OS X.') return sample_time = self.SAMPLING_RATE * 1000 @@ -51,9 +47,9 @@ def setup(self): main_thread_id = threading.current_thread().ident def _sample(signum, signal_frame): - if self.handler_active: + if self.sampler_active: return - self.handler_active = True + self.sampler_active = True with self.profile_lock: try: @@ -62,60 +58,50 @@ def _sample(signum, signal_frame): except Exception: self.agent.exception() - self.handler_active = False + self.sampler_active = False self.prev_signal_handler = signal.signal(signal.SIGALRM, _sample) - self.setup_done = True - - - def start(self): - if not self.setup_done: - return - - if self.started: - return - self.started = True - - self.reset() - - self.profiler_scheduler = ProfilerScheduler(self.agent, 10, 2, 120, self.record, self.report) - self.profiler_scheduler.start() - - - def stop(self): - if not self.started: - return; - self.started = False - - self.profiler_scheduler.stop() + self.ready = True def destroy(self): - if not self.setup_done: + if not self.ready: return - signal.setitimer(signal.ITIMER_REAL, 0) signal.signal(signal.SIGALRM, self.prev_signal_handler) def reset(self): self.profile = Breakdown('root') - self.profile_duration = 0 - def record(self, duration): + def start_profiler(self): self.agent.log('Activating block profiler.') signal.setitimer(signal.ITIMER_REAL, self.SAMPLING_RATE, self.SAMPLING_RATE) - time.sleep(duration) + + + def stop_profiler(self): signal.setitimer(signal.ITIMER_REAL, 0) self.agent.log('Deactivating block profiler.') - self.profile_duration += duration - self.agent.log('Block profiler CPU overhead per activity second: {0} seconds'.format(self.profile._overhead / self.profile_duration)) + def build_profile(self, 
duration): + with self.profile_lock: + self.profile.normalize(duration) + self.profile.propagate() + self.profile.floor() + self.profile.filter(2, 1, float("inf")) + + return [{ + 'category': Metric.CATEGORY_BLOCK_PROFILE, + 'name': Metric.NAME_BLOCKING_CALL_TIMES, + 'unit': Metric.UNIT_MILLISECOND, + 'unit_interval': None, + 'profile': self.profile + }] def process_sample(self, signal_frame, sample_time, main_thread_id): @@ -174,23 +160,3 @@ def recover_stack(self, thread_frame): return None else: return stack - - - def report(self): - if self.agent.config.is_profiling_disabled(): - return - - if self.profile_duration == 0: - return - - with self.profile_lock: - self.profile.normalize(self.profile_duration) - self.profile.propagate() - self.profile.floor() - self.profile.filter(2, 1, float("inf")) - - metric = Metric(self.agent, Metric.TYPE_PROFILE, Metric.CATEGORY_BLOCK_PROFILE, Metric.NAME_BLOCKING_CALL_TIMES, Metric.UNIT_MILLISECOND) - measurement = metric.create_measurement(Metric.TRIGGER_TIMER, self.profile.measurement, 1, self.profile) - self.agent.message_queue.add('metric', metric.to_dict()) - - self.reset() diff --git a/stackimpact/reporters/cpu_reporter.py b/stackimpact/profilers/cpu_profiler.py similarity index 55% rename from stackimpact/reporters/cpu_reporter.py rename to stackimpact/profilers/cpu_profiler.py index 43461cd..f648deb 100644 --- a/stackimpact/reporters/cpu_reporter.py +++ b/stackimpact/profilers/cpu_profiler.py @@ -8,42 +8,38 @@ import signal from ..runtime import min_version, runtime_info -from ..profiler_scheduler import ProfilerScheduler from ..metric import Metric from ..metric import Breakdown from ..frame import Frame -class CPUReporter: +class CPUProfiler: SAMPLING_RATE = 0.01 MAX_TRACEBACK_SIZE = 25 # number of frames def __init__(self, agent): self.agent = agent - self.setup_done = False - self.started = False - self.profiler_scheduler = None + self.ready = False self.profile = None self.profile_lock = threading.Lock() - 
self.profile_duration = 0 self.prev_signal_handler = None - self.handler_active = False + self.sampler_active = False def setup(self): if self.agent.get_option('cpu_profiler_disabled'): return - if runtime_info.OS_WIN: - self.agent.log('CPU profiler is not available on Windows.') + if not runtime_info.OS_LINUX and not runtime_info.OS_DARWIN: + self.agent.log('CPU profiler is only supported on Linux and OS X.') return def _sample(signum, signal_frame): - if self.handler_active: + if self.sampler_active: return - self.handler_active = True + self.sampler_active = True with self.profile_lock: try: @@ -52,71 +48,47 @@ def _sample(signum, signal_frame): except Exception: self.agent.exception() - self.handler_active = False + self.sampler_active = False self.prev_signal_handler = signal.signal(signal.SIGPROF, _sample) - self.setup_done = True - - - def start(self): - if self.agent.get_option('cpu_profiler_disabled'): - return + self.ready = True - if not runtime_info.OS_LINUX and not runtime_info.OS_DARWIN: - self.agent.log('CPU profiler is only supported on Linux and OS X.') - return - - if not self.setup_done: - return - if self.started: - return; - self.started = True + def reset(self): + self.profile = Breakdown('root') - self.reset() - self.profiler_scheduler = ProfilerScheduler(self.agent, 10, 2, 120, self.record, self.report) - self.profiler_scheduler.start() + def start_profiler(self): + self.agent.log('Activating CPU profiler.') + signal.setitimer(signal.ITIMER_PROF, self.SAMPLING_RATE, self.SAMPLING_RATE) - def stop(self): - if not self.started: - return - self.started = False - self.profiler_scheduler.stop() + def stop_profiler(self): + signal.setitimer(signal.ITIMER_PROF, 0) def destroy(self): - if not self.setup_done: + if not self.ready: return - signal.setitimer(signal.ITIMER_PROF, 0) signal.signal(signal.SIGPROF, self.prev_signal_handler) - def reset(self): + def build_profile(self, duration): with self.profile_lock: - self.profile = Breakdown('root') - 
self.profile_duration = 0 - - - def record(self, duration): - if self.agent.config.is_profiling_disabled(): - return - - self.agent.log('Activating CPU profiler.') - - signal.setitimer(signal.ITIMER_PROF, self.SAMPLING_RATE, self.SAMPLING_RATE) - time.sleep(duration) - signal.setitimer(signal.ITIMER_PROF, 0) - - self.agent.log('Deactivating CPU profiler.') - - self.profile_duration += duration + self.profile.propagate() + self.profile.evaluate_percent(duration / self.SAMPLING_RATE) + self.profile.filter(2, 1, 100) - self.agent.log('CPU profiler CPU overhead per activity second: {0} seconds'.format(self.profile._overhead / self.profile_duration)) + return [{ + 'category': Metric.CATEGORY_CPU_PROFILE, + 'name': Metric.NAME_MAIN_THREAD_CPU_USAGE, + 'unit': Metric.UNIT_PERCENT, + 'unit_interval': None, + 'profile': self.profile + }] def process_sample(self, signal_frame): @@ -128,8 +100,6 @@ def process_sample(self, signal_frame): self.update_profile(self.profile, stack) stack = None - - self.profile._overhead += (time.clock() - start) def recover_stack(self, signal_frame): @@ -160,27 +130,8 @@ def recover_stack(self, signal_frame): def update_profile(self, profile, stack): current_node = profile - current_node.increment(0, 1) for frame in reversed(stack): current_node = current_node.find_or_add_child(str(frame)) - current_node.increment(0, 1) - - - def report(self): - if self.agent.config.is_profiling_disabled(): - return - - if self.profile_duration == 0: - return - - with self.profile_lock: - self.profile.evaluate_percent(self.profile_duration / self.SAMPLING_RATE) - - self.profile.filter(2, 1, 100) - - metric = Metric(self.agent, Metric.TYPE_PROFILE, Metric.CATEGORY_CPU_PROFILE, Metric.NAME_MAIN_THREAD_CPU_USAGE, Metric.UNIT_PERCENT) - measurement = metric.create_measurement(Metric.TRIGGER_TIMER, self.profile.measurement, None, self.profile) - self.agent.message_queue.add('metric', metric.to_dict()) - - self.reset() + + current_node.increment(0, 1) diff --git 
a/stackimpact/reporters/allocation_reporter.py b/stackimpact/reporters/allocation_reporter.py deleted file mode 100644 index 98fa3fe..0000000 --- a/stackimpact/reporters/allocation_reporter.py +++ /dev/null @@ -1,130 +0,0 @@ -from __future__ import division - -import sys -import time -import re - -from ..runtime import min_version, runtime_info, read_vm_size -from ..profiler_scheduler import ProfilerScheduler -from ..metric import Metric -from ..metric import Breakdown - -if min_version(3, 4): - import tracemalloc - - -class AllocationReporter: - MAX_TRACEBACK_SIZE = 25 # number of frames - MAX_MEMORY_OVERHEAD = 10 * 1e6 # 10MB - MAX_PROFILED_ALLOCATIONS = 25 - - - def __init__(self, agent): - self.agent = agent - self.started = False - self.profiler_scheduler = None - self.profile = None - self.profile_duration = 0 - - - def start(self): - if self.agent.get_option('allocation_profiler_disabled'): - return - - if runtime_info.OS_WIN: - self.agent.log('Memory allocation profiler is not available on Windows.') - return - - if not min_version(3, 4): - self.agent.log('Memory allocation profiling is available for Python 3.4 or higher') - return - - if self.started: - return - self.started = True - - self.reset() - - self.profiler_scheduler = ProfilerScheduler(self.agent, 30, 4, 120, self.record, self.report) - self.profiler_scheduler.start() - - - def stop(self): - if not self.started: - return - self.started = False - - self.profiler_scheduler.stop() - - - def reset(self): - self.profile = Breakdown('root') - self.profile_duration = 0 - - - def record(self, max_duration): - self.agent.log('Activating memory allocation profiler.') - - def start(): - tracemalloc.start(self.MAX_TRACEBACK_SIZE) - self.agent.run_in_main_thread(start) - - duration = 0 - step = 1 - while duration < max_duration: - time.sleep(step) - duration += step - - if tracemalloc.get_tracemalloc_memory() > self.MAX_MEMORY_OVERHEAD: - break - - self.agent.log('Deactivating memory allocation profiler.') - 
- if tracemalloc.is_tracing(): - snapshot = tracemalloc.take_snapshot() - self.agent.log('Allocation profiler memory overhead {0} bytes'.format(tracemalloc.get_tracemalloc_memory())) - tracemalloc.stop() - self.process_snapshot(snapshot, duration) - - self.profile_duration += duration - - - - def process_snapshot(self, snapshot, duration): - stats = snapshot.statistics('traceback') - - for stat in stats[:self.MAX_PROFILED_ALLOCATIONS]: - if stat.traceback: - skip_stack = False - for frame in stat.traceback: - if self.agent.frame_cache.is_agent_frame(frame.filename): - skip_stack = True - break - if skip_stack: - continue - - current_node = self.profile - for frame in reversed(stat.traceback): - if frame.filename == '': - continue - - frame_name = '{0}:{1}'.format(frame.filename, frame.lineno) - current_node = current_node.find_or_add_child(frame_name) - current_node.increment(stat.size, stat.count) - - - def report(self): - if self.profile_duration == 0: - return - - self.profile.normalize(self.profile_duration) - self.profile.propagate() - self.profile.floor() - self.profile.filter(2, 1000, float("inf")) - - metric = Metric(self.agent, Metric.TYPE_PROFILE, Metric.CATEGORY_MEMORY_PROFILE, Metric.NAME_UNCOLLECTED_ALLOCATIONS, Metric.UNIT_BYTE) - measurement = metric.create_measurement(Metric.TRIGGER_TIMER, self.profile.measurement, 1, self.profile) - self.agent.message_queue.add('metric', metric.to_dict()) - - self.reset() - diff --git a/stackimpact/reporters/error_reporter.py b/stackimpact/reporters/error_reporter.py index c459373..ae6ecb7 100644 --- a/stackimpact/reporters/error_reporter.py +++ b/stackimpact/reporters/error_reporter.py @@ -16,6 +16,7 @@ class ErrorReporter: def __init__(self, agent): self.agent = agent + self.setup_done = False self.started = False self.process_timer = None self.report_timer = None @@ -25,19 +26,10 @@ def __init__(self, agent): self.added_exceptions = None - def start(self): + def setup(self): if 
self.agent.get_option('error_profiler_disabled'): return - if self.started: - return - self.started = True - - self.reset_profile() - - self.process_timer = self.agent.schedule(1, 1, self.process) - self.report_timer = self.agent.schedule(60, 60, self.report) - def _exc_info(ret): try: if not self.agent.agent_started or self.agent.agent_destroyed: @@ -51,14 +43,45 @@ def _exc_info(ret): patch(sys, 'exc_info', None, _exc_info) + self.setup_done = True + + + def destroy(self): + if not self.setup_done: + return + + unpatch(sys, 'exc_info') + + + def reset(self): + with self.profile_lock: + self.profile = Breakdown('root') + self.added_exceptions = {} + + + def start(self): + if self.agent.get_option('error_profiler_disabled'): + return + + if not self.agent.get_option('auto_profiling'): + return + + if self.started: + return + self.started = True + + self.reset() + + + self.process_timer = self.agent.schedule(1, 1, self.process) + self.report_timer = self.agent.schedule(60, 60, self.report) + def stop(self): if not self.started: return self.started = False - unpatch(sys, 'exc_info') - self.process_timer.cancel() self.process_timer = None @@ -66,19 +89,13 @@ def stop(self): self.report_timer = None - def reset_profile(self): - with self.profile_lock: - self.profile = Breakdown('root') - self.added_exceptions = {} - - def report(self): with self.profile_lock: metric = Metric(self.agent, Metric.TYPE_PROFILE, Metric.CATEGORY_ERROR_PROFILE, Metric.NAME_HANDLED_EXCEPTIONS, Metric.UNIT_NONE) measurement = metric.create_measurement(Metric.TRIGGER_TIMER, self.profile.measurement, 60, self.profile) self.agent.message_queue.add('metric', metric.to_dict()) - self.reset_profile() + self.reset() def process(self): diff --git a/stackimpact/reporters/process_reporter.py b/stackimpact/reporters/process_reporter.py index 3bb4af8..dc3c195 100644 --- a/stackimpact/reporters/process_reporter.py +++ b/stackimpact/reporters/process_reporter.py @@ -19,7 +19,22 @@ def __init__(self, agent): 
self.report_timer = None + def setup(self): + pass + + + def destroy(self): + pass + + + def reset(self): + pass + + def start(self): + if not self.agent.get_option('auto_profiling'): + return + if self.started: return self.started = True diff --git a/stackimpact/reporters/profile_reporter.py b/stackimpact/reporters/profile_reporter.py new file mode 100644 index 0000000..11579bf --- /dev/null +++ b/stackimpact/reporters/profile_reporter.py @@ -0,0 +1,187 @@ +from __future__ import division + +import os +import sys +import time +import threading +import re +import random + +from ..runtime import min_version, runtime_info +from ..utils import timestamp +from ..metric import Metric +from ..metric import Breakdown +from ..frame import Frame + + +class ProfilerConfig: + + def __init__(self): + self.log_prefix = None + self.max_profile_duration = None + self.max_span_duration = None + self.max_span_count = None + self.span_interval = None + self.report_interval = None + self.report_only = False + + +class ProfileReporter: + + def __init__(self, agent, profiler, config): + self.agent = agent + self.profiler = profiler + self.config = config + self.started = False + self.span_timer = None + self.span_timeout = None + self.random_timer = None + self.report_timer = None + self.profile_start_ts = None + self.profile_duration = None + self.span_count = None + self.span_active = False + self.span_start_ts = None + self.span_trigger = None + + + def setup(self): + self.profiler.setup() + + + def start(self): + if not self.profiler.ready: + return + + if self.started: + return + self.started = True + + self.reset() + + if self.agent.get_option('auto_profiling'): + if not self.config.report_only: + def random_delay(): + timeout = random.randint(0, round(self.config.span_interval - self.config.max_span_duration)) + self.random_timer = self.agent.delay(timeout, self.start_profiling, False, True) + + self.span_timer = self.agent.schedule(0, self.config.span_interval, random_delay) + 
+ self.report_timer = self.agent.schedule(self.config.report_interval, self.config.report_interval, self.report) + + + def stop(self): + if not self.started: + return + + self.started = False + + if self.span_timer: + self.span_timer.cancel() + self.span_timer = None + + if self.random_timer: + self.random_timer.cancel() + self.random_timer = None + + if self.report_timer: + self.report_timer.cancel() + self.report_timer = None + + self.stop_profiling() + + + def destroy(self): + self.profiler.destroy() + + + def reset(self): + self.profiler.reset() + self.profile_start_ts = timestamp() + self.profile_duration = 0 + self.span_count = 0 + self.span_trigger = Metric.TRIGGER_TIMER + + + def start_profiling(self, api_call, with_timeout): + if not self.started: + return False + + if self.profile_duration > self.config.max_profile_duration: + self.agent.log(self.config.log_prefix + ': max profiling duration reached.') + return False + + if api_call and self.span_count > self.config.max_span_count: + self.agent.log(self.config.log_prefix + ': max recording count reached.') + return False + + if self.agent.profiler_active: + self.agent.log(self.config.log_prefix + ': profiler lock exists.') + return False + + self.agent.profiler_active = True + self.agent.log(self.config.log_prefix + ': started.') + + try: + self.profiler.start_profiler() + except Exception: + self.agent.profiler_active = False + self.exception() + return False + + if with_timeout: + self.span_timeout = self.agent.delay(self.config.max_span_duration, self.stop_profiling) + + self.span_count = self.span_count + 1 + self.span_active = True + self.span_start_ts = time.time() + + if api_call: + self.span_trigger = Metric.TRIGGER_API + + return True + + + def stop_profiling(self): + if not self.span_active: + return + self.span_active = False + + try: + self.profile_duration = self.profile_duration + time.time() - self.span_start_ts + self.profiler.stop_profiler() + except Exception: + self.exception() + + 
self.agent.profiler_active = False + + if self.span_timeout: + self.span_timeout.cancel() + + self.agent.log(self.config.log_prefix + ': stopped.') + + + def report(self, with_interval = False): + if not self.started: + return + + if with_interval: + if self.profile_start_ts > timestamp() - self.config.report_interval: + return + elif self.profile_start_ts < timestamp() - 2 * self.config.report_interval: + self.reset() + return + + if not self.config.report_only and self.profile_duration == 0: + return + + self.agent.log(self.config.log_prefix + ': reporting profile.') + + profile_data = self.profiler.build_profile(self.profile_duration) + + for data in profile_data: + metric = Metric(self.agent, Metric.TYPE_PROFILE, data['category'], data['name'], data['unit']) + metric.create_measurement(self.span_trigger, data['profile'].measurement, data['unit_interval'], data['profile']) + self.agent.message_queue.add('metric', metric.to_dict()) + + self.reset() diff --git a/stackimpact/reporters/span_reporter.py b/stackimpact/reporters/span_reporter.py new file mode 100644 index 0000000..15c213c --- /dev/null +++ b/stackimpact/reporters/span_reporter.py @@ -0,0 +1,82 @@ + +import sys +import threading +import traceback +import collections + +from ..runtime import runtime_info, patch, unpatch +from ..metric import Metric +from ..metric import Breakdown +from ..frame import Frame + + +class SpanReporter: + MAX_QUEUED_EXC = 100 + + + def __init__(self, agent): + self.agent = agent + self.started = False + self.report_timer = None + self.span_counters = None + self.span_lock = threading.Lock() + + + def setup(self): + pass + + + def destroy(self): + pass + + + def reset(self): + self.span_counters = dict() + + + def start(self): + if not self.agent.get_option('auto_profiling'): + return + + if self.started: + return + self.started = True + + self.reset() + + self.report_timer = self.agent.schedule(60, 60, self.report) + + + def stop(self): + if not self.started: + return + 
self.started = False + + self.report_timer.cancel() + self.report_timer = None + + + def record_span(self, name, duration): + if not self.started: + return + + counter = None + if name in self.span_counters: + counter = self.span_counters[name] + else: + with self.span_lock: + counter = Breakdown(name) + self.span_counters[name] = counter + + counter.update_p95(duration * 1000) + + + def report(self): + for name, counter in self.span_counters.items(): + counter.evaluate_p95(); + + metric = Metric(self.agent, Metric.TYPE_STATE, Metric.CATEGORY_SPAN, counter.name, Metric.UNIT_MILLISECOND) + measurement = metric.create_measurement(Metric.TRIGGER_TIMER, counter.measurement, 60) + self.agent.message_queue.add('metric', metric.to_dict()) + + self.reset() diff --git a/tests/agent_test.py b/tests/agent_test.py index 8cdd69f..490469e 100644 --- a/tests/agent_test.py +++ b/tests/agent_test.py @@ -1,12 +1,14 @@ import unittest import sys import threading +import random +import time import stackimpact -from stackimpact.runtime import runtime_info +from stackimpact.runtime import runtime_info, min_version -# python3 -m unittest discover -s tests -p *_test.py +# python3 -m unittest discover -v -s tests -p *_test.py class AgentTestCase(unittest.TestCase): @@ -39,5 +41,151 @@ def _thread(): agent.destroy() + def test_profile(self): + if runtime_info.OS_WIN: + return + + stackimpact._agent = None + agent = stackimpact.start( + dashboard_address = 'http://localhost:5001', + agent_key = 'key1', + app_name = 'TestPythonApp', + auto_profiling = False, + debug = True + ) + + agent.cpu_reporter.start() + + span = agent.profile() + for i in range(0, 2000000): + random.randint(1, 1000000) + span.stop() + + agent.cpu_reporter.report() + + self.assertTrue('test_profile' in str(agent.message_queue.queue)) + + agent.destroy() + + + def test_with_profile(self): + if runtime_info.OS_WIN: + return + + stackimpact._agent = None + agent = stackimpact.start( + dashboard_address = 
'http://localhost:5001', + agent_key = 'key1', + app_name = 'TestPythonApp', + auto_profiling = False, + debug = True + ) + + agent.cpu_reporter.start() + + with agent.profile(): + for i in range(0, 2000000): + random.randint(1, 1000000) + + agent.cpu_reporter.report() + + self.assertTrue('test_with_profile' in str(agent.message_queue.queue)) + + agent.destroy() + + + def test_cpu_profile(self): + stackimpact._agent = None + agent = stackimpact.start( + dashboard_address = 'http://localhost:5001', + agent_key = 'key1', + app_name = 'TestPythonApp', + auto_profiling = False, + debug = True + ) + + messages = [] + def add_mock(topic, message): + messages.append(message) + agent.message_queue.add = add_mock + + agent.start_cpu_profiler() + + for j in range(0, 2000000): + random.randint(1, 1000000) + + agent.stop_cpu_profiler() + + self.assertTrue('test_cpu_profile' in str(messages)) + + agent.destroy() + + + def test_allocation_profile(self): + if runtime_info.OS_WIN or not min_version(3, 4): + return + + stackimpact._agent = None + agent = stackimpact.start( + dashboard_address = 'http://localhost:5001', + agent_key = 'key1', + app_name = 'TestPythonApp', + auto_profiling = False, + debug = True + ) + + messages = [] + def add_mock(topic, message): + messages.append(message) + agent.message_queue.add = add_mock + + agent.start_allocation_profiler() + + mem1 = [] + for i in range(0, 1000): + obj1 = {'v': random.randint(0, 1000000)} + mem1.append(obj1) + + agent.stop_allocation_profiler() + + self.assertTrue('agent_test.py' in str(messages)) + + agent.destroy() + + + def test_block_profile(self): + if runtime_info.OS_WIN or not min_version(3, 4): + return + + stackimpact._agent = None + agent = stackimpact.start( + dashboard_address = 'http://localhost:5001', + agent_key = 'key1', + app_name = 'TestPythonApp', + auto_profiling = False, + debug = True + ) + + messages = [] + def add_mock(topic, message): + messages.append(message) + agent.message_queue.add = add_mock + 
+ agent.start_block_profiler() + + def blocking_call(): + time.sleep(0.1) + + for i in range(5): + blocking_call() + + agent.stop_block_profiler() + + self.assertTrue('blocking_call' in str(messages)) + + agent.destroy() + + + if __name__ == '__main__': unittest.main() diff --git a/tests/message_queue_test.py b/tests/message_queue_test.py index 908dc53..6732755 100644 --- a/tests/message_queue_test.py +++ b/tests/message_queue_test.py @@ -11,42 +11,14 @@ class MessageQueueTest(unittest.TestCase): - def test_expire(self): - stackimpact._agent = None - agent = stackimpact.start( - dashboard_address = 'http://localhost:5003', - agent_key = 'key1', - app_name = 'TestPythonApp', - debug = True - ) - - m = { - 'm1': 1 - } - agent.message_queue.add('t1', m) - - m = { - 'm2': 2 - } - agent.message_queue.add('t1', m) - - agent.message_queue.queue[0]['added_at'] = timestamp() - 20 * 60 - - agent.message_queue.expire() - - self.assertEqual(len(agent.message_queue.queue), 1) - self.assertEqual(agent.message_queue.queue[0]['content']['m2'], 2) - - agent.destroy() - def test_flush(self): - server = TestServer(5004) + server = TestServer(5005) server.start() stackimpact._agent = None agent = stackimpact.start( - dashboard_address = 'http://localhost:5004', + dashboard_address = 'http://localhost:5005', agent_key = 'key1', app_name = 'TestPythonApp', debug = True @@ -62,24 +34,25 @@ def test_flush(self): } agent.message_queue.add('t1', m) + agent.message_queue.queue[0]['added_at'] = timestamp() - 20 * 60 + agent.message_queue.flush() data = json.loads(server.get_request_data()) - self.assertEqual(data['payload']['messages'][0]['content']['m1'], 1) - self.assertEqual(data['payload']['messages'][1]['content']['m2'], 2) + self.assertEqual(data['payload']['messages'][0]['content']['m2'], 2) agent.destroy() server.join() def test_flush_fail(self): - server = TestServer(5005) + server = TestServer(5006) server.set_response_data("unparsablejson") server.start() stackimpact._agent = None 
agent = stackimpact.start( - dashboard_address = 'http://localhost:5005', + dashboard_address = 'http://localhost:5006', agent_key = 'key1', app_name = 'TestPythonApp', debug = True diff --git a/tests/profiler_scheduler_test.py b/tests/profiler_scheduler_test.py deleted file mode 100644 index 5997331..0000000 --- a/tests/profiler_scheduler_test.py +++ /dev/null @@ -1,47 +0,0 @@ -import unittest -import sys -import random -import time - -import stackimpact -from stackimpact.profiler_scheduler import ProfilerScheduler - - -class ProfilerSchedulerTestCase(unittest.TestCase): - - - def test_start_profiler(self): - stackimpact._agent = None - agent = stackimpact.start( - dashboard_address = 'http://localhost:5001', - agent_key = 'key1', - app_name = 'TestPythonApp', - debug = True - ) - - stats = { - "records": 0, - "reports": 0, - } - - def record_func(duration): - stats["records"] += 1 - - def report_func(): - stats["reports"] += 1 - - ps = ProfilerScheduler(agent, 0.010, 0.002, 0.050, record_func, report_func) - ps.start() - - time.sleep(0.150) - - self.assertFalse(stats["records"] < 10) - self.assertFalse(stats["reports"] < 2) - - ps.stop() - agent.destroy() - - - -if __name__ == '__main__': - unittest.main() diff --git a/tests/profilers/__init__.py b/tests/profilers/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/reporters/allocation_reporter_test.py b/tests/profilers/allocation_profiler_test.py similarity index 70% rename from tests/reporters/allocation_reporter_test.py rename to tests/profilers/allocation_profiler_test.py index 7b722d1..6f20754 100644 --- a/tests/reporters/allocation_reporter_test.py +++ b/tests/profilers/allocation_profiler_test.py @@ -8,7 +8,7 @@ from stackimpact.runtime import min_version, runtime_info -class AllocationReporterTestCase(unittest.TestCase): +class AllocationProfilerTestCase(unittest.TestCase): def test_record_allocation_profile(self): if runtime_info.OS_WIN or not min_version(3, 4): @@ -19,9 +19,11 @@ 
def test_record_allocation_profile(self): dashboard_address = 'http://localhost:5001', agent_key = 'key1', app_name = 'TestPythonApp', + auto_profiling = False, debug = True ) - agent.allocation_reporter.start() + + agent.allocation_reporter.profiler.reset() mem1 = [] def mem_leak(n = 100000): @@ -44,7 +46,9 @@ def mem_leak5(): result = {} def record(): - agent.allocation_reporter.record(2) + agent.allocation_reporter.profiler.start_profiler() + time.sleep(2) + agent.allocation_reporter.profiler.stop_profiler() t = threading.Thread(target=record) t.start() @@ -54,9 +58,10 @@ def record(): t.join() - #print(agent.allocation_reporter.profile) + profile = agent.allocation_reporter.profiler.build_profile(2)[0]['profile'].to_dict() + #print(str(profile)) - self.assertTrue('allocation_reporter_test.py' in str(agent.allocation_reporter.profile)) + self.assertTrue('allocation_profiler_test.py' in str(profile)) agent.destroy() diff --git a/tests/reporters/block_reporter_test.py b/tests/profilers/block_profiler_test.py similarity index 79% rename from tests/reporters/block_reporter_test.py rename to tests/profilers/block_profiler_test.py index 2db295f..e1cd8de 100644 --- a/tests/reporters/block_reporter_test.py +++ b/tests/profilers/block_profiler_test.py @@ -19,7 +19,7 @@ from test_server import TestServer -class BlockReporterTestCase(unittest.TestCase): +class BlockProfilerTestCase(unittest.TestCase): def test_record_block_profile(self): if runtime_info.OS_WIN: return @@ -29,9 +29,11 @@ def test_record_block_profile(self): dashboard_address = 'http://localhost:5001', agent_key = 'key1', app_name = 'TestPythonApp', + auto_profiling = False, debug = True ) - agent.block_reporter.start() + + agent.block_reporter.profiler.reset() lock = threading.Lock() event = threading.Event() @@ -67,7 +69,9 @@ def url_wait(): result = {} def record(): - agent.block_reporter.record(2) + agent.block_reporter.profiler.start_profiler() + time.sleep(2) + 
agent.block_reporter.profiler.stop_profiler() record_t = threading.Thread(target=record) record_t.start() @@ -97,11 +101,12 @@ def record(): record_t.join() - #print(agent.block_reporter.profile) + profile = agent.block_reporter.profiler.build_profile(2)[0]['profile'].to_dict() + #print(profile) - self.assertTrue('lock_wait' in str(agent.block_reporter.profile)) - self.assertTrue('event_wait' in str(agent.block_reporter.profile)) - self.assertTrue('url_wait' in str(agent.block_reporter.profile)) + self.assertTrue('lock_wait' in str(profile)) + self.assertTrue('event_wait' in str(profile)) + self.assertTrue('url_wait' in str(profile)) agent.destroy() diff --git a/tests/reporters/cpu_reporter_test.py b/tests/profilers/cpu_profiler_test.py similarity index 66% rename from tests/reporters/cpu_reporter_test.py rename to tests/profilers/cpu_profiler_test.py index bbf15ea..1ea25b6 100644 --- a/tests/reporters/cpu_reporter_test.py +++ b/tests/profilers/cpu_profiler_test.py @@ -10,7 +10,7 @@ from stackimpact.runtime import runtime_info -class CPUReporterTestCase(unittest.TestCase): +class CPUProfilerTestCase(unittest.TestCase): def test_record_profile(self): if runtime_info.OS_WIN: @@ -21,12 +21,17 @@ def test_record_profile(self): dashboard_address = 'http://localhost:5001', agent_key = 'key1', app_name = 'TestPythonApp', + auto_profiling = False, debug = True ) - agent.cpu_reporter.start() + + agent.cpu_reporter.profiler.reset() def record(): - agent.cpu_reporter.record(2) + agent.cpu_reporter.profiler.start_profiler() + time.sleep(2) + agent.cpu_reporter.profiler.stop_profiler() + record_t = threading.Thread(target=record) record_t.start() @@ -39,9 +44,10 @@ def cpu_work_main_thread(): record_t.join() - #print(agent.cpu_reporter.profile) + profile = agent.cpu_reporter.profiler.build_profile(2)[0]['profile'].to_dict() + #print(profile) - self.assertTrue('cpu_work_main_thread' in str(agent.cpu_reporter.profile)) + self.assertTrue('cpu_work_main_thread' in str(profile)) 
agent.destroy() diff --git a/tests/reporters/profile_reporter_test.py b/tests/reporters/profile_reporter_test.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/reporters/span_reporter_test.py b/tests/reporters/span_reporter_test.py new file mode 100644 index 0000000..30c7afa --- /dev/null +++ b/tests/reporters/span_reporter_test.py @@ -0,0 +1,41 @@ + +import time +import unittest +import random +import threading +import sys +import traceback + +import stackimpact +from stackimpact.runtime import min_version + + +class SpanReporterTestCase(unittest.TestCase): + + def test_record_span(self): + stackimpact._agent = None + agent = stackimpact.start( + dashboard_address = 'http://localhost:5001', + agent_key = 'key1', + app_name = 'TestPythonApp', + debug = True + ) + agent.span_reporter.start() + + for i in range(10): + agent.span_reporter.record_span("span1", 10); + + span_counters = agent.span_reporter.span_counters; + agent.span_reporter.report(); + + counter = span_counters['span1'] + #print(counter) + + self.assertEqual(counter.name, 'span1') + self.assertEqual(counter.measurement, 10000) + + agent.destroy() + + +if __name__ == '__main__': + unittest.main() From 49b966a07fadf3b3ee854b81b16cb59f87a2adbb Mon Sep 17 00:00:00 2001 From: Dmitri Melikyan Date: Fri, 20 Apr 2018 10:42:13 +0000 Subject: [PATCH 21/26] rename programmatic to focused --- README.md | 4 ++-- README.rst | 8 ++++---- examples/{programmatic => focused}/app.py | 0 3 files changed, 6 insertions(+), 6 deletions(-) rename examples/{programmatic => focused}/app.py (100%) diff --git a/README.md b/README.md index a9f521b..ae1ab77 100644 --- a/README.md +++ b/README.md @@ -79,14 +79,14 @@ All initialization options: * `app_version` (Optional) Sets application version, which can be used to associate profiling information with the source code release. * `app_environment` (Optional) Used to differentiate applications in different environments. 
* `host_name` (Optional) By default, host name will be the OS hostname. -* `auto_profiling` (Optional) If set to `False`, disables automatic profiling and reporting. Programmatic or manual profiling should be used instead. Useful for environments without support for timers or background tasks. +* `auto_profiling` (Optional) If set to `False`, disables automatic profiling and reporting. Focused or manual profiling should be used instead. Useful for environments without support for timers or background tasks. * `debug` (Optional) Enables debug logging. * `cpu_profiler_disabled`, `allocation_profiler_disabled`, `block_profiler_disabled`, `error_profiler_disabled` (Optional) Disables respective profiler when `True`. * `include_agent_frames` (Optional) Set to `True` to not exclude agent stack frames from profile call graphs. * `auto_destroy` (Optional) Set to `False` to disable agent's exit handlers. If necessary, call `destroy()` to gracefully shutdown the agent. -#### Programmatic profiling +#### Focused profiling Use `agent.profile(name)` to instruct the agent when to start and stop profiling. The agent decides if and which profiler is activated. Normally, this method should be used in repeating code, such as request or event handlers. In addition to more precise profiling, timing information will also be reported for the profiled spans. Usage example: diff --git a/README.rst b/README.rst index f94df70..9bbf619 100644 --- a/README.rst +++ b/README.rst @@ -114,8 +114,8 @@ All initialization options: - ``host_name`` (Optional) By default, host name will be the OS hostname. - ``auto_profiling`` (Optional) If set to ``False``, disables automatic - profiling and reporting. Programmatic or manual profiling should be - used instead. Useful for environments without support for timers or + profiling and reporting. Focused or manual profiling should be used + instead. Useful for environments without support for timers or background tasks. 
- ``debug`` (Optional) Enables debug logging. - ``cpu_profiler_disabled``, ``allocation_profiler_disabled``, @@ -127,8 +127,8 @@ All initialization options: handlers. If necessary, call ``destroy()`` to gracefully shutdown the agent. -Programmatic profiling -^^^^^^^^^^^^^^^^^^^^^^ +Focused profiling +^^^^^^^^^^^^^^^^^ Use ``agent.profile(name)`` to instruct the agent when to start and stop profiling. The agent decides if and which profiler is activated. diff --git a/examples/programmatic/app.py b/examples/focused/app.py similarity index 100% rename from examples/programmatic/app.py rename to examples/focused/app.py From 7dd59b675762551491c665be4e7467533e6911c0 Mon Sep 17 00:00:00 2001 From: Dmitri Melikyan Date: Wed, 9 May 2018 13:44:07 +0000 Subject: [PATCH 22/26] add tensorflow profiling --- .gitignore | 1 + README.md | 12 +- README.rst | 13 +- examples/tensorflow/app.py | 32 +++ examples/tensorflow/manual.py | 23 ++ examples/tensorflow/manual_keras.py | 35 +++ publish.sh | 16 ++ setup.py | 2 +- stackimpact/agent.py | 49 +++- stackimpact/api_request.py | 11 +- stackimpact/config.py | 2 +- stackimpact/config_loader.py | 6 +- stackimpact/frame.py | 10 +- stackimpact/frame_cache.py | 2 +- stackimpact/message_queue.py | 8 +- stackimpact/metric.py | 89 ++++--- stackimpact/profilers/allocation_profiler.py | 5 +- stackimpact/profilers/block_profiler.py | 9 +- stackimpact/profilers/cpu_profiler.py | 5 +- stackimpact/profilers/tf_profiler.py | 246 +++++++++++++++++++ stackimpact/reporters/error_reporter.py | 19 +- stackimpact/reporters/process_reporter.py | 2 +- stackimpact/reporters/profile_reporter.py | 4 +- stackimpact/reporters/span_reporter.py | 3 +- stackimpact/runtime.py | 38 +-- stackimpact/utils.py | 6 +- tests/agent_test.py | 38 +++ tests/metric_test.py | 8 +- tests/profilers/tf_profiler_test.py | 67 +++++ 29 files changed, 654 insertions(+), 107 deletions(-) create mode 100644 examples/tensorflow/app.py create mode 100644 examples/tensorflow/manual.py create 
mode 100644 examples/tensorflow/manual_keras.py create mode 100755 publish.sh create mode 100644 stackimpact/profilers/tf_profiler.py create mode 100644 tests/profilers/tf_profiler_test.py diff --git a/.gitignore b/.gitignore index a928f06..4b2835f 100644 --- a/.gitignore +++ b/.gitignore @@ -13,3 +13,4 @@ tmp dist stackimpact.egg-info __pycache__ +venv* diff --git a/README.md b/README.md index ae1ab77..a031345 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,8 @@ StackImpact is a production-grade performance profiler built for both production #### Features -* Continuous hot spot profiling for CPU usage, memory allocation, blocking calls. +* Continuous hot spot profiling of CPU usage, memory allocation and blocking calls. +* TensorFlow profiling. * Error and exception monitoring. * Health monitoring including CPU, memory, garbage collection and other runtime metrics. * Alerts on profile anomalies. @@ -142,6 +143,15 @@ agent.start_allocation_profiler(); agent.stop_allocation_profiler(); ``` +```python +# Start TensorFlow profiler. +agent.start_tf_profiler(); +``` + +```python +# Stop TensorFlow profiler and report the recorded profile to the Dashboard. +agent.stop_tf_profiler(); +``` #### Analyzing performance data in the Dashboard diff --git a/README.rst b/README.rst index 9bbf619..afdb03f 100644 --- a/README.rst +++ b/README.rst @@ -20,8 +20,9 @@ complement profiles for extensive performance analysis. Learn more at Features ^^^^^^^^ -- Continuous hot spot profiling for CPU usage, memory allocation, +- Continuous hot spot profiling of CPU usage, memory allocation and blocking calls. +- TensorFlow profiling. - Error and exception monitoring. - Health monitoring including CPU, memory, garbage collection and other runtime metrics. @@ -192,6 +193,16 @@ disabled with ``auto_profiling: False``. # Stop heap allocation profiler and report the recorded profile to the Dashboard. agent.stop_allocation_profiler(); +.. code:: python + + # Start TensorFlow profiler. 
+ agent.start_tf_profiler(); + +.. code:: python + + # Stop TensorFlow profiler and report the recorded profile to the Dashboard. + agent.stop_tf_profiler(); + Analyzing performance data in the Dashboard ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/examples/tensorflow/app.py b/examples/tensorflow/app.py new file mode 100644 index 0000000..9b301bd --- /dev/null +++ b/examples/tensorflow/app.py @@ -0,0 +1,32 @@ +from __future__ import print_function +import random +import time +import sys +import threading +import tensorflow as tf +sys.path.append(".") +import stackimpact + + +agent = stackimpact.start( + agent_key = 'agent key here', + app_name = 'MyTensorFlowScript') + + +def handle_some_event(): + with agent.profile(): + tf.reset_default_graph() + x = tf.random_normal([1000, 1000]) + y = tf.random_normal([1000, 1000]) + res = tf.matmul(x, y) + + with tf.Session() as sess: + sess.run(res) + + +# Simulate events +while True: + handle_some_event() + time.sleep(2) + + diff --git a/examples/tensorflow/manual.py b/examples/tensorflow/manual.py new file mode 100644 index 0000000..1b25517 --- /dev/null +++ b/examples/tensorflow/manual.py @@ -0,0 +1,23 @@ +from __future__ import print_function +import sys +import tensorflow as tf +sys.path.append(".") +import stackimpact + + +agent = stackimpact.start( + agent_key = 'agent key here', + app_name = 'MyTensorFlowScript', + auto_profiling = False) + +agent.start_tf_profiler() + +x = tf.random_normal([1000, 1000]) +y = tf.random_normal([1000, 1000]) +res = tf.matmul(x, y) + +with tf.Session() as sess: + sess.run(res) + + +agent.stop_tf_profiler() diff --git a/examples/tensorflow/manual_keras.py b/examples/tensorflow/manual_keras.py new file mode 100644 index 0000000..dbf5e7c --- /dev/null +++ b/examples/tensorflow/manual_keras.py @@ -0,0 +1,35 @@ +from keras.models import Sequential +from keras.layers.core import Dense, Dropout, Activation +from keras.optimizers import SGD +import numpy as np +import sys + 
+sys.path.append(".") +import stackimpact + + +agent = stackimpact.start( + agent_key = 'agent key here', + app_name = 'MyKerasScript', + auto_profiling = False) + + +agent.start_tf_profiler() + +X = np.array([[0,0],[0,1],[1,0],[1,1]]) +y = np.array([[0],[1],[1],[0]]) + +model = Sequential() +model.add(Dense(8, input_dim=2)) +model.add(Activation('tanh')) +model.add(Dense(1)) +model.add(Activation('sigmoid')) + +sgd = SGD(lr=0.1) +model.compile(loss='binary_crossentropy', optimizer=sgd) + +model.fit(X, y, batch_size=1, nb_epoch=1000) +print(model.predict_proba(X)) + +agent.stop_tf_profiler() + diff --git a/publish.sh b/publish.sh new file mode 100755 index 0000000..0083c0f --- /dev/null +++ b/publish.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +set -e + +pandoc --from=markdown --to=rst --output=README.rst 'README.md' + +python3 -m unittest discover -v -s tests -p *_test.py + +rm -f dist/*.tar.gz +python setup.py sdist + +for bundle in dist/*.tar.gz; do + echo "Publishing $bundle..." + twine upload $bundle +done + diff --git a/setup.py b/setup.py index f58712d..b08b788 100644 --- a/setup.py +++ b/setup.py @@ -6,7 +6,7 @@ def read(fname): setup( name = 'stackimpact', - version = '1.2.2', + version = '1.2.3', description = 'StackImpact Python Profiler', long_description = read('README.rst'), author = 'StackImpact', diff --git a/stackimpact/agent.py b/stackimpact/agent.py index e71211f..3133feb 100644 --- a/stackimpact/agent.py +++ b/stackimpact/agent.py @@ -26,9 +26,10 @@ from .profilers.cpu_profiler import CPUProfiler from .profilers.allocation_profiler import AllocationProfiler from .profilers.block_profiler import BlockProfiler +from .profilers.tf_profiler import TFProfiler -class Span: +class Span(object): def __init__(self, stop_func = None): if stop_func: @@ -48,9 +49,9 @@ def __exit__(self, exc_type, exc_value, traceback): self.stop() -class Agent: +class Agent(object): - AGENT_VERSION = "1.2.2" + AGENT_VERSION = "1.2.3" SAAS_DASHBOARD_ADDRESS = 
"https://agent-api.stackimpact.com" def __init__(self, **kwargs): @@ -99,11 +100,19 @@ def __init__(self, **kwargs): config.report_interval = 120 self.block_reporter = ProfileReporter(self, BlockProfiler(self), config) + config = ProfilerConfig() + config.log_prefix = 'TensorFlow profiler' + config.max_profile_duration = 20 + config.max_span_duration = 5 + config.max_span_count = 30 + config.span_interval = 20 + config.report_interval = 120 + self.tf_reporter = ProfileReporter(self, TFProfiler(self), config) self.options = None - def get_option(self, name, default_val = None): + def get_option(self, name, default_val=None): if name not in self.options: return default_val else: @@ -151,6 +160,7 @@ def start(self, **kwargs): self.cpu_reporter.setup() self.allocation_reporter.setup() self.block_reporter.setup() + self.tf_reporter.setup() self.span_reporter.setup() self.error_reporter.setup() self.process_reporter.setup() @@ -201,6 +211,7 @@ def enable(self): self.cpu_reporter.start() self.allocation_reporter.start() self.block_reporter.start() + self.tf_reporter.start() self.span_reporter.start() self.error_reporter.start() self.process_reporter.start() @@ -212,13 +223,14 @@ def disable(self): self.cpu_reporter.stop() self.allocation_reporter.stop() self.block_reporter.stop() + self.tf_reporter.stop() self.span_reporter.stop() self.error_reporter.stop() self.process_reporter.stop() self.config.set_agent_enabled(False) - def profile(self, name = 'Default'): + def profile(self, name='Default'): if not self.agent_started or self.span_active: return Span(None) @@ -232,19 +244,21 @@ def profile(self, name = 'Default'): active_reporters.append(self.allocation_reporter) if self.block_reporter.started: active_reporters.append(self.block_reporter) + if self.tf_reporter.started: + active_reporters.append(self.tf_reporter) if len(active_reporters) > 0: selected_reporter = active_reporters[int(math.floor(random.random() * len(active_reporters)))] if not 
selected_reporter.start_profiling(True, True): selected_reporter = None - start_timestamp = timestamp() + start_timestamp = time.time() def stop_func(): if selected_reporter: selected_reporter.stop_profiling() - duration = timestamp() - start_timestamp + duration = time.time() - start_timestamp self.span_reporter.record_span(name, duration) self.span_active = False @@ -298,15 +312,24 @@ def stop_block_profiler(self): self._stop_profiler(self.block_reporter) + def start_tf_profiler(self): + self._start_profiler(self.tf_reporter) + + + def stop_tf_profiler(self): + self._stop_profiler(self.tf_reporter) + + def report(self): if not self.agent_started or self.get_option('auto_profiling'): return - self.configLoader.load(True) + self.config_loader.load(True) - self.cpuReporter.report(True); - self.allocationReporter.report(True); - self.asyncReporter.report(True); + self.cpu_reporter.report(True); + self.allocation_reporter.report(True); + self.block_reporter.report(True); + self.tf_reporter.report(True); self.messageQueue.flush(True) @@ -325,6 +348,7 @@ def destroy(self): self.cpu_reporter.stop() self.allocation_reporter.stop() self.block_reporter.stop() + self.tf_reporter.stop() self.error_reporter.stop() self.span_reporter.stop() self.process_reporter.stop() @@ -332,6 +356,7 @@ def destroy(self): self.cpu_reporter.destroy() self.allocation_reporter.destroy() self.block_reporter.destroy() + self.tf_reporter.destroy() self.error_reporter.destroy() self.span_reporter.destroy() self.process_reporter.destroy() @@ -421,7 +446,7 @@ def run_in_main_thread(self, func): -class TimerWraper(): +class TimerWraper(object): def __init__(self): self.timer = None self.cancel_lock = threading.Lock() diff --git a/stackimpact/api_request.py b/stackimpact/api_request.py index c5d0192..c2dd016 100644 --- a/stackimpact/api_request.py +++ b/stackimpact/api_request.py @@ -20,7 +20,7 @@ from urllib.parse import urlencode -class APIRequest: +class APIRequest(object): def __init__(self, agent): 
self.agent = agent @@ -42,11 +42,12 @@ def post(self, endpoint, payload): req_body = { 'runtime_type': 'python', 'runtime_version': '{0.major}.{0.minor}.{0.micro}'.format(sys.version_info), + 'runtime_path': sys.prefix, 'agent_version': self.agent.AGENT_VERSION, 'app_name': self.agent.get_option('app_name'), 'app_version': self.agent.get_option('app_version'), 'app_environment': self.agent.get_option('app_environment'), - 'host_name': self.agent.get_option('host_name', socket.gethostname()), + 'host_name': self.agent.get_option('host_name', host_name), 'process_id': os.getpid(), 'run_id': self.agent.run_id, 'run_ts': self.agent.run_ts, @@ -55,9 +56,9 @@ def post(self, endpoint, payload): } gzip_out = BytesIO() - with gzip.GzipFile(fileobj=gzip_out, mode="w") as f: - f.write(json.dumps(req_body).encode('utf-8')) - f.close() + with gzip.GzipFile(fileobj=gzip_out, mode="w") as out_file: + out_file.write(json.dumps(req_body).encode('utf-8')) + out_file.close() gzip_out_val = gzip_out.getvalue() if isinstance(gzip_out_val, str): diff --git a/stackimpact/config.py b/stackimpact/config.py index 2a8181b..17f8c5c 100644 --- a/stackimpact/config.py +++ b/stackimpact/config.py @@ -1,6 +1,6 @@ import threading -class Config: +class Config(object): def __init__(self, agent): self.agent = agent self.agent_enabled = False diff --git a/stackimpact/config_loader.py b/stackimpact/config_loader.py index 1879c1f..d122e8e 100644 --- a/stackimpact/config_loader.py +++ b/stackimpact/config_loader.py @@ -2,7 +2,7 @@ from .utils import timestamp -class ConfigLoader: +class ConfigLoader(object): LOAD_DELAY = 2 LOAD_INTERVAL = 120 @@ -24,7 +24,7 @@ def stop(self): self.load_timer = None - def load(self, with_interval = False): + def load(self, with_interval=False): now = timestamp() if with_interval and self.last_load_ts > now - self.LOAD_INTERVAL: return @@ -53,10 +53,12 @@ def load(self, with_interval = False): self.agent.cpu_reporter.start() self.agent.allocation_reporter.start() 
self.agent.block_reporter.start() + self.agent.tf_reporter.start() else: self.agent.cpu_reporter.stop() self.agent.allocation_reporter.stop() self.agent.block_reporter.stop() + self.agent.tf_reporter.stop() if self.agent.config.is_agent_enabled(): self.agent.error_reporter.start() diff --git a/stackimpact/frame.py b/stackimpact/frame.py index 5f4a984..f89204f 100644 --- a/stackimpact/frame.py +++ b/stackimpact/frame.py @@ -1,7 +1,7 @@ import re -class Frame: +class Frame(object): def __init__(self, func_name, filename, lineno): self.func_name = func_name @@ -14,9 +14,9 @@ def __init__(self, func_name, filename, lineno): def match(self, other): - return ((other.func_name == None or other.func_name == self.func_name) and - (other.filename == None or other.filename == self.filename) and - (other.lineno == None or other.lineno == self.lineno)) + return ((other.func_name is None or other.func_name == self.func_name) and + (other.filename is None or other.filename == self.filename) and + (other.lineno is None or other.lineno == self.lineno)) def __eq__(self, other): @@ -26,7 +26,7 @@ def __eq__(self, other): def __str__(self): if not self.cached_str: - if not self.lineno == None and self.lineno > 0: + if self.lineno is not None and self.lineno > 0: self.cached_str = '{0} ({1}:{2})'.format(self.func_name, self.filename, self.lineno) else: self.cached_str = '{0} ({1})'.format(self.func_name, self.filename) diff --git a/stackimpact/frame_cache.py b/stackimpact/frame_cache.py index 408a480..f6f1eb7 100644 --- a/stackimpact/frame_cache.py +++ b/stackimpact/frame_cache.py @@ -9,7 +9,7 @@ if runtime_info.GEVENT: import gevent -class FrameCache: +class FrameCache(object): MAX_CACHE_SIZE = 2500 def __init__(self, agent): diff --git a/stackimpact/message_queue.py b/stackimpact/message_queue.py index d9ed49b..2b96704 100644 --- a/stackimpact/message_queue.py +++ b/stackimpact/message_queue.py @@ -6,7 +6,7 @@ from .utils import timestamp, base64_encode -class MessageQueue: +class 
MessageQueue(object): FLUSH_INTERVAL = 5; MESSAGE_TTL = 10 * 60 @@ -32,20 +32,20 @@ def stop(self): def add(self, topic, message): - m = { + entry = { 'topic': topic, 'content': message, 'added_at': timestamp() } with self.queue_lock: - self.queue.append(m) + self.queue.append(entry) self.agent.log('Added message to the queue for topic: ' + topic) self.agent.log(message) - def flush(self, with_interval = False): + def flush(self, with_interval=False): if len(self.queue) == 0: return diff --git a/stackimpact/metric.py b/stackimpact/metric.py index 8587615..d3abc0c 100644 --- a/stackimpact/metric.py +++ b/stackimpact/metric.py @@ -6,7 +6,7 @@ from .utils import timestamp, generate_uuid, generate_sha1 -class Metric: +class Metric(object): TYPE_STATE = 'state' TYPE_COUNTER = 'counter' @@ -37,7 +37,9 @@ class Metric: NAME_UNCOLLECTED_ALLOCATIONS = 'Uncollected allocations' NAME_BLOCKING_CALL_TIMES = 'Blocking call times' NAME_HANDLED_EXCEPTIONS = 'Handled exceptions' - + NAME_TF_OPERATION_TIMES = 'TensorFlow operation times' + NAME_TF_OPERATION_ALLOCATION_RATE = 'TensorFlow operation allocation rate' + UNIT_NONE = '' UNIT_MILLISECOND = 'millisecond' UNIT_MICROSECOND = 'microsecond' @@ -141,15 +143,37 @@ def to_dict(self): class Breakdown: + TYPE_CALLGRAPH = 'callgraph' + TYPE_DEVICE = 'device' + TYPE_CALLSITE = 'callsite' + TYPE_OPERATION = 'operation' + TYPE_ERROR = 'error' + RESERVOIR_SIZE = 1000 - def __init__(self, name): + def __init__(self, name, typ = None): self.name = name + self.type = typ + self.metadata = dict() + self.children = dict() self.measurement = 0 self.num_samples = 0 self.reservoir = [] - self.children = {} - self._overhead = 0 + + + def set_type(self, typ): + self.type = typ + + + def add_metadata(self, key, value): + self.metadata[key] = value + + + def get_metadata(self, key): + if key in self.metadata: + return self.metadata[key] + else: + return None def find_child(self, name): @@ -160,21 +184,21 @@ def find_child(self, name): def 
max_child(self): - max = None + max_ch = None for name, child in self.children.items(): - if max == None or child.measurement > max.measurement: - max = child + if max_ch is None or child.measurement > max_ch.measurement: + max_ch = child - return max + return max_ch def min_child(self): - min = None + min_ch = None for name, child in self.children.items(): - if min == None or child.measurement < min.measurement: - min = child + if min_ch == None or child.measurement < min_ch.measurement: + min_ch = child - return min + return min_ch def add_child(self, child): @@ -194,28 +218,28 @@ def find_or_add_child(self, name): return child - def filter(self, from_level, min, max): - self.filter_level(1, from_level, min, max) + def filter(self, from_level, min_measurement, max_measurement): + self.filter_level(1, from_level, min_measurement, max_measurement) - def filter_level(self, current_level, from_level, min, max): + def filter_level(self, current_level, from_level, min_measurement, max_measurement): for name in list(self.children.keys()): child = self.children[name] - if current_level >= from_level and (child.measurement < min or child.measurement > max): + if current_level >= from_level and (child.measurement < min_measurement or child.measurement > max_measurement): del self.children[name] else: - child.filter_level(current_level + 1, from_level, min, max) + child.filter_level(current_level + 1, from_level, min_measurement, max_measurement) def depth(self): - max = 0 + max_depth = 0 for name, child in self.children.items(): - d = child.depth() - if d > max: - max = d + child_depth = child.depth() + if child_depth > max_depth: + max_depth = child_depth - return max + 1 + return max_depth + 1 def propagate(self): @@ -284,6 +308,14 @@ def normalize(self, factor): child.normalize(factor) + def scale(self, factor): + self.measurement = self.measurement * factor + self.num_samples = int(math.ceil(self.num_samples * factor)) + + for name, child in self.children.items(): + 
child.scale(factor) + + def round(self): self.measurement = round(self.measurement) @@ -305,6 +337,7 @@ def to_dict(self): node_map = { "name": self.name, + "metadata": self.metadata, "measurement": self.measurement, "num_samples": self.num_samples, "children": children_map, @@ -318,13 +351,13 @@ def __str__(self): def dump_level(self, level): - s = '' + dump_str = '' for i in range(0, level): - s += ' ' + dump_str += ' ' - s += '{0} - {1} ({2})\n'.format(self.name, self.measurement, self.num_samples) + dump_str += '{0} - {1} ({2})\n'.format(self.name, self.measurement, self.num_samples) for name, child in self.children.items(): - s += child.dump_level(level + 1) + dump_str += child.dump_level(level + 1) - return s + return dump_str diff --git a/stackimpact/profilers/allocation_profiler.py b/stackimpact/profilers/allocation_profiler.py index 56448cd..c75ad8f 100644 --- a/stackimpact/profilers/allocation_profiler.py +++ b/stackimpact/profilers/allocation_profiler.py @@ -14,7 +14,7 @@ import tracemalloc -class AllocationProfiler: +class AllocationProfiler(object): MAX_TRACEBACK_SIZE = 25 # number of frames MAX_MEMORY_OVERHEAD = 10 * 1e6 # 10MB MAX_PROFILED_ALLOCATIONS = 25 @@ -45,7 +45,7 @@ def setup(self): def reset(self): - self.profile = Breakdown('root') + self.profile = Breakdown('Allocation call graph', Breakdown.TYPE_CALLGRAPH) def start_profiler(self): @@ -120,4 +120,5 @@ def process_snapshot(self, snapshot, duration): frame_name = '{0}:{1}'.format(frame.filename, frame.lineno) current_node = current_node.find_or_add_child(frame_name) + current_node.set_type(Breakdown.TYPE_CALLSITE) current_node.increment(stat.size, stat.count) diff --git a/stackimpact/profilers/block_profiler.py b/stackimpact/profilers/block_profiler.py index d876001..81782a0 100644 --- a/stackimpact/profilers/block_profiler.py +++ b/stackimpact/profilers/block_profiler.py @@ -16,7 +16,7 @@ import gevent -class BlockProfiler: +class BlockProfiler(object): SAMPLING_RATE = 0.05 
MAX_TRACEBACK_SIZE = 25 # number of frames @@ -73,7 +73,7 @@ def destroy(self): def reset(self): - self.profile = Breakdown('root') + self.profile = Breakdown('Execution call graph', Breakdown.TYPE_CALLGRAPH) def start_profiler(self): @@ -99,7 +99,7 @@ def build_profile(self, duration): 'category': Metric.CATEGORY_BLOCK_PROFILE, 'name': Metric.NAME_BLOCKING_CALL_TIMES, 'unit': Metric.UNIT_MILLISECOND, - 'unit_interval': None, + 'unit_interval': 1, 'profile': self.profile }] @@ -119,6 +119,7 @@ def process_sample(self, signal_frame, sample_time, main_thread_id): current_node = self.profile for frame in reversed(stack): current_node = current_node.find_or_add_child(str(frame)) + current_node.set_type(Breakdown.TYPE_CALLSITE) current_node.increment(sample_time, 1) thread_id, thread_frame, stack = None, None, None @@ -126,8 +127,6 @@ def process_sample(self, signal_frame, sample_time, main_thread_id): items = None current_frames = None - self.profile._overhead += (time.clock() - start) - def recover_stack(self, thread_frame): stack = [] diff --git a/stackimpact/profilers/cpu_profiler.py b/stackimpact/profilers/cpu_profiler.py index f648deb..dacce79 100644 --- a/stackimpact/profilers/cpu_profiler.py +++ b/stackimpact/profilers/cpu_profiler.py @@ -14,7 +14,7 @@ -class CPUProfiler: +class CPUProfiler(object): SAMPLING_RATE = 0.01 MAX_TRACEBACK_SIZE = 25 # number of frames @@ -56,7 +56,7 @@ def _sample(signum, signal_frame): def reset(self): - self.profile = Breakdown('root') + self.profile = Breakdown('Execution call graph', Breakdown.TYPE_CALLGRAPH) def start_profiler(self): @@ -133,5 +133,6 @@ def update_profile(self, profile, stack): for frame in reversed(stack): current_node = current_node.find_or_add_child(str(frame)) + current_node.set_type(Breakdown.TYPE_CALLSITE) current_node.increment(0, 1) diff --git a/stackimpact/profilers/tf_profiler.py b/stackimpact/profilers/tf_profiler.py new file mode 100644 index 0000000..de3d189 --- /dev/null +++ 
b/stackimpact/profilers/tf_profiler.py @@ -0,0 +1,246 @@ +from __future__ import division + +import os +import sys +import time +import threading +import re +import signal +from functools import wraps + +from ..runtime import min_version, runtime_info, patch, unpatch +from ..metric import Metric +from ..metric import Breakdown +from ..frame import Frame + +if runtime_info.GEVENT: + import gevent + + +class Sampler(object): + + SAMPLING_INTERVAL = 1 + MAX_SAMPLED_RUNS = 10 + MAX_SAMPLED_DURATION = 0.1 + + def __init__(self, agent): + self.last_update_ts = time.time() + self.count = 0 + self.duration = 0 + + + def sample(self): + if self.last_update_ts < time.time() - self.SAMPLING_INTERVAL: + self.last_update_ts = time.time() + self.count = 0 + self.duration = 0 + return True + elif self.count < self.MAX_SAMPLED_RUNS and self.duration < self.MAX_SAMPLED_DURATION: + return True + + return False + + + def update(self, run_duration): + self.count = self.count + 1 + self.duration = self.duration + run_duration + + + +class TFProfiler(object): + + def __init__(self, agent): + self.agent = agent + self.ready = False + self.cpu_profile = None + self.allocation_profile = None + self.profile_lock = threading.Lock() + self.profiler_active = False + self.sampler = Sampler(agent) + self.sampling_active = False + self.run_metadata = None + self.total_run_duration = None + self.sampled_run_duration = None + + + def setup(self): + if self.agent.get_option('tf_profiler_disabled'): + return + + try: + import tensorflow as tf + except Exception: + self.agent.log('TensorFlow not found.') + return + + try: + def before_run(args, kwargs): + session = args[0] + data = None + + if self.profiler_active: + data = dict() + data['start_ts'] = time.time() + + if not self.sampling_active and self.sampler.sample(): + self.sampling_active = True + self.agent.log('Tracing TensorFlow session.') + + if 'options' in kwargs: + kwargs['options'].trace_level = tf.RunOptions.FULL_TRACE + else: + 
kwargs['options'] = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE) + + if 'run_metadata' not in kwargs: + kwargs['run_metadata'] = tf.RunMetadata() + self.run_metadata = kwargs['run_metadata'] + + return args, kwargs, data + + def after_run(args, kwargs, ret, data): + session = args[0] + + if data and 'start_ts' in data: + run_duration = time.time() - data['start_ts'] + self.total_run_duration = self.total_run_duration + run_duration + if self.sampling_active: + self.sampled_run_duration = self.sampled_run_duration + run_duration + self.sampler.update(run_duration) + self.update_cpu_profile(session.graph, self.run_metadata.step_stats.dev_stats) + self.update_allocation_profile(session.graph, self.run_metadata.step_stats.dev_stats) + self.sampling_active = False + + if not patch(tf.Session, 'run', before_run, after_run): + self.agent.log('Error wrapping TensorFlow session.') + + except Exception: + self.agent.log('Error setting up TensorFlow session wrappers.') + self.agent.exception() + return + + self.ready = True + + + def destroy(self): + if not self.ready: + return + + + def reset(self): + self.cpu_profile = Breakdown('Operation definition call graph', Breakdown.TYPE_CALLGRAPH) + self.allocation_profile = Breakdown('Operation definition call graph', Breakdown.TYPE_CALLGRAPH) + self.total_run_duration = 0 + self.sampled_run_duration = 0 + + + def start_profiler(self): + self.agent.log('Activating TensorFlow profiler.') + + self.profiler_active = True + + + def stop_profiler(self): + self.profiler_active = False + + self.agent.log('Deactivating TensorFlow profiler.') + + + def build_profile(self, duration): + with self.profile_lock: + if self.sampled_run_duration > 0: + scale = self.total_run_duration / self.sampled_run_duration + + self.cpu_profile.scale(scale) + self.cpu_profile.normalize(duration) + self.cpu_profile.propagate() + + self.allocation_profile.scale(scale) + self.allocation_profile.normalize(duration) + self.allocation_profile.propagate() + 
self.allocation_profile.floor() + self.allocation_profile.filter(2, 1000, float("inf")) + + return [{ + 'category': Metric.CATEGORY_CPU_PROFILE, + 'name': Metric.NAME_TF_OPERATION_TIMES, + 'unit': Metric.UNIT_MILLISECOND, + 'unit_interval': 1, + 'profile': self.cpu_profile + }, + { + 'category': Metric.CATEGORY_MEMORY_PROFILE, + 'name': Metric.NAME_TF_OPERATION_ALLOCATION_RATE, + 'unit': Metric.UNIT_BYTE, + 'unit_interval': 1, + 'profile': self.allocation_profile + }] + + + def update_cpu_profile(self, graph, dev_stats): + with self.profile_lock: + op_index = dict() + for op in graph.get_operations(): + op_index[op.name] = op + + for device_index, device_stats in enumerate(dev_stats): + device_node = self.cpu_profile.find_or_add_child(device_stats.device) + + for node_stats in device_stats.node_stats: + if node_stats.node_name == '_SOURCE' or node_stats.node_name == '_SINK': + continue + + duration = (node_stats.op_end_rel_micros - node_stats.op_start_rel_micros) / 1000 + if duration == 0: + continue + + if node_stats.node_name in op_index: + op = op_index[node_stats.node_name] + tb = op.traceback + if tb: + current_node = device_node + for tb_frame in tb: + frame = Frame(tb_frame[2], tb_frame[0], tb_frame[1]) + current_node = current_node.find_or_add_child(str(frame)) + current_node.set_type(Breakdown.TYPE_CALLSITE) + + current_node = current_node.find_or_add_child(node_stats.node_name) + current_node.add_metadata('Type', op.type) + current_node.set_type(Breakdown.TYPE_OPERATION) + current_node.increment(duration, 1) + + + + def update_allocation_profile(self, graph, dev_stats): + with self.profile_lock: + op_index = dict() + for op in graph.get_operations(): + op_index[op.name] = op + + for device_index, device_stats in enumerate(dev_stats): + device_node = self.allocation_profile.find_or_add_child(device_stats.device) + + for node_stats in device_stats.node_stats: + if node_stats.node_name == '_SOURCE' or node_stats.node_name == '_SINK': + continue + + num_bytes 
= 0 + for index, output in enumerate(node_stats.output): + allocation = output.tensor_description.allocation_description + num_bytes = num_bytes + allocation.requested_bytes + if num_bytes == 0: + continue + + if node_stats.node_name in op_index: + op = op_index[node_stats.node_name] + tb = op.traceback + if tb: + current_node = device_node + for tb_frame in tb: + frame = Frame(tb_frame[2], tb_frame[0], tb_frame[1]) + current_node = current_node.find_or_add_child(str(frame)) + current_node.set_type(Breakdown.TYPE_CALLSITE) + + current_node = current_node.find_or_add_child(node_stats.node_name) + current_node.add_metadata('Type', op.type) + current_node.set_type(Breakdown.TYPE_OPERATION) + current_node.increment(num_bytes, 1) diff --git a/stackimpact/reporters/error_reporter.py b/stackimpact/reporters/error_reporter.py index ae6ecb7..e66de83 100644 --- a/stackimpact/reporters/error_reporter.py +++ b/stackimpact/reporters/error_reporter.py @@ -10,13 +10,13 @@ from ..frame import Frame -class ErrorReporter: +class ErrorReporter(object): MAX_QUEUED_EXC = 100 def __init__(self, agent): self.agent = agent - self.setup_done = False + self.ready = False self.started = False self.process_timer = None self.report_timer = None @@ -30,7 +30,7 @@ def setup(self): if self.agent.get_option('error_profiler_disabled'): return - def _exc_info(ret): + def _exc_info(args, kwargs, ret, data): try: if not self.agent.agent_started or self.agent.agent_destroyed: return @@ -43,11 +43,11 @@ def _exc_info(ret): patch(sys, 'exc_info', None, _exc_info) - self.setup_done = True + self.ready = True def destroy(self): - if not self.setup_done: + if not self.ready: return unpatch(sys, 'exc_info') @@ -55,7 +55,7 @@ def destroy(self): def reset(self): with self.profile_lock: - self.profile = Breakdown('root') + self.profile = Breakdown('Error call graph', Breakdown.TYPE_ERROR) self.added_exceptions = {} @@ -91,6 +91,8 @@ def stop(self): def report(self): with self.profile_lock: + 
self.profile.propagate() + metric = Metric(self.agent, Metric.TYPE_PROFILE, Metric.CATEGORY_ERROR_PROFILE, Metric.NAME_HANDLED_EXCEPTIONS, Metric.UNIT_NONE) measurement = metric.create_measurement(Metric.TRIGGER_TIMER, self.profile.measurement, 60, self.profile) self.agent.message_queue.add('metric', metric.to_dict()) @@ -146,12 +148,10 @@ def update_profile(self, exc): return current_node = self.profile - current_node.increment(1, 0) for frame in reversed(stack): current_node = current_node.find_or_add_child(str(frame)) - current_node.increment(1, 0) - + current_node.set_type(Breakdown.TYPE_CALLSITE) message = '' if exc_type: @@ -171,4 +171,5 @@ def update_profile(self, exc): else: message_node = current_node.find_or_add_child('Other') + message_node.set_type(Breakdown.TYPE_ERROR) message_node.increment(1, 0) diff --git a/stackimpact/reporters/process_reporter.py b/stackimpact/reporters/process_reporter.py index dc3c195..5a1f0c7 100644 --- a/stackimpact/reporters/process_reporter.py +++ b/stackimpact/reporters/process_reporter.py @@ -10,7 +10,7 @@ from ..runtime import runtime_info, min_version, read_cpu_time, read_max_rss, read_current_rss, read_vm_size from ..metric import Metric -class ProcessReporter: +class ProcessReporter(object): def __init__(self, agent): self.agent = agent diff --git a/stackimpact/reporters/profile_reporter.py b/stackimpact/reporters/profile_reporter.py index 11579bf..57a9e9d 100644 --- a/stackimpact/reporters/profile_reporter.py +++ b/stackimpact/reporters/profile_reporter.py @@ -14,7 +14,7 @@ from ..frame import Frame -class ProfilerConfig: +class ProfilerConfig(object): def __init__(self): self.log_prefix = None @@ -161,7 +161,7 @@ def stop_profiling(self): self.agent.log(self.config.log_prefix + ': stopped.') - def report(self, with_interval = False): + def report(self, with_interval=False): if not self.started: return diff --git a/stackimpact/reporters/span_reporter.py b/stackimpact/reporters/span_reporter.py index 15c213c..0db0c32 
100644 --- a/stackimpact/reporters/span_reporter.py +++ b/stackimpact/reporters/span_reporter.py @@ -10,7 +10,8 @@ from ..frame import Frame -class SpanReporter: +class SpanReporter(object): + MAX_QUEUED_EXC = 100 diff --git a/stackimpact/runtime.py b/stackimpact/runtime.py index 3094383..0df5a41 100644 --- a/stackimpact/runtime.py +++ b/stackimpact/runtime.py @@ -11,7 +11,7 @@ pass -class runtime_info: +class runtime_info(object): OS_LINUX = (sys.platform.startswith('linux')) OS_DARWIN = (sys.platform == 'darwin') OS_WIN = (sys.platform == 'win32') @@ -27,11 +27,11 @@ class runtime_info: pass -VmRSSRe = re.compile('VmRSS:\s+(\d+)\s+kB') -VmSizeRe = re.compile('VmSize:\s+(\d+)\s+kB') +VM_RSS_REGEXP = re.compile('VmRSS:\s+(\d+)\s+kB') +VM_SIZE_REGEXP = re.compile('VmSize:\s+(\d+)\s+kB') -def min_version(major, minor = 0): +def min_version(major, minor=0): return (sys.version_info.major == major and sys.version_info.minor >= minor) @@ -60,9 +60,9 @@ def read_current_rss(): except Exception: return None - m = VmRSSRe.search(output) - if m: - return int(float(m.group(1))) + match = VM_RSS_REGEXP.search(output) + if match: + return int(float(match.group(1))) return None @@ -78,38 +78,42 @@ def read_vm_size(): except Exception: return None - m = VmSizeRe.search(output) - if m: - return int(float(m.group(1))) + match = VM_SIZE_REGEXP.search(output) + if match: + return int(float(match.group(1))) return None def patch(obj, func_name, before_func, after_func): if not hasattr(obj, func_name): - return + return False target_func = getattr(obj, func_name) # already patched if hasattr(target_func, '__stackimpact_orig__'): - return + return True @wraps(target_func) - def wrapper(*args, **kwds): + def wrapper(*args, **kwargs): + data = None + if before_func: - before_func(*args, **kwds) + args, kwargs, data = before_func(args, kwargs) - ret = target_func(*args, **kwds) + ret = target_func(*args, **kwargs) if after_func: - after_func(ret) + after_func(args, kwargs, ret, data) 
return ret wrapper.__orig__ = target_func setattr(obj, func_name, wrapper) + return True + def unpatch(obj, func_name): if not hasattr(obj, func_name): @@ -122,7 +126,7 @@ def unpatch(obj, func_name): setattr(obj, func_name, getattr(wrapper, '__stackimpact_orig__')) -def register_signal(signal_number, handler_func, once = False): +def register_signal(signal_number, handler_func, once=False): prev_handler = None def _handler(signum, frame): diff --git a/stackimpact/utils.py b/stackimpact/utils.py index 1ed5258..caf7109 100644 --- a/stackimpact/utils.py +++ b/stackimpact/utils.py @@ -27,7 +27,7 @@ def generate_uuid(): def generate_sha1(text): - h = hashlib.sha1() - h.update(text.encode('utf-8')) - return h.hexdigest() + sha1_hash = hashlib.sha1() + sha1_hash.update(text.encode('utf-8')) + return sha1_hash.hexdigest() diff --git a/tests/agent_test.py b/tests/agent_test.py index 490469e..2e59c05 100644 --- a/tests/agent_test.py +++ b/tests/agent_test.py @@ -187,5 +187,43 @@ def blocking_call(): + + def test_tf_profile(self): + stackimpact._agent = None + agent = stackimpact.start( + dashboard_address = 'http://localhost:5001', + agent_key = 'key1', + app_name = 'TestPythonApp', + auto_profiling = False, + debug = True + ) + + if not agent.tf_reporter.profiler.ready: + return + + messages = [] + def add_mock(topic, message): + messages.append(message) + agent.message_queue.add = add_mock + + agent.start_tf_profiler() + + import tensorflow as tf + + x = tf.random_normal([1000, 1000]) + y = tf.random_normal([1000, 1000]) + res = tf.matmul(x, y) + + with tf.Session() as sess: + sess.run(res) + + agent.stop_tf_profiler() + + self.assertTrue('test_tf_profile' in str(messages)) + + agent.destroy() + + + if __name__ == '__main__': unittest.main() diff --git a/tests/metric_test.py b/tests/metric_test.py index 53cc3e0..9d112ec 100644 --- a/tests/metric_test.py +++ b/tests/metric_test.py @@ -33,7 +33,7 @@ def test_counter_metric(self): - def test_breakdown_filter(self): + def 
test_profile_filter(self): root = Breakdown('root') root.measurement = 10 @@ -56,7 +56,7 @@ def test_breakdown_filter(self): self.assertFalse(child2.find_child('child2child1')) - def test_breakdown_depth(self): + def test_profile_depth(self): root = Breakdown("root") child1 = Breakdown("child1") @@ -73,7 +73,7 @@ def test_breakdown_depth(self): self.assertEqual(child2.depth(), 2) - def test_breakdown_p95(self): + def test_profile_p95(self): root = Breakdown("root") child1 = Breakdown("child1") @@ -94,7 +94,7 @@ def test_breakdown_p95(self): self.assertEqual(root.measurement, 6.5) - def test_breakdown_p95_big(self): + def test_profile_p95_big(self): root = Breakdown("root") for i in range(0, 10000): diff --git a/tests/profilers/tf_profiler_test.py b/tests/profilers/tf_profiler_test.py new file mode 100644 index 0000000..abbafef --- /dev/null +++ b/tests/profilers/tf_profiler_test.py @@ -0,0 +1,67 @@ + +import time +import unittest +import random +import threading +import sys +import traceback + +import stackimpact +from stackimpact.runtime import runtime_info + + +# virtualenv -p python3 venv_tf +# source venv_tf/bin/activate +# pip install tensorflow +# pip install keras + +class TFProfilerTestCase(unittest.TestCase): + + def test_record_tf_profile(self): + stackimpact._agent = None + agent = stackimpact.start( + dashboard_address = 'http://localhost:5001', + agent_key = 'key1', + app_name = 'TestPythonApp', + auto_profiling = False, + debug = True + ) + + agent.tf_reporter.profiler.reset() + + if not agent.tf_reporter.profiler.ready: + return + + def record(): + agent.tf_reporter.profiler.start_profiler() + time.sleep(1) + agent.tf_reporter.profiler.stop_profiler() + + + record_t = threading.Thread(target=record) + record_t.start() + + import tensorflow as tf + + x = tf.random_normal([1000, 1000]) + y = tf.random_normal([1000, 1000]) + res = tf.matmul(x, y) + + with tf.Session() as sess: + sess.run(res) + + record_t.join() + + profile = 
agent.tf_reporter.profiler.build_profile(1)[0]['profile'].to_dict() + #print(profile) + self.assertTrue('test_record_tf_profile' in str(profile)) + + profile = agent.tf_reporter.profiler.build_profile(1)[1]['profile'].to_dict() + #print(profile) + self.assertTrue('test_record_tf_profile' in str(profile)) + + agent.destroy() + + +if __name__ == '__main__': + unittest.main() From 6e67111acb7ebf98d0029f333afa4574de141e9a Mon Sep 17 00:00:00 2001 From: Dmitri Melikyan Date: Tue, 25 Sep 2018 11:26:07 +0000 Subject: [PATCH 23/26] remove report method --- README.md | 4 ++-- examples/demo/flask_app.py | 3 ++- setup.py | 2 +- stackimpact/agent.py | 22 +++++++------------- stackimpact/profilers/allocation_profiler.py | 2 +- tests/agent_test.py | 2 -- 6 files changed, 13 insertions(+), 22 deletions(-) diff --git a/README.md b/README.md index a031345..63e8956 100644 --- a/README.md +++ b/README.md @@ -20,7 +20,7 @@ Learn more on the [features](https://stackimpact.com/features/) page (with scree #### How it works -The StackImpact profiler agent is imported into a program and used as a normal package. When the program runs, various sampling profilers are started and stopped automatically by the agent and/or programmatically using the agent methods. The agent periodically reports recorded profiles and metrics to the StackImpact Dashboard. If an application has multiple processes, also referred to as workers, instances or nodes, only one or two processes will have active agents at any point of time. +The StackImpact profiler agent is imported into a program and used as a normal package. When the program runs, various sampling profilers are started and stopped automatically by the agent and/or programmatically using the agent methods. The agent periodically reports recorded profiles and metrics to the StackImpact Dashboard. The agent can also operate in manual mode, which should be used in development only. 
#### Documentation @@ -165,4 +165,4 @@ To enable debug logging, add `debug = True` to startup options. If the debug log ## Overhead -The agent overhead is measured to be less than 1% for applications under high load. For applications that are horizontally scaled to multiple processes, StackImpact agents are only active on a small subset of the processes at any point of time, therefore the total overhead is much lower. +The agent overhead is measured to be less than 1% for applications under high load. diff --git a/examples/demo/flask_app.py b/examples/demo/flask_app.py index 999695b..30030a4 100644 --- a/examples/demo/flask_app.py +++ b/examples/demo/flask_app.py @@ -1,4 +1,4 @@ -#env AGENT_KEY=agnetkeyhere FLASK_APP=examples/flask_app.py flask run -p 5010 +#env AGENT_KEY=agnetkeyhere FLASK_APP=examples/demo/flask_app.py flask run -p 5010 from __future__ import print_function @@ -28,6 +28,7 @@ # StackImpact agent initialization agent = stackimpact.start( agent_key = os.environ['AGENT_KEY'], + dashboard_address = os.environ['DASHBOARD_ADDRESS'], app_name = 'ExamplePythonFlaskApp', app_version = '1.0.0', debug = True) diff --git a/setup.py b/setup.py index b08b788..653ea42 100644 --- a/setup.py +++ b/setup.py @@ -6,7 +6,7 @@ def read(fname): setup( name = 'stackimpact', - version = '1.2.3', + version = '1.2.4', description = 'StackImpact Python Profiler', long_description = read('README.rst'), author = 'StackImpact', diff --git a/stackimpact/agent.py b/stackimpact/agent.py index 3133feb..1fe84cf 100644 --- a/stackimpact/agent.py +++ b/stackimpact/agent.py @@ -51,7 +51,7 @@ def __exit__(self, exc_type, exc_value, traceback): class Agent(object): - AGENT_VERSION = "1.2.3" + AGENT_VERSION = "1.2.4" SAAS_DASHBOARD_ADDRESS = "https://agent-api.stackimpact.com" def __init__(self, **kwargs): @@ -261,6 +261,12 @@ def stop_func(): duration = time.time() - start_timestamp self.span_reporter.record_span(name, duration) + if not self.get_option('auto_profiling'): + 
self.config_loader.load(True) + if selected_reporter: + selected_reporter.report(True); + self.message_queue.flush(True) + self.span_active = False return Span(stop_func) @@ -320,20 +326,6 @@ def stop_tf_profiler(self): self._stop_profiler(self.tf_reporter) - def report(self): - if not self.agent_started or self.get_option('auto_profiling'): - return - - self.config_loader.load(True) - - self.cpu_reporter.report(True); - self.allocation_reporter.report(True); - self.block_reporter.report(True); - self.tf_reporter.report(True); - - self.messageQueue.flush(True) - - def destroy(self): if not self.agent_started: self.log('Agent has not been started') diff --git a/stackimpact/profilers/allocation_profiler.py b/stackimpact/profilers/allocation_profiler.py index c75ad8f..a852593 100644 --- a/stackimpact/profilers/allocation_profiler.py +++ b/stackimpact/profilers/allocation_profiler.py @@ -91,7 +91,7 @@ def build_profile(self, duration): 'category': Metric.CATEGORY_MEMORY_PROFILE, 'name': Metric.NAME_UNCOLLECTED_ALLOCATIONS, 'unit': Metric.UNIT_BYTE, - 'unit_interval': None, + 'unit_interval': 1, 'profile': self.profile }] diff --git a/tests/agent_test.py b/tests/agent_test.py index 2e59c05..e8777be 100644 --- a/tests/agent_test.py +++ b/tests/agent_test.py @@ -50,7 +50,6 @@ def test_profile(self): dashboard_address = 'http://localhost:5001', agent_key = 'key1', app_name = 'TestPythonApp', - auto_profiling = False, debug = True ) @@ -77,7 +76,6 @@ def test_with_profile(self): dashboard_address = 'http://localhost:5001', agent_key = 'key1', app_name = 'TestPythonApp', - auto_profiling = False, debug = True ) From f932af2b1c74802bb02ade964315cf0140aa73da Mon Sep 17 00:00:00 2001 From: Dmitri Melikyan Date: Tue, 14 May 2019 16:14:20 +0200 Subject: [PATCH 24/26] remove deprecated functionality --- README.md | 11 -- README.rst | 21 +-- examples/tensorflow/app.py | 32 ---- examples/tensorflow/manual.py | 23 --- examples/tensorflow/manual_keras.py | 35 ---- setup.py | 2 +- 
stackimpact/agent.py | 29 +--- stackimpact/profilers/tf_profiler.py | 246 --------------------------- tests/agent_test.py | 41 +---- tests/profilers/tf_profiler_test.py | 67 -------- 10 files changed, 8 insertions(+), 499 deletions(-) delete mode 100644 examples/tensorflow/app.py delete mode 100644 examples/tensorflow/manual.py delete mode 100644 examples/tensorflow/manual_keras.py delete mode 100644 stackimpact/profilers/tf_profiler.py delete mode 100644 tests/profilers/tf_profiler_test.py diff --git a/README.md b/README.md index 63e8956..6a519af 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,6 @@ StackImpact is a production-grade performance profiler built for both production #### Features * Continuous hot spot profiling of CPU usage, memory allocation and blocking calls. -* TensorFlow profiling. * Error and exception monitoring. * Health monitoring including CPU, memory, garbage collection and other runtime metrics. * Alerts on profile anomalies. @@ -143,16 +142,6 @@ agent.start_allocation_profiler(); agent.stop_allocation_profiler(); ``` -```python -# Start TensorFlow profiler. -agent.start_tf_profiler(); -``` - -```python -# Stop TensorFlow profiler and report the recorded profile to the Dashboard. -agent.stop_tf_profiler(); -``` - #### Analyzing performance data in the Dashboard Once your application is restarted, you can start observing continuous CPU, memory, I/O, and other hot spot profiles, execution bottlenecks as well as process metrics in the [Dashboard](https://dashboard.stackimpact.com/). diff --git a/README.rst b/README.rst index afdb03f..a98a63b 100644 --- a/README.rst +++ b/README.rst @@ -22,7 +22,6 @@ Features - Continuous hot spot profiling of CPU usage, memory allocation and blocking calls. -- TensorFlow profiling. - Error and exception monitoring. - Health monitoring including CPU, memory, garbage collection and other runtime metrics. @@ -39,9 +38,8 @@ The StackImpact profiler agent is imported into a program and used as a normal package. 
When the program runs, various sampling profilers are started and stopped automatically by the agent and/or programmatically using the agent methods. The agent periodically reports recorded -profiles and metrics to the StackImpact Dashboard. If an application has -multiple processes, also referred to as workers, instances or nodes, -only one or two processes will have active agents at any point of time. +profiles and metrics to the StackImpact Dashboard. The agent can also +operate in manual mode, which should be used in development only. Documentation ^^^^^^^^^^^^^ @@ -193,16 +191,6 @@ disabled with ``auto_profiling: False``. # Stop heap allocation profiler and report the recorded profile to the Dashboard. agent.stop_allocation_profiler(); -.. code:: python - - # Start TensorFlow profiler. - agent.start_tf_profiler(); - -.. code:: python - - # Stop TensorFlow profiler and report the recorded profile to the Dashboard. - agent.stop_tf_profiler(); - Analyzing performance data in the Dashboard ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -222,7 +210,4 @@ Overhead -------- The agent overhead is measured to be less than 1% for applications under -high load. For applications that are horizontally scaled to multiple -processes, StackImpact agents are only active on a small subset of the -processes at any point of time, therefore the total overhead is much -lower. +high load. 
diff --git a/examples/tensorflow/app.py b/examples/tensorflow/app.py deleted file mode 100644 index 9b301bd..0000000 --- a/examples/tensorflow/app.py +++ /dev/null @@ -1,32 +0,0 @@ -from __future__ import print_function -import random -import time -import sys -import threading -import tensorflow as tf -sys.path.append(".") -import stackimpact - - -agent = stackimpact.start( - agent_key = 'agent key here', - app_name = 'MyTensorFlowScript') - - -def handle_some_event(): - with agent.profile(): - tf.reset_default_graph() - x = tf.random_normal([1000, 1000]) - y = tf.random_normal([1000, 1000]) - res = tf.matmul(x, y) - - with tf.Session() as sess: - sess.run(res) - - -# Simulate events -while True: - handle_some_event() - time.sleep(2) - - diff --git a/examples/tensorflow/manual.py b/examples/tensorflow/manual.py deleted file mode 100644 index 1b25517..0000000 --- a/examples/tensorflow/manual.py +++ /dev/null @@ -1,23 +0,0 @@ -from __future__ import print_function -import sys -import tensorflow as tf -sys.path.append(".") -import stackimpact - - -agent = stackimpact.start( - agent_key = 'agent key here', - app_name = 'MyTensorFlowScript', - auto_profiling = False) - -agent.start_tf_profiler() - -x = tf.random_normal([1000, 1000]) -y = tf.random_normal([1000, 1000]) -res = tf.matmul(x, y) - -with tf.Session() as sess: - sess.run(res) - - -agent.stop_tf_profiler() diff --git a/examples/tensorflow/manual_keras.py b/examples/tensorflow/manual_keras.py deleted file mode 100644 index dbf5e7c..0000000 --- a/examples/tensorflow/manual_keras.py +++ /dev/null @@ -1,35 +0,0 @@ -from keras.models import Sequential -from keras.layers.core import Dense, Dropout, Activation -from keras.optimizers import SGD -import numpy as np -import sys - -sys.path.append(".") -import stackimpact - - -agent = stackimpact.start( - agent_key = 'agent key here', - app_name = 'MyKerasScript', - auto_profiling = False) - - -agent.start_tf_profiler() - -X = np.array([[0,0],[0,1],[1,0],[1,1]]) -y = 
np.array([[0],[1],[1],[0]]) - -model = Sequential() -model.add(Dense(8, input_dim=2)) -model.add(Activation('tanh')) -model.add(Dense(1)) -model.add(Activation('sigmoid')) - -sgd = SGD(lr=0.1) -model.compile(loss='binary_crossentropy', optimizer=sgd) - -model.fit(X, y, batch_size=1, nb_epoch=1000) -print(model.predict_proba(X)) - -agent.stop_tf_profiler() - diff --git a/setup.py b/setup.py index 653ea42..9989e3f 100644 --- a/setup.py +++ b/setup.py @@ -6,7 +6,7 @@ def read(fname): setup( name = 'stackimpact', - version = '1.2.4', + version = '1.2.6', description = 'StackImpact Python Profiler', long_description = read('README.rst'), author = 'StackImpact', diff --git a/stackimpact/agent.py b/stackimpact/agent.py index 1fe84cf..a30fcef 100644 --- a/stackimpact/agent.py +++ b/stackimpact/agent.py @@ -26,7 +26,6 @@ from .profilers.cpu_profiler import CPUProfiler from .profilers.allocation_profiler import AllocationProfiler from .profilers.block_profiler import BlockProfiler -from .profilers.tf_profiler import TFProfiler class Span(object): @@ -34,6 +33,8 @@ class Span(object): def __init__(self, stop_func = None): if stop_func: self.stop_func = stop_func + else: + self.stop_func = None def stop(self): @@ -51,7 +52,7 @@ def __exit__(self, exc_type, exc_value, traceback): class Agent(object): - AGENT_VERSION = "1.2.4" + AGENT_VERSION = "1.2.6" SAAS_DASHBOARD_ADDRESS = "https://agent-api.stackimpact.com" def __init__(self, **kwargs): @@ -100,15 +101,6 @@ def __init__(self, **kwargs): config.report_interval = 120 self.block_reporter = ProfileReporter(self, BlockProfiler(self), config) - config = ProfilerConfig() - config.log_prefix = 'TensorFlow profiler' - config.max_profile_duration = 20 - config.max_span_duration = 5 - config.max_span_count = 30 - config.span_interval = 20 - config.report_interval = 120 - self.tf_reporter = ProfileReporter(self, TFProfiler(self), config) - self.options = None @@ -160,7 +152,6 @@ def start(self, **kwargs): self.cpu_reporter.setup() 
self.allocation_reporter.setup() self.block_reporter.setup() - self.tf_reporter.setup() self.span_reporter.setup() self.error_reporter.setup() self.process_reporter.setup() @@ -211,7 +202,6 @@ def enable(self): self.cpu_reporter.start() self.allocation_reporter.start() self.block_reporter.start() - self.tf_reporter.start() self.span_reporter.start() self.error_reporter.start() self.process_reporter.start() @@ -223,7 +213,6 @@ def disable(self): self.cpu_reporter.stop() self.allocation_reporter.stop() self.block_reporter.stop() - self.tf_reporter.stop() self.span_reporter.stop() self.error_reporter.stop() self.process_reporter.stop() @@ -244,8 +233,6 @@ def profile(self, name='Default'): active_reporters.append(self.allocation_reporter) if self.block_reporter.started: active_reporters.append(self.block_reporter) - if self.tf_reporter.started: - active_reporters.append(self.tf_reporter) if len(active_reporters) > 0: selected_reporter = active_reporters[int(math.floor(random.random() * len(active_reporters)))] @@ -318,14 +305,6 @@ def stop_block_profiler(self): self._stop_profiler(self.block_reporter) - def start_tf_profiler(self): - self._start_profiler(self.tf_reporter) - - - def stop_tf_profiler(self): - self._stop_profiler(self.tf_reporter) - - def destroy(self): if not self.agent_started: self.log('Agent has not been started') @@ -340,7 +319,6 @@ def destroy(self): self.cpu_reporter.stop() self.allocation_reporter.stop() self.block_reporter.stop() - self.tf_reporter.stop() self.error_reporter.stop() self.span_reporter.stop() self.process_reporter.stop() @@ -348,7 +326,6 @@ def destroy(self): self.cpu_reporter.destroy() self.allocation_reporter.destroy() self.block_reporter.destroy() - self.tf_reporter.destroy() self.error_reporter.destroy() self.span_reporter.destroy() self.process_reporter.destroy() diff --git a/stackimpact/profilers/tf_profiler.py b/stackimpact/profilers/tf_profiler.py deleted file mode 100644 index de3d189..0000000 --- 
a/stackimpact/profilers/tf_profiler.py +++ /dev/null @@ -1,246 +0,0 @@ -from __future__ import division - -import os -import sys -import time -import threading -import re -import signal -from functools import wraps - -from ..runtime import min_version, runtime_info, patch, unpatch -from ..metric import Metric -from ..metric import Breakdown -from ..frame import Frame - -if runtime_info.GEVENT: - import gevent - - -class Sampler(object): - - SAMPLING_INTERVAL = 1 - MAX_SAMPLED_RUNS = 10 - MAX_SAMPLED_DURATION = 0.1 - - def __init__(self, agent): - self.last_update_ts = time.time() - self.count = 0 - self.duration = 0 - - - def sample(self): - if self.last_update_ts < time.time() - self.SAMPLING_INTERVAL: - self.last_update_ts = time.time() - self.count = 0 - self.duration = 0 - return True - elif self.count < self.MAX_SAMPLED_RUNS and self.duration < self.MAX_SAMPLED_DURATION: - return True - - return False - - - def update(self, run_duration): - self.count = self.count + 1 - self.duration = self.duration + run_duration - - - -class TFProfiler(object): - - def __init__(self, agent): - self.agent = agent - self.ready = False - self.cpu_profile = None - self.allocation_profile = None - self.profile_lock = threading.Lock() - self.profiler_active = False - self.sampler = Sampler(agent) - self.sampling_active = False - self.run_metadata = None - self.total_run_duration = None - self.sampled_run_duration = None - - - def setup(self): - if self.agent.get_option('tf_profiler_disabled'): - return - - try: - import tensorflow as tf - except Exception: - self.agent.log('TensorFlow not found.') - return - - try: - def before_run(args, kwargs): - session = args[0] - data = None - - if self.profiler_active: - data = dict() - data['start_ts'] = time.time() - - if not self.sampling_active and self.sampler.sample(): - self.sampling_active = True - self.agent.log('Tracing TensorFlow session.') - - if 'options' in kwargs: - kwargs['options'].trace_level = tf.RunOptions.FULL_TRACE - 
else: - kwargs['options'] = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE) - - if 'run_metadata' not in kwargs: - kwargs['run_metadata'] = tf.RunMetadata() - self.run_metadata = kwargs['run_metadata'] - - return args, kwargs, data - - def after_run(args, kwargs, ret, data): - session = args[0] - - if data and 'start_ts' in data: - run_duration = time.time() - data['start_ts'] - self.total_run_duration = self.total_run_duration + run_duration - if self.sampling_active: - self.sampled_run_duration = self.sampled_run_duration + run_duration - self.sampler.update(run_duration) - self.update_cpu_profile(session.graph, self.run_metadata.step_stats.dev_stats) - self.update_allocation_profile(session.graph, self.run_metadata.step_stats.dev_stats) - self.sampling_active = False - - if not patch(tf.Session, 'run', before_run, after_run): - self.agent.log('Error wrapping TensorFlow session.') - - except Exception: - self.agent.log('Error setting up TensorFlow session wrappers.') - self.agent.exception() - return - - self.ready = True - - - def destroy(self): - if not self.ready: - return - - - def reset(self): - self.cpu_profile = Breakdown('Operation definition call graph', Breakdown.TYPE_CALLGRAPH) - self.allocation_profile = Breakdown('Operation definition call graph', Breakdown.TYPE_CALLGRAPH) - self.total_run_duration = 0 - self.sampled_run_duration = 0 - - - def start_profiler(self): - self.agent.log('Activating TensorFlow profiler.') - - self.profiler_active = True - - - def stop_profiler(self): - self.profiler_active = False - - self.agent.log('Deactivating TensorFlow profiler.') - - - def build_profile(self, duration): - with self.profile_lock: - if self.sampled_run_duration > 0: - scale = self.total_run_duration / self.sampled_run_duration - - self.cpu_profile.scale(scale) - self.cpu_profile.normalize(duration) - self.cpu_profile.propagate() - - self.allocation_profile.scale(scale) - self.allocation_profile.normalize(duration) - 
self.allocation_profile.propagate() - self.allocation_profile.floor() - self.allocation_profile.filter(2, 1000, float("inf")) - - return [{ - 'category': Metric.CATEGORY_CPU_PROFILE, - 'name': Metric.NAME_TF_OPERATION_TIMES, - 'unit': Metric.UNIT_MILLISECOND, - 'unit_interval': 1, - 'profile': self.cpu_profile - }, - { - 'category': Metric.CATEGORY_MEMORY_PROFILE, - 'name': Metric.NAME_TF_OPERATION_ALLOCATION_RATE, - 'unit': Metric.UNIT_BYTE, - 'unit_interval': 1, - 'profile': self.allocation_profile - }] - - - def update_cpu_profile(self, graph, dev_stats): - with self.profile_lock: - op_index = dict() - for op in graph.get_operations(): - op_index[op.name] = op - - for device_index, device_stats in enumerate(dev_stats): - device_node = self.cpu_profile.find_or_add_child(device_stats.device) - - for node_stats in device_stats.node_stats: - if node_stats.node_name == '_SOURCE' or node_stats.node_name == '_SINK': - continue - - duration = (node_stats.op_end_rel_micros - node_stats.op_start_rel_micros) / 1000 - if duration == 0: - continue - - if node_stats.node_name in op_index: - op = op_index[node_stats.node_name] - tb = op.traceback - if tb: - current_node = device_node - for tb_frame in tb: - frame = Frame(tb_frame[2], tb_frame[0], tb_frame[1]) - current_node = current_node.find_or_add_child(str(frame)) - current_node.set_type(Breakdown.TYPE_CALLSITE) - - current_node = current_node.find_or_add_child(node_stats.node_name) - current_node.add_metadata('Type', op.type) - current_node.set_type(Breakdown.TYPE_OPERATION) - current_node.increment(duration, 1) - - - - def update_allocation_profile(self, graph, dev_stats): - with self.profile_lock: - op_index = dict() - for op in graph.get_operations(): - op_index[op.name] = op - - for device_index, device_stats in enumerate(dev_stats): - device_node = self.allocation_profile.find_or_add_child(device_stats.device) - - for node_stats in device_stats.node_stats: - if node_stats.node_name == '_SOURCE' or 
node_stats.node_name == '_SINK': - continue - - num_bytes = 0 - for index, output in enumerate(node_stats.output): - allocation = output.tensor_description.allocation_description - num_bytes = num_bytes + allocation.requested_bytes - if num_bytes == 0: - continue - - if node_stats.node_name in op_index: - op = op_index[node_stats.node_name] - tb = op.traceback - if tb: - current_node = device_node - for tb_frame in tb: - frame = Frame(tb_frame[2], tb_frame[0], tb_frame[1]) - current_node = current_node.find_or_add_child(str(frame)) - current_node.set_type(Breakdown.TYPE_CALLSITE) - - current_node = current_node.find_or_add_child(node_stats.node_name) - current_node.add_metadata('Type', op.type) - current_node.set_type(Breakdown.TYPE_OPERATION) - current_node.increment(num_bytes, 1) diff --git a/tests/agent_test.py b/tests/agent_test.py index e8777be..6f87141 100644 --- a/tests/agent_test.py +++ b/tests/agent_test.py @@ -182,46 +182,7 @@ def blocking_call(): self.assertTrue('blocking_call' in str(messages)) agent.destroy() - - - - - def test_tf_profile(self): - stackimpact._agent = None - agent = stackimpact.start( - dashboard_address = 'http://localhost:5001', - agent_key = 'key1', - app_name = 'TestPythonApp', - auto_profiling = False, - debug = True - ) - - if not agent.tf_reporter.profiler.ready: - return - - messages = [] - def add_mock(topic, message): - messages.append(message) - agent.message_queue.add = add_mock - - agent.start_tf_profiler() - - import tensorflow as tf - - x = tf.random_normal([1000, 1000]) - y = tf.random_normal([1000, 1000]) - res = tf.matmul(x, y) - - with tf.Session() as sess: - sess.run(res) - - agent.stop_tf_profiler() - - self.assertTrue('test_tf_profile' in str(messages)) - - agent.destroy() - - + if __name__ == '__main__': unittest.main() diff --git a/tests/profilers/tf_profiler_test.py b/tests/profilers/tf_profiler_test.py deleted file mode 100644 index abbafef..0000000 --- a/tests/profilers/tf_profiler_test.py +++ /dev/null @@ 
-1,67 +0,0 @@ - -import time -import unittest -import random -import threading -import sys -import traceback - -import stackimpact -from stackimpact.runtime import runtime_info - - -# virtualenv -p python3 venv_tf -# source venv_tf/bin/activate -# pip install tensorflow -# pip install keras - -class TFProfilerTestCase(unittest.TestCase): - - def test_record_tf_profile(self): - stackimpact._agent = None - agent = stackimpact.start( - dashboard_address = 'http://localhost:5001', - agent_key = 'key1', - app_name = 'TestPythonApp', - auto_profiling = False, - debug = True - ) - - agent.tf_reporter.profiler.reset() - - if not agent.tf_reporter.profiler.ready: - return - - def record(): - agent.tf_reporter.profiler.start_profiler() - time.sleep(1) - agent.tf_reporter.profiler.stop_profiler() - - - record_t = threading.Thread(target=record) - record_t.start() - - import tensorflow as tf - - x = tf.random_normal([1000, 1000]) - y = tf.random_normal([1000, 1000]) - res = tf.matmul(x, y) - - with tf.Session() as sess: - sess.run(res) - - record_t.join() - - profile = agent.tf_reporter.profiler.build_profile(1)[0]['profile'].to_dict() - #print(profile) - self.assertTrue('test_record_tf_profile' in str(profile)) - - profile = agent.tf_reporter.profiler.build_profile(1)[1]['profile'].to_dict() - #print(profile) - self.assertTrue('test_record_tf_profile' in str(profile)) - - agent.destroy() - - -if __name__ == '__main__': - unittest.main() From e8376b5b4c4322dc6c509a97c5d5aeafc6690bde Mon Sep 17 00:00:00 2001 From: Dmitri Melikyan Date: Tue, 14 May 2019 14:45:03 +0000 Subject: [PATCH 25/26] cleanup files --- .gitignore | 1 + publish.sh | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 4b2835f..49269b2 100644 --- a/.gitignore +++ b/.gitignore @@ -14,3 +14,4 @@ dist stackimpact.egg-info __pycache__ venv* +env/ diff --git a/publish.sh b/publish.sh index 0083c0f..e502767 100755 --- a/publish.sh +++ b/publish.sh @@ -13,4 +13,3 @@ for 
bundle in dist/*.tar.gz; do echo "Publishing $bundle..." twine upload $bundle done - From 4d0a415b790c89e7bee1d70216f948b7fec11540 Mon Sep 17 00:00:00 2001 From: Dmitri Melikyan Date: Tue, 14 May 2019 14:48:05 +0000 Subject: [PATCH 26/26] add python 3.7 to setup.py --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index 9989e3f..e5a2368 100644 --- a/setup.py +++ b/setup.py @@ -34,6 +34,7 @@ def read(fname): 'Programming Language :: Python :: 3.4', 'Programming Language :: Python :: 3.5', 'Programming Language :: Python :: 3.6', + 'Programming Language :: Python :: 3.7', 'Programming Language :: Python :: Implementation :: CPython', 'Topic :: Software Development', 'Topic :: System :: Monitoring',