From 1156f9c9be590f5851150ea06482781b103b521d Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Tue, 26 Mar 2024 22:07:14 +0100 Subject: [PATCH 1/3] process-monitor: Better identify process names. Currently it's hard to say which ovsdb process is which; we can only guess based on the process pid. Look deeper into the process command line arguments and determine the name based on the name of a pid file or a name of the Python script. For example: Before: ovsdb-server-ovn-central-az0-1-138 ovsdb-server-ovn-central-az0-1-153 ovsdb-server-ovn-central-az0-1-64 ovsdb-server-ovn-central-az0-1-80 ovn-northd-ovn-central-az0-1-91 ovn-ic-ovn-central-az0-1-184 python3-ovn-tester-50 python3-ovn-tester-51 After: ovn_ic_nb_db|ovn-central-az0-1|138 ovn_ic_sb_db|ovn-central-az0-1|153 ovnnb_db|ovn-central-az0-1|64 ovnsb_db|ovn-central-az0-1|80 ovn-northd|ovn-central-az0-1|91 ovn-ic|ovn-central-az0-1|184 process-monitor|ovn-tester|50 ovn_tester|ovn-tester|51 Using '|' as a delimiter for easier parsing in future commits. Signed-off-by: Ilya Maximets --- ovn-fake-multinode-utils/process-monitor.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/ovn-fake-multinode-utils/process-monitor.py b/ovn-fake-multinode-utils/process-monitor.py index 5cbb49de..54ad8725 100644 --- a/ovn-fake-multinode-utils/process-monitor.py +++ b/ovn-fake-multinode-utils/process-monitor.py @@ -36,7 +36,14 @@ def monitor(suffix: str, out_file: str, exit_file: str) -> None: tme = time.time() for p in processes: try: - name = p.name() + "-" + suffix + "-" + str(p.pid) + name = p.name() + for arg in p.cmdline(): + if arg.endswith('.pid') or arg.endswith('.py'): + name = arg.split('/')[-1].split('.')[0] + break + + name = name + "|" + suffix + "|" + str(p.pid) + # cpu_percent(seconds) call will block # for the amount of seconds specified. 
cpu = p.cpu_percent(0.5) From 6c979d0719ceb24a3ec56d073a9a1c507c135cd6 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Tue, 26 Mar 2024 22:29:02 +0100 Subject: [PATCH 2/3] process-monitor: Exclude health monitors. These processes do not normally contribute much and only make graphs more crowded and harder to navigate. Signed-off-by: Ilya Maximets --- ovn-fake-multinode-utils/process-monitor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ovn-fake-multinode-utils/process-monitor.py b/ovn-fake-multinode-utils/process-monitor.py index 54ad8725..5afd0bf8 100644 --- a/ovn-fake-multinode-utils/process-monitor.py +++ b/ovn-fake-multinode-utils/process-monitor.py @@ -22,7 +22,7 @@ def monitor(suffix: str, out_file: str, exit_file: str) -> None: for p in psutil.process_iter(): if any(name in p.name() for name in process_names): processes.add(p) - elif any( + elif p.name() != 'monitor' and any( name in part for part in p.cmdline() for name in process_names From 7f0955206323b79fa7d7c18620d93a2c4cf4c1e8 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Wed, 27 Mar 2024 02:34:40 +0100 Subject: [PATCH 3/3] process-stats: Add support for graphs with aggregate resource usage. We take all the stats and generate sum/mean/max/min graphs per process type as well as an extra graph for all OVN processes together and another one for all OVS processes together. The resulting HTML also contains the peak of the sum of RSS and the peak of the sum of CPU usage for all OVN processes together. 
Signed-off-by: Ilya Maximets --- do.sh | 10 +++- utils/process-stats.py | 116 ++++++++++++++++++++++++++++++++++++----- 2 files changed, 112 insertions(+), 14 deletions(-) diff --git a/do.sh b/do.sh index ef2845ab..9d94e03a 100755 --- a/do.sh +++ b/do.sh @@ -406,13 +406,19 @@ function mine_data() { resource_usage_logs=$(find ${out_dir}/logs -name process-stats.json \ | grep -E 'ovn-tester|ovn-central-az[0-2]-') python3 ${topdir}/utils/process-stats.py \ - resource-usage-report-central.html ${resource_usage_logs} + -o resource-usage-report-central.html ${resource_usage_logs} # Collecting stats only for 3 workers to avoid bloating the report. resource_usage_logs=$(find ${out_dir}/logs -name process-stats.json \ | grep ovn-scale | head -3) python3 ${topdir}/utils/process-stats.py \ - resource-usage-report-worker.html ${resource_usage_logs} + -o resource-usage-report-worker.html ${resource_usage_logs} + + # Preparing reports for aggregate resource usage. + resource_usage_logs=$(find ${out_dir}/logs -name process-stats.json) + python3 ${topdir}/utils/process-stats.py --aggregate \ + -o resource-usage-report-aggregate.html ${resource_usage_logs} + deactivate popd diff --git a/utils/process-stats.py b/utils/process-stats.py index 40dbeb0e..5ffa4e51 100644 --- a/utils/process-stats.py +++ b/utils/process-stats.py @@ -1,45 +1,118 @@ +import argparse import json +import logging import os import pandas as pd import plotly.express as px import sys -from datetime import datetime from typing import Dict, List +FORMAT = '%(asctime)s |%(levelname)s| %(message)s' +logging.basicConfig(stream=sys.stdout, level=logging.INFO, format=FORMAT) +log = logging.getLogger(__name__) + + def read_file(filename: str) -> Dict: with open(filename, "r") as file: return json.load(file) -def resource_stats_generate(filename: str, data: Dict) -> None: +def aggregated(df: pd.DataFrame) -> (pd.DataFrame, int): + column_names = list(df.columns) + value_name = column_names[2] + + log.info(f'Pivot and 
interpolate {value_name} ...') + df = df.pivot_table( + index='Time', columns='Process', values=value_name, aggfunc='mean' + ).interpolate(method='time', limit_direction='both') + + result = pd.DataFrame(index=df.index) + processes = {p.split('|')[0] for p in df.columns} + + log.info(f'Aggregating {value_name} ...') + for p in processes: + df_filtered = df.filter(regex='^' + p) + result[p + '|sum'] = df_filtered.sum(axis=1) + result[p + '|mean'] = df_filtered.mean(axis=1) + result[p + '|max'] = df_filtered.max(axis=1) + result[p + '|min'] = df_filtered.min(axis=1) + + result['ovn|sum'] = df.filter(regex=r'^ovn.*\|ovn-(central|scale).*').sum( + axis=1 + ) + ovn_max = result['ovn|sum'].astype('int').max() + + result['ovs|sum'] = df.filter(regex=r'^ovs.*\|ovn-(central|scale).*').sum( + axis=1 + ) + + result = result.astype('int').reset_index().melt(id_vars=['Time']) + result.columns = column_names + result = result.sort_values(['Process', 'Time']) + + return result, ovn_max + + +def resource_stats_generate( + filename: str, data: Dict, aggregate: bool +) -> None: rss: List[List] = [] cpu: List[List] = [] + log.info('Preprocessing ...') for ts, time_slice in sorted(data.items()): + tme = pd.Timestamp.fromtimestamp(float(ts)).round('1s') for name, res in time_slice.items(): - tme = datetime.fromtimestamp(float(ts)) rss_mb = int(res['rss']) >> 20 rss.append([tme, name, rss_mb]) cpu.append([tme, name, float(res['cpu'])]) + log.info('Creating DataFrame ...') df_rss = pd.DataFrame(rss, columns=['Time', 'Process', 'RSS (MB)']) df_cpu = pd.DataFrame(cpu, columns=['Time', 'Process', 'CPU (%)']) + if aggregate: + df_rss, max_sum_rss = aggregated(df_rss) + df_cpu, max_sum_cpu = aggregated(df_cpu) + + log.info('Creating charts ...') rss_chart = px.line( df_rss, x='Time', y='RSS (MB)', color='Process', - title='Resident Set Size', + title=('Aggregate ' if aggregate else '') + 'Resident Set Size', ) cpu_chart = px.line( - df_cpu, x='Time', y='CPU (%)', color='Process', title='CPU 
usage' + df_cpu, + x='Time', + y='CPU (%)', + color='Process', + title=('Aggregate ' if aggregate else '') + 'CPU usage', ) + log.info(f'Writing HTML to {filename} ...') with open(filename, 'w') as report_file: report_file.write('') + if aggregate: + report_file.write( + f''' + + + + + + + + + + + +
Max(Sum(OVN RSS)) {max_sum_rss} MB
Max(Sum(OVN CPU)) {max_sum_cpu} %
+ ''' + ) report_file.write( rss_chart.to_html( full_html=False, @@ -60,17 +133,36 @@ def resource_stats_generate(filename: str, data: Dict) -> None: if __name__ == '__main__': - if len(sys.argv) < 3: - print(f'Usage: {sys.argv[0]} output-file input-file [input-file ...]') - sys.exit(1) + parser = argparse.ArgumentParser( + description='Generate resource usage charts.' + ) + parser.add_argument( + '--aggregate', action='store_true', help='generate aggregate charts' + ) + parser.add_argument( + '-o', '--output', required=True, help='file to write an HTML result' + ) + parser.add_argument( + 'input_files', + metavar='input-file', + type=str, + nargs='+', + help='JSON file with recorded process statistics', + ) - if os.path.isfile(sys.argv[1]): - print(f'Output file {sys.argv[1]} already exists') + args = parser.parse_args() + + if os.path.isfile(args.output): + log.fatal(f'Output file {args.output} already exists') sys.exit(2) + log.info(f'Processing stats from {len(args.input_files)} files.') + + log.info('Reading ...') data: Dict = {} - for f in sys.argv[2:]: + for f in args.input_files: d = read_file(f) data.update(d) - resource_stats_generate(sys.argv[1], data) + resource_stats_generate(args.output, data, args.aggregate) + log.info('Done.')