From f6d8efe931175640f013639b83f7f543fd495db5 Mon Sep 17 00:00:00 2001
From: Chris O'Neil
Date: Mon, 23 Dec 2024 16:25:48 +0000
Subject: [PATCH 1/2] chore: use `antctl` to start/stop nodes

Replaces the old references to `safenode-manager`.
---
 resources/ansible/start_nodes.yml | 2 +-
 resources/ansible/stop_nodes.yml  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/resources/ansible/start_nodes.yml b/resources/ansible/start_nodes.yml
index c1f50783..ebdafe1a 100644
--- a/resources/ansible/start_nodes.yml
+++ b/resources/ansible/start_nodes.yml
@@ -6,4 +6,4 @@
     interval: "{{ interval }}"
   tasks:
     - name: start
-      ansible.builtin.command: "safenode-manager start --interval {{ interval }}"
+      ansible.builtin.command: "antctl start --interval {{ interval }}"
diff --git a/resources/ansible/stop_nodes.yml b/resources/ansible/stop_nodes.yml
index 2c60500d..8c3ec10e 100644
--- a/resources/ansible/stop_nodes.yml
+++ b/resources/ansible/stop_nodes.yml
@@ -10,6 +10,6 @@
         {% if delay is defined %}
         sleep {{ delay | default(0) }}
         {% endif %}
-        safenode-manager stop --interval {{ interval }}
+        antctl stop --interval {{ interval }}
       args:
         executable: /bin/bash
\ No newline at end of file

From 602019fafd8ea180918d2532cbbd17f743ad2f3a Mon Sep 17 00:00:00 2001
From: Chris O'Neil
Date: Mon, 23 Dec 2024 23:33:31 +0000
Subject: [PATCH 2/2] feat: provide `reset-to-n-nodes` command

This command runs a playbook that uses a Bash script to clear out the
existing nodes, then create the specified number of new nodes.

It is intended to be used in the production environment to reset the
nodes for a particular environment, avoiding the need to bootstrap a
new one.

In production, intervals will be applied to the `stop` and `start`
commands so that the existing node services are stopped gradually and
the new services come online slowly, probably with an interval of
about five minutes between each.
---
 resources/ansible/reset_to_n_nodes.yml        |   6 +
 .../roles/reset-to-n-nodes/tasks/main.yml     |  15 ++
 .../templates/reset_to_n_nodes.sh.j2          |  74 ++++++++++
 src/ansible/mod.rs                            |   5 +
 src/main.rs                                   | 128 +++++++++++++++++-
 5 files changed, 227 insertions(+), 1 deletion(-)
 create mode 100644 resources/ansible/reset_to_n_nodes.yml
 create mode 100644 resources/ansible/roles/reset-to-n-nodes/tasks/main.yml
 create mode 100644 resources/ansible/roles/reset-to-n-nodes/templates/reset_to_n_nodes.sh.j2

diff --git a/resources/ansible/reset_to_n_nodes.yml b/resources/ansible/reset_to_n_nodes.yml
new file mode 100644
index 00000000..ca955967
--- /dev/null
+++ b/resources/ansible/reset_to_n_nodes.yml
@@ -0,0 +1,6 @@
+---
+- name: reset to n nodes
+  hosts: all
+  become: True
+  roles:
+    - reset-to-n-nodes
diff --git a/resources/ansible/roles/reset-to-n-nodes/tasks/main.yml b/resources/ansible/roles/reset-to-n-nodes/tasks/main.yml
new file mode 100644
index 00000000..66dcc504
--- /dev/null
+++ b/resources/ansible/roles/reset-to-n-nodes/tasks/main.yml
@@ -0,0 +1,15 @@
+# An assumption is being made that an environment for running nodes was already set up.
+# Nodes will be stopped slowly, then everything will be cleared using the `reset` command.
+---
+- name: copy script
+  template:
+    src: reset_to_n_nodes.sh.j2
+    dest: /usr/local/bin/reset_to_n_nodes.sh
+    mode: '0755'
+    owner: root
+    group: root
+
+- name: run script
+  ansible.builtin.shell: /usr/local/bin/reset_to_n_nodes.sh
+  args:
+    executable: /bin/bash
\ No newline at end of file
diff --git a/resources/ansible/roles/reset-to-n-nodes/templates/reset_to_n_nodes.sh.j2 b/resources/ansible/roles/reset-to-n-nodes/templates/reset_to_n_nodes.sh.j2
new file mode 100644
index 00000000..cfd66dbc
--- /dev/null
+++ b/resources/ansible/roles/reset-to-n-nodes/templates/reset_to_n_nodes.sh.j2
@@ -0,0 +1,74 @@
+#!/bin/bash
+# An assumption is being made that an environment for running nodes was already set up.
+# Nodes will be stopped slowly, then everything will be cleared using the `reset` command.
+# The node services will then be created again, using the settings from the previous node
+# registry. After that, they will be started, with an interval between each.
+# In the production environment, it's advisable for the interval to be quite large,
+# e.g., 5 minutes.
+
+set -euo pipefail
+
+readonly ANTCTL="/usr/local/bin/antctl"
+readonly JQ="/usr/bin/jq"
+readonly NODE_REGISTRY="/var/antctl/node_registry.json"
+
+node_count={{ node_count }}
+
+if [ "{{ evm_network_type }}" = "evm-custom" ]; then
+  rpc_url=$(cat ${NODE_REGISTRY} | ${JQ} -r '.nodes[0].evm_network.Custom.rpc_url_http')
+  payment_token_address=$(cat ${NODE_REGISTRY} | ${JQ} -r '.nodes[0].evm_network.Custom.payment_token_address')
+  data_payments_address=$(cat ${NODE_REGISTRY} | ${JQ} -r '.nodes[0].evm_network.Custom.data_payments_address')
+fi
+
+network_contacts_url=$(cat ${NODE_REGISTRY} | ${JQ} -r '.nodes[0].peers_args.network_contacts_url[0]')
+peer_multiaddr=$(cat ${NODE_REGISTRY} | ${JQ} -r '.nodes[0].peers_args.addrs[0]')
+rewards_address=$(cat ${NODE_REGISTRY} | ${JQ} -r '.nodes[0].rewards_address')
+network_id=$(cat ${NODE_REGISTRY} | ${JQ} -r '.nodes[0].network_id')
+max_archived_log_files=$(cat ${NODE_REGISTRY} | ${JQ} -r '.nodes[0].max_archived_log_files')
+max_log_files=$(cat ${NODE_REGISTRY} | ${JQ} -r '.nodes[0].max_log_files')
+
+# The delay is useful when there is only one node running.
+{% if delay is defined %}
+sleep {{ delay | default(0) }}
+{% endif %}
+${ANTCTL} stop --interval {{ stop_interval }}
+
+${ANTCTL} reset --force
+
+base_rpc_port=13000
+base_metrics_port=14000
+
+for ((i=0; i<node_count; i++)); do
[... the remainder of the template, which recreates the node services from the
settings read above and then starts them, was lost in extraction ...]
diff --git a/src/ansible/mod.rs b/src/ansible/mod.rs
@@ ... @@
             AnsiblePlaybook::Nodes => "nodes.yml".to_string(),
             AnsiblePlaybook::PeerCacheNodes => "peer_cache_node.yml".to_string(),
             AnsiblePlaybook::RpcClient => "safenode_rpc_client.yml".to_string(),
+            AnsiblePlaybook::ResetToNNodes => "reset_to_n_nodes.yml".to_string(),
             AnsiblePlaybook::StartFaucet => "start_faucet.yml".to_string(),
             AnsiblePlaybook::StartNodes => "start_nodes.yml".to_string(),
             AnsiblePlaybook::StartTelegraf => "start_telegraf.yml".to_string(),
diff --git a/src/main.rs b/src/main.rs
index 5ce77ba2..f2ceeabe 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -1047,6 +1047,51 @@ enum Commands {
         #[clap(long, value_parser = parse_provider, verbatim_doc_comment, default_value_t = CloudProvider::DigitalOcean)]
         provider: CloudProvider,
     },
+    /// Reset nodes to a specified count.
+    ///
+    /// This will stop all nodes, clear their data, and start the specified number of nodes.
+    #[clap(name = "reset-to-n-nodes")]
+    ResetToNNodes {
+        /// Provide a list of VM names to use as a custom inventory.
+        ///
+        /// This will reset nodes on a particular subset of VMs.
+        #[clap(name = "custom-inventory", long, use_value_delimiter = true)]
+        custom_inventory: Option<Vec<String>>,
+        /// The EVM network to use.
+        ///
+        /// Valid values are "arbitrum-one", "arbitrum-sepolia", or "custom".
+        #[clap(long, value_parser = parse_evm_network)]
+        evm_network_type: EvmNetwork,
+        /// Maximum number of forks Ansible will use to execute tasks on target hosts.
+        #[clap(long, default_value_t = 50)]
+        forks: usize,
+        /// The name of the environment.
+        #[arg(short = 'n', long)]
+        name: String,
+        /// The number of nodes to run after the reset.
+        #[arg(long)]
+        node_count: u16,
+        /// Specify the type of node VM to reset the nodes on. If not provided, the nodes on
+        /// all the node VMs will be reset. This is mutually exclusive with the
+        /// '--custom-inventory' argument.
+        ///
+        /// Valid values are "peer-cache", "genesis", "generic" and "private".
+        #[arg(long, conflicts_with = "custom-inventory")]
+        node_type: Option<NodeType>,
+        /// The cloud provider for the environment.
+        #[clap(long, value_parser = parse_provider, verbatim_doc_comment, default_value_t = CloudProvider::DigitalOcean)]
+        provider: CloudProvider,
+        /// The interval between starting each node, in milliseconds.
+        #[clap(long, value_parser = |t: &str| -> Result<Duration> { Ok(t.parse().map(Duration::from_millis)?) }, default_value = "2000")]
+        start_interval: Duration,
+        /// The interval between stopping each node, in milliseconds.
+        #[clap(long, value_parser = |t: &str| -> Result<Duration> { Ok(t.parse().map(Duration::from_millis)?) }, default_value = "2000")]
+        stop_interval: Duration,
+        /// Supply a version number for the antnode binary.
+        ///
+        /// If not provided, the latest version will be used.
+        #[arg(long)]
+        version: Option<String>,
+    },
 }
 
 #[derive(Subcommand, Debug)]
@@ -1914,7 +1959,7 @@ async fn main() -> Result<()> {
                     eyre!("Genesis node not found. Most likely this is a bootstrap deployment."))?,
                 &inventory.genesis_multiaddr.clone().ok_or_else(|| eyre!("Genesis node not found. Most likely this is a bootstrap deployment."))?,
-            )?),
+            )?)
             )?;
         }
@@ -3020,6 +3065,87 @@
             Ok(())
         }
+        Commands::ResetToNNodes {
+            custom_inventory,
+            evm_network_type,
+            forks,
+            name,
+            node_count,
+            node_type,
+            provider,
+            start_interval,
+            stop_interval,
+            version,
+        } => {
+            // We will use 50 forks for the initial run to retrieve the inventory, then
+            // recreate the deployer using the custom fork value.
+            let testnet_deployer = TestnetDeployBuilder::default()
+                .ansible_forks(50)
+                .environment_name(&name)
+                .provider(provider)
+                .build()?;
+            let inventory_service = DeploymentInventoryService::from(&testnet_deployer);
+            let inventory = inventory_service
+                .generate_or_retrieve_inventory(&name, true, None)
+                .await?;
+            if inventory.is_empty() {
+                return Err(eyre!("The {name} environment does not exist"));
+            }
+
+            let testnet_deployer = TestnetDeployBuilder::default()
+                .ansible_forks(forks)
+                .environment_name(&name)
+                .provider(provider)
+                .build()?;
+            testnet_deployer.init().await?;
+
+            let antnode_version = get_version_from_option(version, &ReleaseType::AntNode).await?;
+            let mut extra_vars = ExtraVarsDocBuilder::default();
+            extra_vars.add_variable("environment_name", &name);
+            extra_vars.add_variable("evm_network_type", &evm_network_type.to_string());
+            extra_vars.add_variable("node_count", &node_count.to_string());
+            extra_vars.add_variable("start_interval", &start_interval.as_millis().to_string());
+            extra_vars.add_variable("stop_interval", &stop_interval.as_millis().to_string());
+            extra_vars.add_variable("version", &antnode_version.to_string());
+
+            let ansible_runner = &testnet_deployer.ansible_provisioner.ansible_runner;
+
+            if let Some(custom_inventory) = custom_inventory {
+                println!("Running the playbook with a custom inventory");
+                let custom_vms = get_custom_inventory(&inventory, &custom_inventory)?;
+                generate_custom_environment_inventory(
+                    &custom_vms,
+                    &name,
+                    &ansible_runner.working_directory_path.join("inventory"),
+                )?;
+                ansible_runner.run_playbook(
+                    AnsiblePlaybook::ResetToNNodes,
+                    AnsibleInventoryType::Custom,
+                    Some(extra_vars.build()),
+                )?;
+                return Ok(());
+            }
+
+            if let Some(node_type) = node_type {
+                println!("Running the playbook for {node_type:?} nodes");
+                ansible_runner.run_playbook(
+                    AnsiblePlaybook::ResetToNNodes,
+                    node_type.to_ansible_inventory_type(),
+                    Some(extra_vars.build()),
+                )?;
+                return Ok(());
+            }
+
+            println!("Running the playbook for all node types");
+            for node_inv_type in AnsibleInventoryType::iter_node_type() {
+                ansible_runner.run_playbook(
+                    AnsiblePlaybook::ResetToNNodes,
+                    node_inv_type,
+                    Some(extra_vars.build()),
+                )?;
+            }
+            Ok(())
+        }
     }
 }
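
---

For reviewers who want to try the new subcommand, a hypothetical invocation is
sketched below. It assumes the tool is run via `cargo run --` like the other
commands in this repository; the environment name and node count are
placeholders, and the flag names and defaults come from the clap definitions
in this patch:

    # Reset the (placeholder) "staging-01" environment to 25 nodes per VM.
    # The intervals are in milliseconds; 300000 ms = 5 minutes, the spacing
    # the commit message suggests for production. If --stop-interval or
    # --start-interval is omitted, the default of 2000 ms applies, and
    # omitting --version uses the latest antnode release.
    cargo run -- reset-to-n-nodes \
      --name staging-01 \
      --node-count 25 \
      --evm-network-type arbitrum-one \
      --stop-interval 300000 \
      --start-interval 300000

Without `--node-type` or `--custom-inventory`, this runs the playbook against
every node inventory type in turn, matching the fall-through branch at the end
of the handler.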