Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix(qa): minor qa updates #243

Merged
merged 7 commits into from
Jun 20, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions qa/terraform/nodes.tf
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ variable "ssh_keys" {

variable "instance_tags" {
type = list(string)
default = ["malachite"]
default = ["Malachite"]
}

resource "digitalocean_droplet" "cc" {
Expand All @@ -17,7 +17,10 @@ resource "digitalocean_droplet" "cc" {
# Build takes about 2.5 minutes on an 8-core Digital Ocean server
#size = "s-8vcpu-16gb"
ssh_keys = var.ssh_keys
user_data = file("user-data/cc-data.txt")
user_data = templatefile("user-data/cc-data.txt", {
malachite_dashboard = filebase64("../viewer/config-grafana/provisioning/dashboards-data/main.json")
node_dashboard = filebase64("../viewer/config-grafana/provisioning/dashboards-data/node-exporter-full.json")
})
}

resource "digitalocean_droplet" "small" {
Expand Down
85 changes: 42 additions & 43 deletions qa/terraform/templates/commands.tmpl
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,9 @@
# D_REGION - the Digital Ocean region where the servers are deployed
# PSSH_H - space-separated list of all the node server IP addresses for pssh input
# PSSH_P - the number of parallel processes to run with pssh
# SSH_OPTS - options for ssh run locally (forward agent, disable known_hosts)
# MALACHITE_DIR - the path to the malachite repository directory
# MALACHITE_DIR - the path to the malachite repository directory
# IS_CC - 1 means we are on the CC server, 0 we are not. (Used to determine the docker -H parameter.)
# IS_CC - 1 means we are on the CC server, 0 we are not. (Used to determine the docker -H parameter when run locally.)
##
# Aliases for easy manual access to the servers (don't use these in scripts)
# ssh-cc - ssh into the cc server
Expand All @@ -17,16 +17,17 @@
# xssh - parallel ssh command to all servers. Change PSSH_H and PSSH_P for different behavior.
# get_ip - get the IP address of a node server for programmatic use (example: get_ip 0)
# ok_cc - check if the CC server is ready to be used and update its services (DNS hosts, commands.sh, etc)
# ok_all - check if all servers are ready to be used (scanning the SSH keys could take a while)
# ok_all - check if all servers are ready to be used
# deploy_cc - build the local source code into a docker image on the cc server and push it to the cc registry
# setup_config - create configuration on the cc server
# dnode-pull - pull the node image on all the node servers. Accepts list of IDs or "all". (example: dnode-pull 0 1 2)
# dnode-run - run the application on a node server. Accepts list of IDs or "all". (example: dnode-run 0 1 2)
# dnode-log - get the logs of the application from a node server (example: dnode-log 0 -f)
# dnode-stop - stop the application on a node server. Accepts list of IDs or "all". (example: dnode-stop 0 2)

# get_prometheus_data - create a compressed prometheus data file (and download it from the cc server)
# dnode-rm - remove node container from server. Accepts list of IDs or "all". (example: dnode-rm 0 1 2)
# cheat_sheet - get some help on the order of commands to run
# fetch_log - fetch the logs from all the node servers (example: fetch_log 0 1 2)
# get_prometheus_data - create a compressed prometheus data file (and download it from the cc server)
##

export CANDC="${cc.ip}"
Expand All @@ -37,20 +38,22 @@ export D_N="${length(small)+length(large)}"
export D_REGION="${region}"
export PSSH_H="${join(" ",ips)}"
export PSSH_P="30"
# Arrays require advanced shell, SSH_OPTS is not POSIX compatible
export SSH_OPTS=(-A -o LogLevel=ERROR -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o GlobalKnownHostsFile=/dev/null)
export MALACHITE_DIR="$(dirname $(dirname ${path}))"
export IS_CC=0
export _CC_DOCKER_SHIM="-H ssh://root@$CANDC"
if [ $IS_CC -eq 1 ]; then
export _CC_DOCKER_SHIM=""
fi

alias ssh-cc="ssh -A root@${cc.ip}"
alias ssh-cc="ssh $SSH_OPTS root@${cc.ip}"
%{~ for i,n in concat(small, large) }
alias ssh-node${i}="ssh -A root@${n.ip}"
alias ssh-node${i}="ssh $SSH_OPTS root@${n.ip}"
%{~ endfor }

xssh() {
pssh -l root -i -v -p $PSSH_P -H "$PSSH_H" "$@"
pssh -l root -i -v -O LogLevel=ERROR -O StrictHostKeyChecking=no -O UserKnownHostsFile=/dev/null -O GlobalKnownHostsFile=/dev/null -p $PSSH_P -H "$PSSH_H" "$@"
}

get_ip() {
Expand All @@ -62,21 +65,16 @@ get_ip() {
}

ok_cc() {
_keyscan_cc 2> /dev/null
PSSH_P=1 PSSH_H=$CANDC xssh "cat /etc/done" && \
echo "Updating cc server..." && \
scp -q "$${1:-$${MALACHITE_DIR}/qa/terraform/hosts}" root@$${CANDC}:/etc/hosts && \
ssh root@$${CANDC} "systemctl restart dnsmasq" && \
scp -q "$${1:-$${MALACHITE_DIR}/qa/terraform/commands.sh}" root@$${CANDC}:/etc/profile.d/commands.sh && \
ssh root@$${CANDC} \
"sed -i 's,^export MALACHITE_DIR=.*,export MALACHITE_DIR=/root/malachite,' /etc/profile.d/commands.sh && \
sed -i 's,^export IS_CC=.*,export IS_CC=1,' /etc/profile.d/commands.sh && \
source /etc/profile.d/commands.sh && \
_keyscan_all_servers 2> /dev/null"
_keyscan_cc 2> /dev/null # needed for deploy_cc
PSSH_P=1 PSSH_H=$CANDC xssh "cat /etc/done"
sftp -C -q root@$${CANDC} <<EOF
put $${1:-$${MALACHITE_DIR}/qa/terraform/hosts} /etc/hosts
put $${1:-$${MALACHITE_DIR}/qa/terraform/commands.sh} /etc/profile.d/commands.sh
EOF
ssh $SSH_OPTS root@$${CANDC} "sed -i -e 's,^export MALACHITE_DIR=.*,export MALACHITE_DIR=/root/malachite,' -e 's,^export IS_CC=.*,export IS_CC=1,' /etc/profile.d/commands.sh && systemctl reload dnsmasq"
}

ok_all() {
_keyscan_all_servers 2> /dev/null
xssh "cat /etc/done && mount /data" # Mount /data in case a QA node came online earlier than CC
}

Expand All @@ -90,7 +88,7 @@ setup_config() {
if _is_cc; then
_change_config all
else
ssh root@$CANDC "source /etc/profile.d/commands.sh && _change_config all"
ssh $SSH_OPTS root@$CANDC "source /etc/profile.d/commands.sh && _change_config all"
fi
}

Expand All @@ -113,7 +111,7 @@ dnode-log() {
F="-f"
fi
fi
docker -H ssh://root@$IP logs $F node
ssh $SSH_OPTS root@$IP docker logs $F node
}

dnode-stop() {
Expand All @@ -126,17 +124,18 @@ dnode-rm() {

cheat_sheet() {
cat <<EOF
Commands and their dependencies:
(terminal1) | (terminal2 run in parallel)
ok_cc
(ssh-cc)
deploy_cc | ok_all
setup_config
dnode-run all
(wait for data)
dnode-stop all
fetch_log | get_prometheus_data
dnode-rm all
ok_cc
deploy_cc
ssh-cc
ok_all
setup_config
(_chance_one_config_entry)
dnode-run all
(wait for data)
dnode-stop all
fetch_log
get_prometheus_data
dnode-rm all
EOF
}

Expand All @@ -146,22 +145,24 @@ fetch_log() {

get_prometheus_data() {
if _is_cc; then
systemctl stop prometheus && rm prometheus.tgz 2> /dev/null && tar -cvzf prometheus.tgz -C /var/lib/prometheus/metrics2 . ; systemctl start prometheus
rm -f prometheus.tgz
systemctl stop prometheus && tar -cvzf prometheus.tgz -C /var/lib/prometheus/metrics2 .
systemctl start prometheus
else
ssh-cc "systemctl stop prometheus && rm prometheus.tgz 2> /dev/null && tar -cvzf prometheus.tgz -C /var/lib/prometheus/metrics2 . ; systemctl start prometheus"
ssh-cc "rm -f prometheus.tgz; systemctl stop prometheus && tar -cvzf prometheus.tgz -C /var/lib/prometheus/metrics2 . ; systemctl start prometheus"
scp -r root@$CANDC:prometheus.tgz .
fi
}

mem_usage() {
_mem_usage() {
PSSH_H="$(_parse_multiple_hosts "$@")" xssh -o mem_usage_out -e mem_usage_err "ps -e -o pid,user,%mem,cmd --sort=-%mem | head -2 | tail -1"
}

cpu_usage() {
_cpu_usage() {
PSSH_H="$(_parse_multiple_hosts "$@")" xssh -o cpu_usage_out -e cpu_usage_err "ps -e -o pid,user,%cpu,cmd --sort=-%cpu | head -2 | tail -1"
}

reset_prometheus_db() {
_reset_prometheus_db() {
if _is_cc; then
systemctl stop prometheus
rm -rf /var/lib/prometheus/metrics2/*
Expand All @@ -172,15 +173,15 @@ reset_prometheus_db() {
}

_is_cc() {
return $IS_CC
return $((1 - IS_CC))
}

_keyscan_cc() {
ssh-keygen -R $CANDC > /dev/null
ssh-keyscan -t ed25519 $CANDC >> $HOME/.ssh/known_hosts
}

_keyscan_all_servers() {
_keyscan_servers() {
_keyscan_cc 2> /dev/null
%{~ for n in concat(small, large) }
ssh-keygen -R ${n.ip} > /dev/null
Expand Down Expand Up @@ -231,9 +232,7 @@ _change_config() {
"moniker=test-$i" \
"consensus.p2p.listen_addr=/ip4/0.0.0.0/udp/27000/quic-v1" \
"mempool.p2p.listen_addr=/ip4/0.0.0.0/udp/28000/quic-v1" \
"metrics.listen_addr=0.0.0.0:9000" \
"test.time_allowance_factor=0.5" \
"test.exec_time_per_tx=500us" && \
"metrics.listen_addr=0.0.0.0:9000" && \
sconfig "$file" -t stringSlice \
"consensus.p2p.persistent_peers=$(_compose_persistent_peers)" \
"mempool.p2p.persistent_peers=$(_compose_persistent_peers 28000)" &
Expand Down
1 change: 1 addition & 0 deletions qa/terraform/templates/hosts.tmpl
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
127.0.0.1 localhost
${cc.ip} g-${cc.name}
%{~ for n in small }
${n.ip} g-${n.name}
Expand Down
Loading