From 26946f0d43ac61beb078c69d31f96a6d93164965 Mon Sep 17 00:00:00 2001 From: Joey Orlando Date: Mon, 2 Dec 2024 14:40:30 -0500 Subject: [PATCH] fix: improve Slack rate limiting logic when updating alert groups (#5287) # What this PR does https://www.loom.com/share/1ac33822301444748133ffe72638ddc4 The two asks in the [original GH issue](https://github.com/grafana/oncall-private/issues/2947) were: > 1. Make the error message clearer. We can identify if it's delivering or updating and being rate-limited. This is possible because Slack sets limits per API method. Also, this limit is a per-slack channel while we are posting messages & applying ratelimit per on-call integration, which confuses customers. > 2. Debounce update alert group message in Slack Both of these have been addressed in this PR ## Which issue(s) this PR closes Closes https://github.com/grafana/oncall-private/issues/2947 ## Checklist - [x] Unit, integration, and e2e (if applicable) tests updated - [x] Documentation added (or `pr:no public docs` PR label added if not required) - [x] Added the relevant release notes label (see labels prefixed w/ `release:`). These labels dictate how your PR will show up in the autogenerated release notes. --- .../alerts/models/alert_receive_channel.py | 8 +- .../apps/slack/alert_group_slack_service.py | 39 +- engine/apps/slack/constants.py | 1 - .../0007_migrate_slackmessage_channel_id.py | 0 engine/apps/slack/models/slack_message.py | 92 ++++- .../apps/slack/scenarios/distribute_alerts.py | 79 ++-- .../apps/slack/scenarios/resolution_note.py | 7 +- .../scenarios/slack_channel_integration.py | 5 +- engine/apps/slack/tasks.py | 161 ++++++-- .../scenario_steps/test_distribute_alerts.py | 49 ++- .../scenario_steps/test_resolution_note.py | 4 + .../test_slack_channel_integration.py | 16 +- .../test_update_alert_group_slack_message.py | 377 ++++++++++++++++++ engine/apps/slack/tests/test_slack_message.py | 317 +++++++++++++-- engine/apps/slack/utils.py | 5 - engine/settings/celery_task_routes.py | 2 + 16 files changed, 975 insertions(+), 187 deletions(-) rename engine/apps/slack/{ => migrations}/0007_migrate_slackmessage_channel_id.py (100%) create mode 100644 engine/apps/slack/tests/tasks/test_update_alert_group_slack_message.py diff --git a/engine/apps/alerts/models/alert_receive_channel.py b/engine/apps/alerts/models/alert_receive_channel.py index 91a7db6a12..f4661a3a10 100644 --- a/engine/apps/alerts/models/alert_receive_channel.py +++ b/engine/apps/alerts/models/alert_receive_channel.py @@ -29,7 +29,7 @@ metrics_remove_deleted_integration_from_cache, metrics_update_integration_cache, ) -from apps.slack.constants import SLACK_RATE_LIMIT_DELAY, SLACK_RATE_LIMIT_TIMEOUT +from apps.slack.constants import SLACK_RATE_LIMIT_TIMEOUT from apps.slack.tasks import post_slack_rate_limit_message from apps.slack.utils import post_message_to_channel from common.api_helpers.utils import create_engine_url @@ -442,12 +442,14 @@ def is_rate_limited_in_slack(self) -> bool: and self.rate_limited_in_slack_at + SLACK_RATE_LIMIT_TIMEOUT > timezone.now() ) - def start_send_rate_limit_message_task(self, delay=SLACK_RATE_LIMIT_DELAY): + def start_send_rate_limit_message_task(self, error_message_verb: str, delay: int) -> None: task_id = celery_uuid() + self.rate_limit_message_task_id = task_id self.rate_limited_in_slack_at = timezone.now() self.save(update_fields=["rate_limit_message_task_id", "rate_limited_in_slack_at"]) - post_slack_rate_limit_message.apply_async((self.pk,), countdown=delay, task_id=task_id) + + post_slack_rate_limit_message.apply_async((self.pk, error_message_verb), countdown=delay, task_id=task_id) @property def alert_groups_count(self) -> int: diff --git a/engine/apps/slack/alert_group_slack_service.py b/engine/apps/slack/alert_group_slack_service.py index 6ff514040f..019a6e217c 100644 --- a/engine/apps/slack/alert_group_slack_service.py +++ b/engine/apps/slack/alert_group_slack_service.py @@ -3,13 +3,9 @@ from apps.slack.client import SlackClient from apps.slack.errors import ( - SlackAPICantUpdateMessageError, SlackAPIChannelArchivedError, - SlackAPIChannelInactiveError, SlackAPIChannelNotFoundError, SlackAPIInvalidAuthError, - SlackAPIMessageNotFoundError, - SlackAPIRatelimitError, SlackAPITokenError, ) @@ -34,41 +30,12 @@ def __init__( else: self._slack_client = SlackClient(slack_team_identity) - def update_alert_group_slack_message(self, alert_group: "AlertGroup") -> None: - logger.info(f"Update message for alert_group {alert_group.pk}") - - try: - self._slack_client.chat_update( - # TODO: once _channel_id has been fully migrated to channel, remove _channel_id - # see https://raintank-corp.slack.com/archives/C06K1MQ07GS/p173255546 - # channel=alert_group.slack_message.channel.slack_id, - channel=alert_group.slack_message._channel_id, - ts=alert_group.slack_message.slack_id, - attachments=alert_group.render_slack_attachments(), - blocks=alert_group.render_slack_blocks(), - ) - logger.info(f"Message has been updated for alert_group {alert_group.pk}") - except SlackAPIRatelimitError as e: - if not alert_group.channel.is_maintenace_integration: - if not alert_group.channel.is_rate_limited_in_slack: - alert_group.channel.start_send_rate_limit_message_task(e.retry_after) - logger.info( - f"Message has not been updated for alert_group {alert_group.pk} due to slack rate limit." - ) - else: - raise - except ( - SlackAPIMessageNotFoundError, - SlackAPICantUpdateMessageError, - SlackAPIChannelInactiveError, - SlackAPITokenError, - SlackAPIChannelNotFoundError, - ): - pass - def publish_message_to_alert_group_thread( self, alert_group: "AlertGroup", attachments=None, mrkdwn=True, unfurl_links=True, text=None ) -> None: + """ + TODO: refactor this method and move it to the `SlackMessage` model, such that we can remove this class.. + """ # TODO: refactor checking the possibility of sending message to slack # do not try to post message to slack if integration is rate limited if alert_group.channel.is_rate_limited_in_slack: diff --git a/engine/apps/slack/constants.py b/engine/apps/slack/constants.py index f72529fdb8..f21b7299c2 100644 --- a/engine/apps/slack/constants.py +++ b/engine/apps/slack/constants.py @@ -10,7 +10,6 @@ SLACK_RATE_LIMIT_TIMEOUT = datetime.timedelta(minutes=5) SLACK_RATE_LIMIT_DELAY = 10 -CACHE_UPDATE_INCIDENT_SLACK_MESSAGE_LIFETIME = 60 * 10 BLOCK_SECTION_TEXT_MAX_SIZE = 2800 PRIVATE_METADATA_MAX_LENGTH = 3000 diff --git a/engine/apps/slack/0007_migrate_slackmessage_channel_id.py b/engine/apps/slack/migrations/0007_migrate_slackmessage_channel_id.py similarity index 100% rename from engine/apps/slack/0007_migrate_slackmessage_channel_id.py rename to engine/apps/slack/migrations/0007_migrate_slackmessage_channel_id.py diff --git a/engine/apps/slack/models/slack_message.py b/engine/apps/slack/models/slack_message.py index 37fa6f01e8..3355b701c1 100644 --- a/engine/apps/slack/models/slack_message.py +++ b/engine/apps/slack/models/slack_message.py @@ -3,7 +3,10 @@ import typing import uuid +from celery import uuid as celery_uuid +from django.core.cache import cache from django.db import models +from django.utils import timezone from apps.slack.client import SlackClient from apps.slack.constants import BLOCK_SECTION_TEXT_MAX_SIZE @@ -15,6 +18,7 @@ SlackAPIRatelimitError, SlackAPITokenError, ) +from apps.slack.tasks import update_alert_group_slack_message if typing.TYPE_CHECKING: from apps.alerts.models import AlertGroup @@ -30,6 +34,8 @@ class SlackMessage(models.Model): alert_group: typing.Optional["AlertGroup"] channel: "SlackChannel" + ALERT_GROUP_UPDATE_DEBOUNCE_INTERVAL_SECONDS = 45 + id = models.CharField(primary_key=True, default=uuid.uuid4, editable=False, max_length=36) slack_id = models.CharField(max_length=100) @@ -85,7 +91,7 @@ class SlackMessage(models.Model): active_update_task_id = models.CharField(max_length=100, null=True, default=None) """ - ID of the latest celery task to update the message + DEPRECATED/TODO: drop this field in a separate PR/release """ class Meta: @@ -259,3 +265,87 @@ def send_slack_notification( slack_user_identity.send_link_to_slack_message(slack_message) except (SlackAPITokenError, SlackAPIMethodNotSupportedForChannelTypeError): pass + + def _get_update_message_cache_key(self) -> str: + return f"update_alert_group_slack_message_{self.alert_group.pk}" + + def get_active_update_task_id(self) -> typing.Optional[str]: + return cache.get(self._get_update_message_cache_key(), default=None) + + def set_active_update_task_id(self, task_id: str) -> None: + """ + NOTE: we store the task ID in the cache for twice the debounce interval to ensure that the task ID is + EVENTUALLY removed. The background task which updates the message will remove the task ID from the cache, but + this is a safety measure in case the task fails to run or complete. The task ID would be removed from the cache + which would then allow the message to be updated again in a subsequent call to this method. + """ + cache.set( + self._get_update_message_cache_key(), + task_id, + timeout=self.ALERT_GROUP_UPDATE_DEBOUNCE_INTERVAL_SECONDS * 2, + ) + + def mark_active_update_task_as_complete(self) -> None: + self.last_updated = timezone.now() + self.save(update_fields=["last_updated"]) + + cache.delete(self._get_update_message_cache_key()) + + def update_alert_groups_message(self, debounce: bool) -> None: + """ + Schedule an update task for the associated alert group's Slack message, respecting the debounce interval. + + This method ensures that updates to the Slack message related to an alert group are not performed + too frequently, adhering to the `ALERT_GROUP_UPDATE_DEBOUNCE_INTERVAL_SECONDS` debounce interval. + It schedules a background task to update the message after the appropriate countdown. + + The method performs the following steps: + - Checks if there's already an active update task ID set in the cache. If so, exits to prevent + duplicate scheduling. + - Calculates the time since the last update (`last_updated` field) and determines the remaining time needed + to respect the debounce interval. + - Schedules the `update_alert_group_slack_message` task with the calculated countdown. + - Stores the task ID in the cache to prevent multiple tasks from being scheduled. + + debounce: bool - this is intended to be used when we want to debounce updates to the message. Examples: + - when set to True, we will skip scheduling an update task if there's an active update task (eg. debounce it) + - when set to False, we will immediately schedule an update task + """ + if not self.alert_group: + logger.warning( + f"skipping update_alert_groups_message as SlackMessage {self.pk} has no alert_group associated with it" + ) + return + + active_update_task_id = self.get_active_update_task_id() + if debounce and active_update_task_id is not None: + logger.info( + f"skipping update_alert_groups_message as SlackMessage {self.pk} has an active update task " + f"{active_update_task_id} and debounce is set to True" + ) + return + + now = timezone.now() + + # we previously weren't updating the last_updated field for messages, so there will be cases + # where the last_updated field is None + last_updated = self.last_updated or now + + time_since_last_update = (now - last_updated).total_seconds() + remaining_time = self.ALERT_GROUP_UPDATE_DEBOUNCE_INTERVAL_SECONDS - int(time_since_last_update) + countdown = max(remaining_time, 10) if debounce else 0 + + logger.info( + f"updating message for alert_group {self.alert_group.pk} in {countdown} seconds " + f"(debounce interval: {self.ALERT_GROUP_UPDATE_DEBOUNCE_INTERVAL_SECONDS})" + ) + + task_id = celery_uuid() + + # NOTE: we need to persist the task ID in the cache before scheduling the task to prevent + # a race condition where the task starts before the task ID is stored in the cache as the task + # does a check to verify that the celery task id matches the one stored in the cache + # + # (see update_alert_group_slack_message task for more details) + self.set_active_update_task_id(task_id) + update_alert_group_slack_message.apply_async((self.pk,), countdown=countdown, task_id=task_id) diff --git a/engine/apps/slack/scenarios/distribute_alerts.py b/engine/apps/slack/scenarios/distribute_alerts.py index cd088bf7b5..8be2095f05 100644 --- a/engine/apps/slack/scenarios/distribute_alerts.py +++ b/engine/apps/slack/scenarios/distribute_alerts.py @@ -3,15 +3,12 @@ import typing from datetime import datetime -from django.core.cache import cache - from apps.alerts.constants import ActionSource from apps.alerts.incident_appearance.renderers.constants import DEFAULT_BACKUP_TITLE from apps.alerts.incident_appearance.renderers.slack_renderer import AlertSlackRenderer from apps.alerts.models import Alert, AlertGroup, AlertGroupLogRecord, AlertReceiveChannel, Invitation from apps.api.permissions import RBACPermission from apps.slack.chatops_proxy_routing import make_private_metadata, make_value -from apps.slack.constants import CACHE_UPDATE_INCIDENT_SLACK_MESSAGE_LIFETIME from apps.slack.errors import ( SlackAPIChannelArchivedError, SlackAPIChannelNotFoundError, @@ -25,7 +22,7 @@ from apps.slack.models import SlackTeamIdentity, SlackUserIdentity from apps.slack.scenarios import scenario_step from apps.slack.slack_formatter import SlackFormatter -from apps.slack.tasks import send_message_to_thread_if_bot_not_in_channel, update_incident_slack_message +from apps.slack.tasks import send_message_to_thread_if_bot_not_in_channel from apps.slack.types import ( Block, BlockActionType, @@ -36,7 +33,6 @@ PayloadType, ScenarioRoute, ) -from apps.slack.utils import get_cache_key_update_incident_slack_message from common.utils import clean_markup, is_string_with_visible_characters from .step_mixins import AlertGroupActionsMixin @@ -116,6 +112,7 @@ def process_signal(self, alert: Alert) -> None: # do not try to post alert group message to slack if its channel is rate limited if alert_receive_channel.is_rate_limited_in_slack: logger.info("Skip posting or updating alert_group in Slack due to rate limit") + AlertGroup.objects.filter( pk=alert_group_pk, slack_message_sent=False, @@ -184,9 +181,9 @@ def process_signal(self, alert: Alert) -> None: if not alert_receive_channel.is_maintenace_integration: # we do not want to rate limit maintenace alerts.. reason_to_skip_escalation = AlertGroup.RATE_LIMITED - extra_log_msg += ( - f" integration is a maintenance integration alert_receive_channel={alert_receive_channel}" - ) + extra_log_msg += f" integration is a maintenance integration alert_receive_channel={alert_receive_channel.pk}" + + alert_receive_channel.start_send_rate_limit_message_task("Delivering", e.retry_after) else: reraise_exception = True elif isinstance(e, SlackAPITokenError): @@ -268,18 +265,24 @@ def process_signal(self, alert: Alert) -> None: else: # if a new alert comes in, and is grouped to an alert group that has already been posted to Slack, # then we will update that existing Slack message - if not should_skip_escalation_in_slack: - update_task_id = update_incident_slack_message.apply_async( - (self.slack_team_identity.pk, alert_group_pk), - countdown=10, + + alert_group_slack_message = alert_group.slack_message + if not alert_group_slack_message: + logger.info( + f"Skip updating alert group in Slack because alert_group {alert_group_pk} doesn't " + "have a slack message associated with it" ) - cache.set( - get_cache_key_update_incident_slack_message(alert_group_pk), - update_task_id, - timeout=CACHE_UPDATE_INCIDENT_SLACK_MESSAGE_LIFETIME, + return + elif should_skip_escalation_in_slack: + logger.info( + f"Skip updating alert group in Slack because alert_group {alert_group_pk} is set to skip escalation" ) - else: - logger.info("Skip updating alert_group in Slack due to rate limit") + return + + # NOTE: very important. We need to debounce the update_alert_groups_message call here. This is because + # we may possibly receive a flood of incoming alerts. We do not want to trigger a Slack message update + # for each of these, and hence we should instead debounce them + alert_group_slack_message.update_alert_groups_message(debounce=True) def process_scenario( self, @@ -324,13 +327,16 @@ def process_scenario( # for old version with user slack_id selection warning_text = "Oops! Something goes wrong, please try again" self.open_warning_window(payload, warning_text) + if selected_user is not None: Invitation.invite_user(selected_user, alert_group, self.user) else: - self.alert_group_slack_service.update_alert_group_slack_message(alert_group) + # don't debounce, so that we update the message immediately, this isn't a high traffic activity + alert_group.slack_message.update_alert_groups_message(debounce=False) def process_signal(self, log_record: AlertGroupLogRecord) -> None: - self.alert_group_slack_service.update_alert_group_slack_message(log_record.alert_group) + # don't debounce, so that we update the message immediately, this isn't a high traffic activity + log_record.alert_group.slack_message.update_alert_groups_message(debounce=False) class SilenceGroupStep(AlertGroupActionsMixin, scenario_step.ScenarioStep): @@ -360,7 +366,8 @@ def process_scenario( ) def process_signal(self, log_record: AlertGroupLogRecord) -> None: - self.alert_group_slack_service.update_alert_group_slack_message(log_record.alert_group) + # don't debounce, so that we update the message immediately, this isn't a high traffic activity + log_record.alert_group.slack_message.update_alert_groups_message(debounce=False) class UnSilenceGroupStep(AlertGroupActionsMixin, scenario_step.ScenarioStep): @@ -381,7 +388,8 @@ def process_scenario( alert_group.un_silence_by_user_or_backsync(self.user, action_source=ActionSource.SLACK) def process_signal(self, log_record: AlertGroupLogRecord) -> None: - self.alert_group_slack_service.update_alert_group_slack_message(log_record.alert_group) + # don't debounce, so that we update the message immediately, this isn't a high traffic activity + log_record.alert_group.slack_message.update_alert_groups_message(debounce=False) class SelectAttachGroupStep(AlertGroupActionsMixin, scenario_step.ScenarioStep): @@ -555,7 +563,8 @@ def process_signal(self, log_record: AlertGroupLogRecord) -> None: unfurl_links=True, ) - self.alert_group_slack_service.update_alert_group_slack_message(alert_group) + # don't debounce, so that we update the message immediately, this isn't a high traffic activity + alert_group.slack_message.update_alert_groups_message(debounce=False) def process_scenario( self, @@ -625,7 +634,8 @@ def process_scenario( alert_group.un_attach_by_user(self.user, action_source=ActionSource.SLACK) def process_signal(self, log_record: AlertGroupLogRecord) -> None: - self.alert_group_slack_service.update_alert_group_slack_message(log_record.alert_group) + # don't debounce, so that we update the message immediately, this isn't a high traffic activity + log_record.alert_group.slack_message.update_alert_groups_message(debounce=False) class StopInvitationProcess(AlertGroupActionsMixin, scenario_step.ScenarioStep): @@ -658,7 +668,8 @@ def process_scenario( Invitation.stop_invitation(invitation_id, self.user) def process_signal(self, log_record: AlertGroupLogRecord) -> None: - self.alert_group_slack_service.update_alert_group_slack_message(log_record.invitation.alert_group) + # don't debounce, so that we update the message immediately, this isn't a high traffic activity + log_record.alert_group.slack_message.update_alert_groups_message(debounce=False) class ResolveGroupStep(AlertGroupActionsMixin, scenario_step.ScenarioStep): @@ -696,11 +707,11 @@ def process_scenario( alert_group.resolve_by_user_or_backsync(self.user, action_source=ActionSource.SLACK) def process_signal(self, log_record: AlertGroupLogRecord) -> None: - alert_group = log_record.alert_group # Do not rerender alert_groups which happened while maintenance. # They have no slack messages, since they just attached to the maintenance incident. - if not alert_group.happened_while_maintenance: - self.alert_group_slack_service.update_alert_group_slack_message(alert_group) + if not log_record.alert_group.happened_while_maintenance: + # don't debounce, so that we update the message immediately, this isn't a high traffic activity + log_record.alert_group.slack_message.update_alert_groups_message(debounce=False) class UnResolveGroupStep(AlertGroupActionsMixin, scenario_step.ScenarioStep): @@ -721,7 +732,8 @@ def process_scenario( alert_group.un_resolve_by_user_or_backsync(self.user, action_source=ActionSource.SLACK) def process_signal(self, log_record: AlertGroupLogRecord) -> None: - self.alert_group_slack_service.update_alert_group_slack_message(log_record.alert_group) + # don't debounce, so that we update the message immediately, this isn't a high traffic activity + log_record.alert_group.slack_message.update_alert_groups_message(debounce=False) class AcknowledgeGroupStep(AlertGroupActionsMixin, scenario_step.ScenarioStep): @@ -742,7 +754,8 @@ def process_scenario( alert_group.acknowledge_by_user_or_backsync(self.user, action_source=ActionSource.SLACK) def process_signal(self, log_record: AlertGroupLogRecord) -> None: - self.alert_group_slack_service.update_alert_group_slack_message(log_record.alert_group) + # don't debounce, so that we update the message immediately, this isn't a high traffic activity + log_record.alert_group.slack_message.update_alert_groups_message(debounce=False) class UnAcknowledgeGroupStep(AlertGroupActionsMixin, scenario_step.ScenarioStep): @@ -811,7 +824,8 @@ def process_signal(self, log_record: AlertGroupLogRecord) -> None: alert_group, attachments=message_attachments, text=text ) - self.alert_group_slack_service.update_alert_group_slack_message(alert_group) + # don't debounce, so that we update the message immediately, this isn't a high traffic activity + slack_message.update_alert_groups_message(debounce=False) logger.debug(f"Finished process_signal in UnAcknowledgeGroupStep for alert_group {alert_group.pk}") @@ -932,7 +946,8 @@ def process_signal(self, log_record: AlertGroupLogRecord) -> None: text=f"Wiped by {log_record.author.get_username_with_slack_verbal()}", ) - self.alert_group_slack_service.update_alert_group_slack_message(alert_group) + # don't debounce, so that we update the message immediately, this isn't a high traffic activity + alert_group.slack_message.update_alert_groups_message(debounce=False) class DeleteGroupStep(scenario_step.ScenarioStep): diff --git a/engine/apps/slack/scenarios/resolution_note.py b/engine/apps/slack/scenarios/resolution_note.py index 130a102f85..9fe9536871 100644 --- a/engine/apps/slack/scenarios/resolution_note.py +++ b/engine/apps/slack/scenarios/resolution_note.py @@ -222,7 +222,8 @@ def process_scenario( except SlackAPIError: pass - self.alert_group_slack_service.update_alert_group_slack_message(alert_group) + # don't debounce, so that we update the message immediately, this isn't a high traffic activity + slack_message.update_alert_groups_message(debounce=False) else: warning_text = "Unable to add this message to resolution note." self.open_warning_window(payload, warning_text) @@ -261,6 +262,7 @@ def post_or_update_resolution_note_in_thread(self, resolution_note: "ResolutionN resolution_note_slack_message = resolution_note.resolution_note_slack_message alert_group = resolution_note.alert_group alert_group_slack_message = alert_group.slack_message + blocks = self.get_resolution_note_blocks(resolution_note) # TODO: once _channel_id has been fully migrated to channel, remove _channel_id @@ -321,7 +323,8 @@ def post_or_update_resolution_note_in_thread(self, resolution_note: "ResolutionN def update_alert_group_resolution_note_button(self, alert_group: "AlertGroup") -> None: if alert_group.slack_message is not None: - self.alert_group_slack_service.update_alert_group_slack_message(alert_group) + # don't debounce, so that we update the message immediately, this isn't a high traffic activity + alert_group.slack_message.update_alert_groups_message(debounce=False) def add_resolution_note_reaction(self, slack_thread_message: "ResolutionNoteSlackMessage"): try: diff --git a/engine/apps/slack/scenarios/slack_channel_integration.py b/engine/apps/slack/scenarios/slack_channel_integration.py index 4d9b6be314..140b2af7b3 100644 --- a/engine/apps/slack/scenarios/slack_channel_integration.py +++ b/engine/apps/slack/scenarios/slack_channel_integration.py @@ -145,9 +145,10 @@ def delete_thread_message_from_resolution_note( except ResolutionNoteSlackMessage.DoesNotExist: pass else: - alert_group = slack_thread_message.alert_group slack_thread_message.delete() - self.alert_group_slack_service.update_alert_group_slack_message(alert_group) + + # don't debounce, so that we update the message immediately, this isn't a high traffic activity + slack_thread_message.alert_group.slack_message.update_alert_groups_message(debounce=False) STEPS_ROUTING: ScenarioRoute.RoutingSteps = [ diff --git a/engine/apps/slack/tasks.py b/engine/apps/slack/tasks.py index bfd6f22fd5..deff2cba28 100644 --- a/engine/apps/slack/tasks.py +++ b/engine/apps/slack/tasks.py @@ -11,19 +11,19 @@ from apps.slack.alert_group_slack_service import AlertGroupSlackService from apps.slack.client import SlackClient -from apps.slack.constants import CACHE_UPDATE_INCIDENT_SLACK_MESSAGE_LIFETIME, SLACK_BOT_ID +from apps.slack.constants import SLACK_BOT_ID from apps.slack.errors import ( + SlackAPICantUpdateMessageError, + SlackAPIChannelInactiveError, + SlackAPIChannelNotFoundError, SlackAPIInvalidAuthError, + SlackAPIMessageNotFoundError, SlackAPIPlanUpgradeRequiredError, SlackAPIRatelimitError, SlackAPITokenError, SlackAPIUsergroupNotFoundError, ) -from apps.slack.utils import ( - get_cache_key_update_incident_slack_message, - get_populate_slack_channel_task_id_key, - post_message_to_channel, -) +from apps.slack.utils import get_populate_slack_channel_task_id_key, post_message_to_channel from common.custom_celery_tasks import shared_dedicated_queue_retry_task from common.utils import batch_queryset @@ -32,39 +32,120 @@ @shared_dedicated_queue_retry_task(autoretry_for=(Exception,), retry_backoff=True) -def update_incident_slack_message(slack_team_identity_pk, alert_group_pk): - cache_key = get_cache_key_update_incident_slack_message(alert_group_pk) - cached_task_id = cache.get(cache_key) - current_task_id = update_incident_slack_message.request.id - - if cached_task_id is None: - update_task_id = update_incident_slack_message.apply_async( - (slack_team_identity_pk, alert_group_pk), - countdown=10, +def update_alert_group_slack_message(slack_message_pk: int) -> None: + """ + Background task to update the Slack message for an alert group. + + This function is intended to be executed as a Celery task. It performs the following: + - Compares the current task ID with the task ID stored in the cache. + - If they do not match, it means a newer task has been scheduled, so the current task exits to prevent duplicated updates. + - Does the actual update of the Slack message. + - Upon successful completion, clears the task ID from the cache to allow future updates (also note that + the task ID is set in the cache with a timeout, so it will be automatically cleared after a certain period, even + if this task fails to clear it. See `SlackMessage.update_alert_groups_message` for more details). + + Args: + slack_message_pk (int): The primary key of the `SlackMessage` instance to update. + """ + from apps.slack.models import SlackMessage + + current_task_id = update_alert_group_slack_message.request.id + + logger.info( + f"update_alert_group_slack_message for slack message {slack_message_pk} started with task_id {current_task_id}" + ) + + try: + slack_message = SlackMessage.objects.get(pk=slack_message_pk) + except SlackMessage.DoesNotExist: + logger.warning(f"SlackMessage {slack_message_pk} doesn't exist") + return + + active_update_task_id = slack_message.get_active_update_task_id() + if current_task_id != active_update_task_id: + logger.warning( + f"update_alert_group_slack_message skipped, because current_task_id ({current_task_id}) " + f"does not equal to active_update_task_id ({active_update_task_id}) " ) - cache.set(cache_key, update_task_id, timeout=CACHE_UPDATE_INCIDENT_SLACK_MESSAGE_LIFETIME) + return - return ( - f"update_incident_slack_message rescheduled because of current task_id ({current_task_id})" - f" for alert_group {alert_group_pk} doesn't exist in cache" + alert_group = slack_message.alert_group + if not alert_group: + logger.warning( + f"skipping update_alert_group_slack_message as SlackMessage {slack_message_pk} " + "doesn't have an alert group associated with it" ) - if not current_task_id == cached_task_id: - return ( - f"update_incident_slack_message skipped, because of current task_id ({current_task_id})" - f" doesn't equal to cached task_id ({cached_task_id}) for alert_group {alert_group_pk}" + return + + alert_group_pk = alert_group.pk + alert_receive_channel = alert_group.channel + alert_receive_channel_is_rate_limited = alert_receive_channel.is_rate_limited_in_slack + + if alert_group.skip_escalation_in_slack: + logger.warning( + f"skipping update_alert_group_slack_message as AlertGroup {alert_group_pk} " + "has skip_escalation_in_slack set to True" ) + return + elif alert_receive_channel_is_rate_limited: + logger.warning( + f"skipping update_alert_group_slack_message as AlertGroup {alert_group.pk}'s " + f"integration ({alert_receive_channel.pk}) is rate-limited" + ) + return + + slack_client = SlackClient(slack_message.slack_team_identity) + try: + slack_client.chat_update( + # TODO: once _channel_id has been fully migrated to channel, remove _channel_id + # see https://raintank-corp.slack.com/archives/C06K1MQ07GS/p173255546 + # channel=slack_message.channel.slack_id, + channel=slack_message._channel_id, + ts=slack_message.slack_id, + attachments=alert_group.render_slack_attachments(), + blocks=alert_group.render_slack_blocks(), + ) + + logger.info(f"Message has been updated for alert_group {alert_group_pk}") + except SlackAPIRatelimitError as e: + if not alert_receive_channel.is_maintenace_integration: + if not alert_receive_channel_is_rate_limited: + alert_receive_channel.start_send_rate_limit_message_task("Updating", e.retry_after) + logger.info(f"Message has not been updated for alert_group {alert_group_pk} due to slack rate limit.") + else: + raise + except ( + SlackAPIMessageNotFoundError, + SlackAPICantUpdateMessageError, + SlackAPIChannelInactiveError, + SlackAPITokenError, + SlackAPIChannelNotFoundError, + ): + pass + + slack_message.mark_active_update_task_as_complete() + + +@shared_dedicated_queue_retry_task(autoretry_for=(Exception,), retry_backoff=True) +def update_incident_slack_message(slack_team_identity_pk: int, alert_group_pk: int) -> None: + """ + TODO: this method has been deprecated, and all references to it removed, remove it once task queues no + longer reference it. + """ from apps.alerts.models import AlertGroup - from apps.slack.models import SlackTeamIdentity - slack_team_identity = SlackTeamIdentity.objects.get(pk=slack_team_identity_pk) alert_group = AlertGroup.objects.get(pk=alert_group_pk) - if alert_group.skip_escalation_in_slack or alert_group.channel.is_rate_limited_in_slack: - return "Skip message update in Slack due to rate limit" - if alert_group.slack_message is None: - return "Skip message update in Slack due to absence of slack message" - AlertGroupSlackService(slack_team_identity).update_alert_group_slack_message(alert_group) + # NOTE: alert_group can't be None here, AlertGroup.objects.get(pk=alert_group_pk) would + # raise AlertGroup.DoesNotExist in this case + if not alert_group.slack_message: + logger.info( + f"skipping update_incident_slack_message as AlertGroup {alert_group_pk} doesn't have a slack message" + ) + return + + alert_group.slack_message.update_alert_groups_message(debounce=False) @shared_dedicated_queue_retry_task(autoretry_for=(Exception,), retry_backoff=True) @@ -153,7 +234,6 @@ def send_message_to_thread_if_bot_not_in_channel( """ Send message to alert group's thread if bot is not in current channel """ - from apps.alerts.models import AlertGroup from apps.slack.models import SlackTeamIdentity @@ -286,7 +366,13 @@ def populate_slack_user_identities(organization_pk): @shared_dedicated_queue_retry_task( autoretry_for=(Exception,), retry_backoff=True, max_retries=1 if settings.DEBUG else None ) -def post_slack_rate_limit_message(integration_id): +def post_slack_rate_limit_message(integration_id: int, error_message_verb: typing.Optional[str] = None) -> None: + """ + NOTE: error_message_verb was added to the function signature to allow for more descriptive error messages. + + We set it to None by default to maintain backwards compatibility with existing tasks. The default of None + can likely be removed in the near future (once existing tasks on the queue have been processed). + """ from apps.alerts.models import AlertReceiveChannel try: @@ -304,11 +390,14 @@ def post_slack_rate_limit_message(integration_id): default_route = integration.channel_filters.get(is_default=True) if (slack_channel := default_route.slack_channel_or_org_default) is not None: + # NOTE: see function docstring above 👆 + if error_message_verb is None: + error_message_verb = "Sending messages for" + text = ( - f"Delivering and updating alert groups of integration {integration.verbal_name} in Slack is " - f"temporarily stopped due to rate limit. You could find new alert groups at " - f"<{integration.new_incidents_web_link}|web page " - '"Alert Groups">' + f"{error_message_verb} Alert Groups in Slack, for integration {integration.verbal_name}, is " + f"temporarily rate-limited (due to a Slack rate-limit). Meanwhile, you can still find new Alert Groups " + f'in the <{integration.new_incidents_web_link}|"Alert Groups" web page>' ) post_message_to_channel(integration.organization, slack_channel.slack_id, text) diff --git a/engine/apps/slack/tests/scenario_steps/test_distribute_alerts.py b/engine/apps/slack/tests/scenario_steps/test_distribute_alerts.py index e815d78036..06017ef996 100644 --- a/engine/apps/slack/tests/scenario_steps/test_distribute_alerts.py +++ b/engine/apps/slack/tests/scenario_steps/test_distribute_alerts.py @@ -2,7 +2,6 @@ from unittest.mock import patch import pytest -from django.core.cache import cache from django.utils import timezone from apps.alerts.models import AlertGroup, AlertReceiveChannel @@ -10,7 +9,6 @@ from apps.slack.models import SlackMessage from apps.slack.scenarios.distribute_alerts import IncomingAlertStep from apps.slack.tests.conftest import build_slack_response -from apps.slack.utils import get_cache_key_update_incident_slack_message SLACK_MESSAGE_TS = "1234567890.123456" SLACK_POST_MESSAGE_SUCCESS_RESPONSE = {"ts": SLACK_MESSAGE_TS} @@ -283,22 +281,20 @@ def test_process_signal_send_message_to_thread_if_bot_not_in_channel( ) @patch("apps.slack.client.SlackClient.chat_postMessage") - @patch("apps.slack.scenarios.distribute_alerts.update_incident_slack_message") + @patch("apps.slack.models.SlackMessage.update_alert_groups_message") @pytest.mark.django_db def test_process_signal_update_existing_message( self, - mock_update_incident_slack_message, + mock_update_alert_groups_message, mock_chat_postMessage, make_slack_team_identity, make_slack_channel, + make_slack_message, make_organization, make_alert_receive_channel, make_alert_group, make_alert, ): - mocked_update_incident_task_id = "1234" - mock_update_incident_slack_message.apply_async.return_value = mocked_update_incident_task_id - slack_team_identity = make_slack_team_identity() slack_channel = make_slack_channel(slack_team_identity) organization = make_organization(slack_team_identity=slack_team_identity, default_slack_channel=slack_channel) @@ -310,6 +306,8 @@ def test_process_signal_update_existing_message( slack_message_sent=True, reason_to_skip_escalation=AlertGroup.NO_REASON, ) + make_slack_message(slack_channel, alert_group=alert_group) + assert alert_group.skip_escalation_in_slack is False alert = make_alert(alert_group, raw_request_data={}) @@ -317,23 +315,20 @@ def test_process_signal_update_existing_message( step = IncomingAlertStep(slack_team_identity) step.process_signal(alert) - # assert that the background task is scheduled - mock_update_incident_slack_message.apply_async.assert_called_once_with( - (slack_team_identity.pk, alert_group.pk), countdown=10 - ) + # assert that the SlackMessage is updated, and that it is debounced + mock_update_alert_groups_message.assert_called_once_with(debounce=True) mock_chat_postMessage.assert_not_called() - # Verify that the cache is set correctly - assert cache.get(get_cache_key_update_incident_slack_message(alert_group.pk)) == mocked_update_incident_task_id - @patch("apps.slack.client.SlackClient.chat_postMessage") - @patch("apps.slack.scenarios.distribute_alerts.update_incident_slack_message") + @patch("apps.slack.models.SlackMessage.update_alert_groups_message") @pytest.mark.django_db def test_process_signal_do_not_update_due_to_skip_escalation( self, - mock_update_incident_slack_message, + mock_update_alert_groups_message, mock_chat_postMessage, make_organization_with_slack_team_identity, + make_slack_channel, + make_slack_message, make_alert_receive_channel, make_alert_group, make_alert, @@ -343,6 +338,7 @@ def test_process_signal_do_not_update_due_to_skip_escalation( """ organization, slack_team_identity = make_organization_with_slack_team_identity() alert_receive_channel = make_alert_receive_channel(organization) + slack_channel = make_slack_channel(slack_team_identity) # Simulate that slack_message_sent is already True and skip escalation due to RATE_LIMITED alert_group = make_alert_group( @@ -351,12 +347,13 @@ def test_process_signal_do_not_update_due_to_skip_escalation( reason_to_skip_escalation=AlertGroup.RATE_LIMITED, # Ensures skip_escalation_in_slack is True ) alert = make_alert(alert_group, raw_request_data={}) + make_slack_message(slack_channel, alert_group=alert_group) step = IncomingAlertStep(slack_team_identity) step.process_signal(alert) - # assert that the background task is not scheduled - mock_update_incident_slack_message.apply_async.assert_not_called() + # assert that we don't update the SlackMessage + mock_update_alert_groups_message.assert_not_called() mock_chat_postMessage.assert_not_called() @patch("apps.slack.client.SlackClient.chat_postMessage", side_effect=TimeoutError) @@ -399,6 +396,7 @@ def test_process_signal_timeout_error( assert SlackMessage.objects.count() == 0 assert not alert.delivered + @patch("apps.alerts.models.AlertReceiveChannel.start_send_rate_limit_message_task") @pytest.mark.parametrize( "reason,slack_error", [ @@ -412,6 +410,7 @@ def test_process_signal_timeout_error( @pytest.mark.django_db def test_process_signal_slack_errors( self, + mock_start_send_rate_limit_message_task, make_slack_team_identity, make_organization, make_alert_receive_channel, @@ -433,7 +432,8 @@ def test_process_signal_slack_errors( with patch.object(step._slack_client, "chat_postMessage") as mock_chat_postMessage: error_response = build_slack_response({"error": slack_error}) error_class = get_error_class(error_response) - mock_chat_postMessage.side_effect = error_class(error_response) + slack_error_raised = error_class(error_response) + mock_chat_postMessage.side_effect = slack_error_raised step.process_signal(alert) @@ -446,6 +446,13 @@ def test_process_signal_slack_errors( blocks=alert_group.render_slack_blocks(), ) + if error_class == SlackAPIRatelimitError: + mock_start_send_rate_limit_message_task.assert_called_once_with( + "Delivering", slack_error_raised.retry_after + ) + else: + mock_start_send_rate_limit_message_task.assert_not_called() + # For these Slack errors, retrying won't really help, so we should not set slack_message_sent back to False assert alert_group.slack_message_sent is True @@ -454,6 +461,7 @@ def test_process_signal_slack_errors( assert SlackMessage.objects.count() == 0 assert not alert.delivered + @patch("apps.alerts.models.AlertReceiveChannel.start_send_rate_limit_message_task") @patch( "apps.slack.client.SlackClient.chat_postMessage", side_effect=SlackAPIRatelimitError(build_slack_response({"error": "ratelimited"})), @@ -462,6 +470,7 @@ def test_process_signal_slack_errors( def test_process_signal_slack_api_ratelimit_for_maintenance_integration( self, mock_chat_postMessage, + mock_start_send_rate_limit_message_task, make_slack_team_identity, make_slack_channel, make_organization, @@ -496,6 +505,8 @@ def test_process_signal_slack_api_ratelimit_for_maintenance_integration( alert_group.refresh_from_db() + mock_start_send_rate_limit_message_task.assert_not_called() + # Ensure that slack_message_sent is set back to False, this will allow us to retry.. a SlackAPIRatelimitError, # may have been a transient error that is "recoverable" # diff --git a/engine/apps/slack/tests/scenario_steps/test_resolution_note.py b/engine/apps/slack/tests/scenario_steps/test_resolution_note.py index 0f1598041c..28124202b5 100644 --- a/engine/apps/slack/tests/scenario_steps/test_resolution_note.py +++ b/engine/apps/slack/tests/scenario_steps/test_resolution_note.py @@ -347,12 +347,14 @@ def test_resolution_notes_modal_closed_before_update( assert call_args[0] == "views.update" +@patch("apps.slack.models.SlackMessage.update_alert_groups_message") @patch.object(SlackClient, "reactions_add") @patch.object(SlackClient, "chat_getPermalink", return_value={"permalink": "https://example.com"}) @pytest.mark.django_db def test_add_to_resolution_note( _mock_chat_getPermalink, mock_reactions_add, + mock_update_alert_groups_message, make_organization_and_user_with_slack_identities, make_alert_receive_channel, make_alert_group, @@ -386,6 +388,8 @@ def test_add_to_resolution_note( step.process_scenario(slack_user_identity, slack_team_identity, payload) mock_reactions_add.assert_called_once() + mock_update_alert_groups_message.assert_called_once_with(debounce=False) + assert alert_group.resolution_notes.get().text == "Test resolution note" diff --git a/engine/apps/slack/tests/scenario_steps/test_slack_channel_integration.py b/engine/apps/slack/tests/scenario_steps/test_slack_channel_integration.py index 993692d6bc..a081cb0e06 100644 --- a/engine/apps/slack/tests/scenario_steps/test_slack_channel_integration.py +++ b/engine/apps/slack/tests/scenario_steps/test_slack_channel_integration.py @@ -393,8 +393,9 @@ def test_delete_thread_message_from_resolution_note_no_slack_user_identity( MockResolutionNoteSlackMessage.objects.get.assert_not_called() + @patch("apps.slack.models.SlackMessage.update_alert_groups_message") def test_delete_thread_message_from_resolution_note_no_message_found( - self, make_organization_and_user_with_slack_identities + self, mock_update_alert_groups_message, make_organization_and_user_with_slack_identities ) -> None: ( organization, @@ -418,19 +419,20 @@ def test_delete_thread_message_from_resolution_note_no_message_found( } step = SlackChannelMessageEventStep(slack_team_identity, organization, user) - step.alert_group_slack_service = Mock() - step.delete_thread_message_from_resolution_note(slack_user_identity, payload) - step.alert_group_slack_service.assert_not_called() + mock_update_alert_groups_message.assert_not_called() + @patch("apps.slack.models.SlackMessage.update_alert_groups_message") def test_delete_thread_message_from_resolution_note( self, + mock_update_alert_groups_message, make_organization_and_user_with_slack_identities, make_alert_receive_channel, make_alert_group, make_resolution_note_slack_message, make_slack_channel, + make_slack_message, ) -> None: channel_id = "potato" ts = 88945.4849 @@ -445,6 +447,7 @@ def test_delete_thread_message_from_resolution_note( slack_channel = make_slack_channel(slack_team_identity, slack_id=channel_id) integration = make_alert_receive_channel(organization) alert_group = make_alert_group(integration) + make_slack_message(alert_group=alert_group, slack_id=thread_ts, channel=slack_channel) payload = { "event": { @@ -461,11 +464,8 @@ def test_delete_thread_message_from_resolution_note( ) step = SlackChannelMessageEventStep(slack_team_identity, organization, user) - step.alert_group_slack_service = Mock() - step.delete_thread_message_from_resolution_note(slack_user_identity, payload) - step.alert_group_slack_service.update_alert_group_slack_message.assert_called_once_with(alert_group) assert ( ResolutionNoteSlackMessage.objects.filter( ts=ts, @@ -475,6 +475,8 @@ def test_delete_thread_message_from_resolution_note( == 0 ) + mock_update_alert_groups_message.assert_called_once_with(debounce=False) + def test_slack_message_has_no_alert_group( self, make_organization_and_user_with_slack_identities, diff --git a/engine/apps/slack/tests/tasks/test_update_alert_group_slack_message.py b/engine/apps/slack/tests/tasks/test_update_alert_group_slack_message.py new file mode 100644 index 0000000000..ee03833229 --- /dev/null +++ b/engine/apps/slack/tests/tasks/test_update_alert_group_slack_message.py @@ -0,0 +1,377 @@ +from unittest.mock import patch + +import pytest +from django.utils import timezone + +from apps.alerts.models import AlertGroup +from apps.slack.errors import ( + SlackAPICantUpdateMessageError, + SlackAPIChannelInactiveError, + SlackAPIChannelNotFoundError, + SlackAPIMessageNotFoundError, + SlackAPIRatelimitError, + SlackAPITokenError, +) +from apps.slack.tasks import update_alert_group_slack_message +from apps.slack.tests.conftest import build_slack_response + + +@pytest.fixture +def mocked_rate_limited_slack_response(): + return build_slack_response({}, status_code=429, headers={"Retry-After": 123}) + + +class TestUpdateAlertGroupSlackMessageTask: + @pytest.mark.django_db + def test_update_alert_group_slack_message_slack_message_not_found(self): + """ + Test that the task exits early if SlackMessage does not exist. + """ + # No need to patch anything, just run the task with a non-existing pk + update_alert_group_slack_message.apply((99999,), task_id="task-id") + + # Since there is no exception raised, the test passes + + @patch("apps.slack.tasks.SlackClient.chat_update") + @pytest.mark.django_db + def test_update_alert_group_slack_message_task_id_mismatch( + self, + mock_chat_update, + make_organization_with_slack_team_identity, + make_alert_receive_channel, + make_slack_channel, + make_slack_message, + make_alert_group, + make_alert, + ): + """ + Test that the task exits early if current_task_id doesn't match the task ID that exists in the cache + """ + organization, slack_team_identity = make_organization_with_slack_team_identity() + alert_receive_channel = make_alert_receive_channel(organization) + alert_group = make_alert_group(alert_receive_channel) + make_alert(alert_group=alert_group, raw_request_data={}) + slack_channel = make_slack_channel(slack_team_identity) + + slack_message = make_slack_message(alert_group=alert_group, channel=slack_channel) + slack_message.set_active_update_task_id("original-task-id") + + update_alert_group_slack_message.apply((slack_message.pk,), task_id="different-task-id") + + # Ensure that SlackClient.chat_update is not called + mock_chat_update.assert_not_called() + + @patch("apps.slack.tasks.SlackClient.chat_update") + @pytest.mark.django_db + def test_update_alert_group_slack_message_no_alert_group( + self, + mock_chat_update, + make_organization_with_slack_team_identity, + make_slack_channel, + make_slack_message, + ): + """ + Test that the task exits early if SlackMessage has no alert_group. + """ + organization, slack_team_identity = make_organization_with_slack_team_identity() + slack_channel = make_slack_channel(slack_team_identity) + slack_message = make_slack_message(alert_group=None, channel=slack_channel, organization=organization) + + update_alert_group_slack_message.apply((slack_message.pk,), task_id="task-id") + + # Ensure that SlackClient.chat_update is not called + mock_chat_update.assert_not_called() + + @patch("apps.slack.tasks.SlackClient.chat_update") + @pytest.mark.django_db + def test_update_alert_group_slack_message_skip_escalation_in_slack( + self, + mock_chat_update, + make_organization_with_slack_team_identity, + make_alert_receive_channel, + make_slack_channel, + make_slack_message, + make_alert_group, + make_alert, + ): + """ + Test that the task exits early if alert_group.skip_escalation_in_slack is True. + """ + organization, slack_team_identity = make_organization_with_slack_team_identity() + alert_receive_channel = make_alert_receive_channel(organization) + alert_group = make_alert_group( + alert_receive_channel, + reason_to_skip_escalation=AlertGroup.CHANNEL_ARCHIVED, + ) + make_alert(alert_group=alert_group, raw_request_data={}) + slack_channel = make_slack_channel(slack_team_identity) + + slack_message = make_slack_message(alert_group=alert_group, channel=slack_channel) + slack_message.set_active_update_task_id("task-id") + + # Ensure skip_escalation_in_slack is True + assert alert_group.skip_escalation_in_slack is True + + update_alert_group_slack_message.apply((slack_message.pk,), task_id="task-id") + + # Ensure that SlackClient.chat_update is not called + mock_chat_update.assert_not_called() + + # Verify that the active update task ID is not cleared and last_updated is not set + slack_message.refresh_from_db() + assert slack_message.get_active_update_task_id() == "task-id" + assert slack_message.last_updated is None + + @patch("apps.slack.tasks.SlackClient.chat_update") + @pytest.mark.django_db + def test_update_alert_group_slack_message_alert_receive_channel_rate_limited( + self, + mock_chat_update, + make_organization_with_slack_team_identity, + make_alert_receive_channel, + make_slack_channel, + make_slack_message, + make_alert_group, + make_alert, + ): + """ + Test that the task exits early if alert_receive_channel.is_rate_limited_in_slack is True. + """ + organization, slack_team_identity = make_organization_with_slack_team_identity() + alert_receive_channel = make_alert_receive_channel( + organization, + rate_limited_in_slack_at=timezone.now(), + ) + alert_group = make_alert_group(alert_receive_channel) + make_alert(alert_group=alert_group, raw_request_data={}) + slack_channel = make_slack_channel(slack_team_identity) + + slack_message = make_slack_message(alert_group=alert_group, channel=slack_channel) + slack_message.set_active_update_task_id("task-id") + + # Ensure is_rate_limited_in_slack is True + assert alert_receive_channel.is_rate_limited_in_slack is True + + update_alert_group_slack_message.apply((slack_message.pk,), task_id="task-id") + + # Ensure that SlackClient.chat_update is not called + mock_chat_update.assert_not_called() + + # Verify that the active update task ID is not cleared and last_updated is not set + slack_message.refresh_from_db() + assert slack_message.get_active_update_task_id() == "task-id" + assert slack_message.last_updated is None + + @patch("apps.slack.tasks.SlackClient.chat_update") + @pytest.mark.django_db + def test_update_alert_group_slack_message_successful( + self, + mock_chat_update, + make_organization_with_slack_team_identity, + make_alert_receive_channel, + make_slack_channel, + make_slack_message, + make_alert_group, + make_alert, + ): + """ + Test that the task successfully updates the alert group's Slack message. + """ + organization, slack_team_identity = make_organization_with_slack_team_identity() + alert_receive_channel = make_alert_receive_channel(organization) + alert_group = make_alert_group(alert_receive_channel) + make_alert(alert_group=alert_group, raw_request_data={}) + slack_channel = make_slack_channel(slack_team_identity) + + slack_message = make_slack_message(alert_group=alert_group, channel=slack_channel) + slack_message.set_active_update_task_id("task-id") + + update_alert_group_slack_message.apply((slack_message.pk,), task_id="task-id") + + # Assert that SlackClient.chat_update was called with correct parameters + mock_chat_update.assert_called_once_with( + channel=slack_message._channel_id, + ts=slack_message.slack_id, + attachments=alert_group.render_slack_attachments(), + blocks=alert_group.render_slack_blocks(), + ) + + # Verify that cache ID is cleared from the cache and last_updated is set + slack_message.refresh_from_db() + assert slack_message.get_active_update_task_id() is None + assert slack_message.last_updated is not None + + @patch("apps.slack.tasks.SlackClient.chat_update") + @patch("apps.alerts.models.AlertReceiveChannel.start_send_rate_limit_message_task") + @pytest.mark.django_db + def test_update_alert_group_slack_message_ratelimit_error_not_maintenance( + self, + mock_start_send_rate_limit_message_task, + mock_chat_update, + mocked_rate_limited_slack_response, + make_organization_with_slack_team_identity, + make_alert_receive_channel, + make_slack_channel, + make_slack_message, + make_alert_group, + make_alert, + ): + """ + Test handling of SlackAPIRatelimitError when not a maintenance integration. + """ + organization, slack_team_identity = make_organization_with_slack_team_identity() + alert_receive_channel = make_alert_receive_channel(organization) + + # Ensure channel is not a maintenance integration and not already rate-limited + assert alert_receive_channel.is_maintenace_integration is False + assert alert_receive_channel.is_rate_limited_in_slack is False + + alert_group = make_alert_group(alert_receive_channel) + make_alert(alert_group=alert_group, raw_request_data={}) + slack_channel = make_slack_channel(slack_team_identity) + + slack_message = make_slack_message(alert_group=alert_group, channel=slack_channel) + slack_message.set_active_update_task_id("task-id") + + # SlackClient.chat_update raises SlackAPIRatelimitError + slack_api_ratelimit_error = SlackAPIRatelimitError(mocked_rate_limited_slack_response) + mock_chat_update.side_effect = slack_api_ratelimit_error + + update_alert_group_slack_message.apply((slack_message.pk,), task_id="task-id") + + # Assert that start_send_rate_limit_message_task was called + mock_start_send_rate_limit_message_task.assert_called_with("Updating", slack_api_ratelimit_error.retry_after) + + @patch("apps.slack.tasks.SlackClient.chat_update") + @patch("apps.alerts.models.AlertReceiveChannel.start_send_rate_limit_message_task") + @pytest.mark.django_db + def test_update_alert_group_slack_message_ratelimit_error_is_maintenance( + self, + mock_start_send_rate_limit_message_task, + mock_chat_update, + mocked_rate_limited_slack_response, + make_organization_with_slack_team_identity, + make_alert_receive_channel, + make_slack_channel, + make_slack_message, + make_alert_group, + make_alert, + ): + """ + Test that SlackAPIRatelimitError is re-raised when it is a maintenance integration. + """ + organization, slack_team_identity = make_organization_with_slack_team_identity() + alert_receive_channel = make_alert_receive_channel(organization, integration="maintenance") + + # Ensure channel is a maintenance integration and not already rate-limited + assert alert_receive_channel.is_maintenace_integration is True + assert alert_receive_channel.is_rate_limited_in_slack is False + + alert_group = make_alert_group(alert_receive_channel) + make_alert(alert_group=alert_group, raw_request_data={}) + slack_channel = make_slack_channel(slack_team_identity) + + slack_message = make_slack_message(alert_group=alert_group, channel=slack_channel) + slack_message.set_active_update_task_id("task-id") + + # SlackClient.chat_update raises SlackAPIRatelimitError + slack_api_ratelimit_error = SlackAPIRatelimitError(mocked_rate_limited_slack_response) + mock_chat_update.side_effect = slack_api_ratelimit_error + + update_alert_group_slack_message.apply((slack_message.pk,), task_id="task-id") + + slack_message.refresh_from_db() + + # Assert that start_send_rate_limit_message_task was not called, task id is not cleared, and we don't + # update last_updated + mock_start_send_rate_limit_message_task.assert_not_called() + assert slack_message.get_active_update_task_id() == "task-id" + assert slack_message.last_updated is None + + @patch("apps.slack.tasks.SlackClient.chat_update") + @patch("apps.alerts.models.AlertReceiveChannel.start_send_rate_limit_message_task") + @pytest.mark.parametrize( + "ExceptionClass", + [ + SlackAPIMessageNotFoundError, + SlackAPICantUpdateMessageError, + SlackAPIChannelInactiveError, + SlackAPITokenError, + SlackAPIChannelNotFoundError, + ], + ) + @pytest.mark.django_db + def test_update_alert_group_slack_message_other_exceptions( + self, + mock_start_send_rate_limit_message_task, + mock_chat_update, + ExceptionClass, + make_organization_with_slack_team_identity, + make_alert_receive_channel, + make_slack_channel, + make_slack_message, + make_alert_group, + make_alert, + ): + """ + Test that other Slack API exceptions are handled silently. + """ + organization, slack_team_identity = make_organization_with_slack_team_identity() + alert_receive_channel = make_alert_receive_channel(organization) + alert_group = make_alert_group(alert_receive_channel) + make_alert(alert_group=alert_group, raw_request_data={}) + slack_channel = make_slack_channel(slack_team_identity) + + slack_message = make_slack_message(alert_group=alert_group, channel=slack_channel) + slack_message.set_active_update_task_id("task-id") + + # SlackClient.chat_update raises the exception class + mock_chat_update.side_effect = ExceptionClass("foo bar") + + # Call the task + update_alert_group_slack_message.apply((slack_message.pk,), task_id="task-id") + + # Ensure that exception was caught and passed + # SlackClient.chat_update was called + mock_chat_update.assert_called_once() + + # Assert that start_send_rate_limit_message_task was not called + mock_start_send_rate_limit_message_task.assert_not_called() + + # Verify that cache ID is cleared from the cache and last_updated is set + slack_message.refresh_from_db() + assert slack_message.get_active_update_task_id() is None + assert slack_message.last_updated is not None + + @patch("apps.slack.tasks.SlackClient.chat_update") + @pytest.mark.django_db + def test_update_alert_group_slack_message_unexpected_exception( + self, + mock_chat_update, + make_organization_with_slack_team_identity, + make_alert_receive_channel, + make_slack_channel, + make_slack_message, + make_alert_group, + make_alert, + ): + """ + Test that an unexpected exception propagates as expected. + """ + organization, slack_team_identity = make_organization_with_slack_team_identity() + alert_receive_channel = make_alert_receive_channel(organization) + alert_group = make_alert_group(alert_receive_channel) + make_alert(alert_group=alert_group, raw_request_data={}) + + slack_channel = make_slack_channel(slack_team_identity) + slack_message = make_slack_message(alert_group=alert_group, channel=slack_channel) + slack_message.set_active_update_task_id("task-id") + + # SlackClient.chat_update raises a generic exception + mock_chat_update.side_effect = ValueError("Unexpected error") + + update_alert_group_slack_message.apply((slack_message.pk,), task_id="task-id") + + # Assert that task id is not cleared, and we don't update last_updated + assert slack_message.get_active_update_task_id() == "task-id" + assert slack_message.last_updated is None diff --git a/engine/apps/slack/tests/test_slack_message.py b/engine/apps/slack/tests/test_slack_message.py index 61419a1b27..5a766654f2 100644 --- a/engine/apps/slack/tests/test_slack_message.py +++ b/engine/apps/slack/tests/test_slack_message.py @@ -1,3 +1,4 @@ +from datetime import timedelta from unittest.mock import patch import pytest @@ -6,6 +7,7 @@ from apps.base.models import UserNotificationPolicy, UserNotificationPolicyLogRecord from apps.slack.client import SlackClient from apps.slack.errors import SlackAPIError +from apps.slack.models import SlackMessage from apps.slack.tests.conftest import build_slack_response @@ -28,6 +30,59 @@ def _slack_message_setup(cached_permalink): return _slack_message_setup +@patch.object( + SlackClient, + "chat_getPermalink", + return_value=build_slack_response({"ok": True, "permalink": "test_permalink"}), +) +@pytest.mark.django_db +def test_slack_message_permalink(mock_slack_api_call, slack_message_setup): + slack_message = slack_message_setup(cached_permalink=None) + assert slack_message.permalink == "test_permalink" + mock_slack_api_call.assert_called_once() + + +@patch.object( + SlackClient, + "chat_getPermalink", + side_effect=SlackAPIError(response=build_slack_response({"ok": False, "error": "message_not_found"})), +) +@pytest.mark.django_db +def test_slack_message_permalink_error(mock_slack_api_call, slack_message_setup): + slack_message = slack_message_setup(cached_permalink=None) + assert slack_message.permalink is None + mock_slack_api_call.assert_called_once() + + +@patch.object( + SlackClient, + "chat_getPermalink", + return_value=build_slack_response({"ok": True, "permalink": "test_permalink"}), +) +@pytest.mark.django_db +def test_slack_message_permalink_cache(mock_slack_api_call, slack_message_setup): + slack_message = slack_message_setup(cached_permalink="cached_permalink") + assert slack_message.permalink == "cached_permalink" + mock_slack_api_call.assert_not_called() + + +@patch.object( + SlackClient, + "chat_getPermalink", + return_value=build_slack_response({"ok": False, "error": "account_inactive"}), +) +@pytest.mark.django_db +def test_slack_message_permalink_token_revoked(mock_slack_api_call, slack_message_setup): + slack_message = slack_message_setup(cached_permalink=None) + slack_message.slack_team_identity.detected_token_revoked = timezone.now() + slack_message.slack_team_identity.save() + + assert slack_message.slack_team_identity is not None + assert slack_message.permalink is None + + mock_slack_api_call.assert_not_called() + + @pytest.mark.django_db def test_send_slack_notification( make_organization_and_user_with_slack_identities, @@ -86,54 +141,230 @@ def test_slack_message_deep_link( assert slack_message.deep_link == expected -@patch.object( - SlackClient, - "chat_getPermalink", - return_value=build_slack_response({"ok": True, "permalink": "test_permalink"}), -) -@pytest.mark.django_db -def test_slack_message_permalink(mock_slack_api_call, slack_message_setup): - slack_message = slack_message_setup(cached_permalink=None) - assert slack_message.permalink == "test_permalink" - mock_slack_api_call.assert_called_once() +class TestSlackMessageUpdateAlertGroupsMessage: + @patch("apps.slack.models.slack_message.update_alert_group_slack_message") + @pytest.mark.django_db + def test_update_alert_groups_message_no_alert_group( + self, + mock_update_alert_group_slack_message, + make_organization_with_slack_team_identity, + make_slack_channel, + make_slack_message, + ): + """ + Test that the method exits early if alert_group is None. + """ + organization, slack_team_identity = make_organization_with_slack_team_identity() + slack_channel = make_slack_channel(slack_team_identity) + slack_message = make_slack_message(channel=slack_channel, alert_group=None, organization=organization) + slack_message.update_alert_groups_message(debounce=True) -@patch.object( - SlackClient, - "chat_getPermalink", - side_effect=SlackAPIError(response=build_slack_response({"ok": False, "error": "message_not_found"})), -) -@pytest.mark.django_db -def test_slack_message_permalink_error(mock_slack_api_call, slack_message_setup): - slack_message = slack_message_setup(cached_permalink=None) - assert slack_message.permalink is None - mock_slack_api_call.assert_called_once() + # Ensure no task is scheduled + mock_update_alert_group_slack_message.apply_async.assert_not_called() + @patch("apps.slack.models.slack_message.update_alert_group_slack_message") + @pytest.mark.django_db + def test_update_alert_groups_message_active_task_exists( + self, + mock_update_alert_group_slack_message, + make_organization_with_slack_team_identity, + make_alert_receive_channel, + make_alert_group, + make_slack_channel, + make_slack_message, + ): + """ + Test that the method exits early if a task ID is set in the cache and debounce is True. + """ + task_id = "some-task-id" -@patch.object( - SlackClient, - "chat_getPermalink", - return_value=build_slack_response({"ok": True, "permalink": "test_permalink"}), -) -@pytest.mark.django_db -def test_slack_message_permalink_cache(mock_slack_api_call, slack_message_setup): - slack_message = slack_message_setup(cached_permalink="cached_permalink") - assert slack_message.permalink == "cached_permalink" - mock_slack_api_call.assert_not_called() + organization, slack_team_identity = make_organization_with_slack_team_identity() + alert_receive_channel = make_alert_receive_channel(organization) + alert_group = make_alert_group(alert_receive_channel) + slack_channel = make_slack_channel(slack_team_identity) + slack_message = make_slack_message(channel=slack_channel, alert_group=alert_group) + slack_message.set_active_update_task_id(task_id) -@patch.object( - SlackClient, - "chat_getPermalink", - return_value=build_slack_response({"ok": False, "error": "account_inactive"}), -) -@pytest.mark.django_db -def test_slack_message_permalink_token_revoked(mock_slack_api_call, slack_message_setup): - slack_message = slack_message_setup(cached_permalink=None) - slack_message.slack_team_identity.detected_token_revoked = timezone.now() - slack_message.slack_team_identity.save() + slack_message.update_alert_groups_message(debounce=True) - assert slack_message.slack_team_identity is not None - assert slack_message.permalink is None + # Ensure no task is scheduled + mock_update_alert_group_slack_message.apply_async.assert_not_called() - mock_slack_api_call.assert_not_called() + # Ensure task ID in the cache remains unchanged + assert slack_message.get_active_update_task_id() == task_id + + @patch("apps.slack.models.slack_message.celery_uuid") + @patch("apps.slack.models.slack_message.update_alert_group_slack_message") + @pytest.mark.django_db + def test_update_alert_groups_message_last_updated_none( + self, + mock_update_alert_group_slack_message, + mock_celery_uuid, + make_organization_with_slack_team_identity, + make_alert_receive_channel, + make_slack_channel, + make_slack_message, + make_alert_group, + ): + """ + Test that the method handles last_updated being None and schedules with default debounce interval. + """ + task_id = "some-task-id" + mock_celery_uuid.return_value = task_id + + organization, slack_team_identity = make_organization_with_slack_team_identity() + alert_receive_channel = make_alert_receive_channel(organization) + alert_group = make_alert_group(alert_receive_channel) + + slack_channel = make_slack_channel(slack_team_identity) + slack_message = make_slack_message(channel=slack_channel, alert_group=alert_group, last_updated=None) + + assert slack_message.get_active_update_task_id() is None + + slack_message.update_alert_groups_message(debounce=True) + + # Verify that apply_async was called with correct countdown + mock_update_alert_group_slack_message.apply_async.assert_called_once_with( + (slack_message.pk,), + countdown=SlackMessage.ALERT_GROUP_UPDATE_DEBOUNCE_INTERVAL_SECONDS, + task_id=task_id, + ) + + # Verify task ID is set in the cache + assert slack_message.get_active_update_task_id() == task_id + + @patch("apps.slack.models.slack_message.celery_uuid") + @patch("apps.slack.models.slack_message.update_alert_group_slack_message") + @pytest.mark.django_db + def test_update_alert_groups_message_schedules_task_correctly( + self, + mock_update_alert_group_slack_message, + mock_celery_uuid, + make_organization_with_slack_team_identity, + make_alert_receive_channel, + make_slack_channel, + make_slack_message, + make_alert_group, + ): + """ + Test that the method schedules the task with correct countdown and updates the task ID in the cache + """ + task_id = "some-task-id" + mock_celery_uuid.return_value = task_id + + organization, slack_team_identity = make_organization_with_slack_team_identity() + alert_receive_channel = make_alert_receive_channel(organization) + alert_group = make_alert_group(alert_receive_channel) + + slack_channel = make_slack_channel(slack_team_identity) + slack_message = make_slack_message( + channel=slack_channel, + alert_group=alert_group, + last_updated=timezone.now() - timedelta(seconds=10), + ) + + assert slack_message.get_active_update_task_id() is None + + slack_message.update_alert_groups_message(debounce=True) + + # Verify that apply_async was called with correct countdown + mock_update_alert_group_slack_message.apply_async.assert_called_once_with( + (slack_message.pk,), + countdown=35, + task_id=task_id, + ) + + # Verify the task ID in the cache is updated to new task_id + slack_message.refresh_from_db() + assert slack_message.get_active_update_task_id() == task_id + + @patch("apps.slack.models.slack_message.celery_uuid") + @patch("apps.slack.models.slack_message.update_alert_group_slack_message") + @pytest.mark.django_db + def test_update_alert_groups_message_handles_minimum_countdown( + self, + mock_update_alert_group_slack_message, + mock_celery_uuid, + make_organization_with_slack_team_identity, + make_alert_receive_channel, + make_slack_channel, + make_slack_message, + make_alert_group, + ): + """ + Test that the countdown is at least 10 seconds when the debounce interval has passed. + """ + task_id = "some-task-id" + mock_celery_uuid.return_value = task_id + + organization, slack_team_identity = make_organization_with_slack_team_identity() + alert_receive_channel = make_alert_receive_channel(organization) + alert_group = make_alert_group(alert_receive_channel) + + slack_channel = make_slack_channel(slack_team_identity) + slack_message = make_slack_message( + channel=slack_channel, + alert_group=alert_group, + last_updated=timezone.now() + - timedelta(seconds=SlackMessage.ALERT_GROUP_UPDATE_DEBOUNCE_INTERVAL_SECONDS + 1), + ) + + assert slack_message.get_active_update_task_id() is None + + slack_message.update_alert_groups_message(debounce=True) + + # Verify that apply_async was called with correct countdown + mock_update_alert_group_slack_message.apply_async.assert_called_once_with( + (slack_message.pk,), + # Since the time since last update exceeds the debounce interval, countdown should be 10 + countdown=10, + task_id=task_id, + ) + + # Verify the task ID in the cache is updated to new task_id + slack_message.refresh_from_db() + assert slack_message.get_active_update_task_id() == task_id + + @patch("apps.slack.models.slack_message.celery_uuid") + @patch("apps.slack.models.slack_message.update_alert_group_slack_message") + @pytest.mark.django_db + def test_update_alert_groups_message_debounce_false_schedules_immediately( + self, + mock_update_alert_group_slack_message, + mock_celery_uuid, + make_organization_with_slack_team_identity, + make_alert_receive_channel, + make_slack_channel, + make_slack_message, + make_alert_group, + ): + """ + Test that when debounce is False, the task is scheduled immediately with countdown=0, + even if a task ID is set in the cache. + """ + new_task_id = "new-task-id" + mock_celery_uuid.return_value = new_task_id + + organization, slack_team_identity = make_organization_with_slack_team_identity() + alert_receive_channel = make_alert_receive_channel(organization) + alert_group = make_alert_group(alert_receive_channel) + slack_channel = make_slack_channel(slack_team_identity) + + # Set up SlackMessage with existing task ID in the cache + slack_message = make_slack_message(channel=slack_channel, alert_group=alert_group) + slack_message.set_active_update_task_id("existing-task-id") + + slack_message.update_alert_groups_message(debounce=False) + + # Verify that apply_async was called with countdown=0 + mock_update_alert_group_slack_message.apply_async.assert_called_once_with( + (slack_message.pk,), + countdown=0, + task_id=new_task_id, + ) + + # Verify the task ID in the cache is updated to new task_id + slack_message.refresh_from_db() + assert slack_message.get_active_update_task_id() == new_task_id diff --git a/engine/apps/slack/utils.py b/engine/apps/slack/utils.py index 8b3515ba30..300b20f01e 100644 --- a/engine/apps/slack/utils.py +++ b/engine/apps/slack/utils.py @@ -103,10 +103,5 @@ def format_datetime_to_slack_with_time(timestamp: float, format: SlackDateFormat return _format_datetime_to_slack(timestamp, f"{{{format}}} {{time}}") -def get_cache_key_update_incident_slack_message(alert_group_pk: str) -> str: - CACHE_KEY_PREFIX = "update_incident_slack_message" - return f"{CACHE_KEY_PREFIX}_{alert_group_pk}" - - def get_populate_slack_channel_task_id_key(slack_team_identity_id: str) -> str: return f"SLACK_CHANNELS_TASK_ID_TEAM_{slack_team_identity_id}" diff --git a/engine/settings/celery_task_routes.py b/engine/settings/celery_task_routes.py index 7ef62121dd..08f42631a4 100644 --- a/engine/settings/celery_task_routes.py +++ b/engine/settings/celery_task_routes.py @@ -170,7 +170,9 @@ "apps.slack.tasks.send_message_to_thread_if_bot_not_in_channel": {"queue": "slack"}, "apps.slack.tasks.start_update_slack_user_group_for_schedules": {"queue": "slack"}, "apps.slack.tasks.unpopulate_slack_user_identities": {"queue": "slack"}, + # TODO: remove apps.slack.tasks.update_incident_slack_message after current tasks in queue have been processed "apps.slack.tasks.update_incident_slack_message": {"queue": "slack"}, + "apps.slack.tasks.update_alert_group_slack_message": {"queue": "slack"}, "apps.slack.tasks.update_slack_user_group_for_schedules": {"queue": "slack"}, "apps.slack.representatives.alert_group_representative.on_create_alert_slack_representative_async": { "queue": "slack"