Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix audit/unenroll calls when agent runs fleet-server #6085

Merged
merged 26 commits into from
Dec 17, 2024
Merged
Show file tree
Hide file tree
Changes from 18 commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
4fbee98
Fix audit/unenroll calls when agent runs fleet-server
michel-laterman Nov 19, 2024
8b036d7
Fix integration test
michel-laterman Nov 20, 2024
1bcf7f7
Increase output for failed test
michel-laterman Nov 20, 2024
9950ed5
move notification attempt
michel-laterman Nov 20, 2024
1b0f34c
move notification attempt
michel-laterman Nov 20, 2024
d8ef38e
Merge branch 'main' into fix-audit-unenroll-fleet-server
michel-laterman Nov 21, 2024
c349511
audit/uninstall will call localhost with certonly tls
michel-laterman Nov 21, 2024
9da083e
Fix linter
michel-laterman Nov 22, 2024
3710448
remove tls setting
michel-laterman Nov 29, 2024
7d68d68
Add PR link to changelog
michel-laterman Nov 29, 2024
1de9ae4
Merge branch 'main' into fix-audit-unenroll-fleet-server
michel-laterman Dec 2, 2024
e5cff30
stop checking if we need to notify in an error is encountered
michel-laterman Dec 2, 2024
63110ce
Merge branch 'main' into fix-audit-unenroll-fleet-server
michel-laterman Dec 3, 2024
79b3645
Merge branch 'main' into fix-audit-unenroll-fleet-server
michel-laterman Dec 4, 2024
8804ab8
Shorten non-fatal error messages to prevent output issues in terminals
michel-laterman Dec 9, 2024
62bb765
Don't load agentinfo, use ID value from config instead
michel-laterman Dec 9, 2024
933d6c8
Add unprivilged tests to audit unenroll integration tests
michel-laterman Dec 10, 2024
8596b93
Merge branch 'main' into fix-audit-unenroll-fleet-server
michel-laterman Dec 10, 2024
5913866
Add comments for setting hosts
michel-laterman Dec 11, 2024
71c9cf5
Merge remote-tracking branch 'origin/main' into fix-audit-unenroll-fl…
michel-laterman Dec 11, 2024
5b3fdbf
Merge branch 'main' into fix-audit-unenroll-fleet-server
michel-laterman Dec 12, 2024
8cc461b
Merge branch 'main' into fix-audit-unenroll-fleet-server
michel-laterman Dec 12, 2024
60ba11c
Merge branch 'main' into fix-audit-unenroll-fleet-server
michel-laterman Dec 13, 2024
6615db5
Merge branch 'main' into fix-audit-unenroll-fleet-server
michel-laterman Dec 13, 2024
f01bc6a
Merge branch 'main' into fix-audit-unenroll-fleet-server
michel-laterman Dec 16, 2024
0ad9069
Merge branch 'main' into fix-audit-unenroll-fleet-server
michel-laterman Dec 17, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# Kind can be one of:
# - breaking-change: a change to previously-documented behavior
# - deprecation: functionality that is being removed in a later release
# - bug-fix: fixes a problem in a previous version
# - enhancement: extends functionality but does not break or fix existing behavior
# - feature: new functionality
# - known-issue: problems that we are aware of in a given version
# - security: impacts on the security of a product or a user’s deployment.
# - upgrade: important information for someone upgrading from a prior version
# - other: does not fit into any of the other categories
kind: bug-fix

# Change summary; a 80ish characters long description of the change.
summary: Fix audit/unenroll call when running fleet-server

# Long description; in case the summary is not enough to describe the change
# this field accommodate a description without length limits.
# NOTE: This field will be rendered only for breaking-change and known-issue kinds at the moment.
description: Fix the call to the audit/unenroll endpoint that occurs on uninstall when the fleet-server is running locally.

# Affected component; usually one of "elastic-agent", "fleet-server", "filebeat", "metricbeat", "auditbeat", "all", etc.
component: fleet-server

# PR URL; optional; the PR number that added the changeset.
# If not present is automatically filled by the tooling finding the PR where this changelog fragment has been added.
# NOTE: the tooling supports backports, so it's able to fill the original PR number instead of the backport PR number.
# Please provide it if you are adding a fragment for a different PR.
pr: https://github.com/elastic/elastic-agent/pull/6085

# Issue URL; optional; the GitHub issue related to this changeset (either closes or is part of).
# If not present is automatically filled by the tooling with the issue linked to the PR number.
issue: https://github.com/elastic/elastic-agent/issues/5752
82 changes: 51 additions & 31 deletions internal/pkg/agent/install/uninstall.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@ import (
"github.com/schollz/progressbar/v3"

"github.com/elastic/elastic-agent-libs/logp"
"github.com/elastic/elastic-agent/internal/pkg/agent/application/info"
"github.com/elastic/elastic-agent/internal/pkg/agent/application/paths"
"github.com/elastic/elastic-agent/internal/pkg/agent/application/secret"
"github.com/elastic/elastic-agent/internal/pkg/agent/configuration"
Expand Down Expand Up @@ -48,6 +47,13 @@ var (
fleetAuditWaitMax = time.Second * 10
)

// agentInfo is a custom type that implements the fleetapi.AgentInfo interface
type agentInfo string

func (a *agentInfo) AgentID() string {
return string(*a)
}

// Uninstall uninstalls persistently Elastic Agent on the system.
func Uninstall(ctx context.Context, cfgFile, topPath, uninstallToken string, log *logp.Logger, pt *progressbar.ProgressBar) error {
cwd, err := os.Getwd()
Expand All @@ -59,6 +65,47 @@ func Uninstall(ctx context.Context, cfgFile, topPath, uninstallToken string, log
return fmt.Errorf("uninstall must be run from outside the installed path '%s'", topPath)
}

// check if the agent was installed using --unprivileged by checking the file vault for the agent secret (needed on darwin to correctly load the vault)
unprivileged, err := checkForUnprivilegedVault(ctx)
cmacknz marked this conversation as resolved.
Show resolved Hide resolved
if err != nil {
return fmt.Errorf("error checking for unprivileged vault: %w", err)
}

// will only notify fleet of the uninstall command if it can gather config and agentinfo, and is not a stand-alone install
localFleet := false
notifyFleet := false
var agentID agentInfo
var cfg *configuration.Configuration
func() { // check if we need to notify in a func to allow us to return early if a (non-fatal) error is encountered.
// read local config
c, err := operations.LoadFullAgentConfig(ctx, log, cfgFile, false, unprivileged)
if err != nil {
pt.Describe("notify Fleet failed: unable to read config")
return
}
cfg, err = configuration.NewFromConfig(c)
if err != nil {
pt.Describe("notify Fleet failed: error transforming config")
return
}

if cfg != nil && !configuration.IsStandalone(cfg.Fleet) {
agentID = agentInfo(cfg.Settings.ID)
notifyFleet = true
if cfg.Fleet != nil && cfg.Fleet.Server != nil {
localFleet = true
}
}
}()

// Skip on Windows because of https://github.com/elastic/elastic-agent/issues/5952
swiatekm marked this conversation as resolved.
Show resolved Hide resolved
// Once the root-cause is identified then this can be re-enabled on Windows.
// Notify fleet-server while it is still running if it's running locally
if notifyFleet && localFleet && runtime.GOOS != "windows" {
cfg.Fleet.Client.Hosts = []string{cfg.Fleet.Client.Host}
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure what's going on here; but Client.Hosts had the remote (cloud) fleet-server address, and Client.Host was the internal port.
I'm going to take some time tomorrow to verify that our normal agent operations still use the internal port when the agent is running fleet-server.

Copy link
Contributor Author

@michel-laterman michel-laterman Nov 22, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

agent does connect to localhost:8221; but strangely enough fleet-server does not populate the server.address attribute in HTTP logs; i'll open a fleet-server PR to fix this
pr here: elastic/fleet-server#4142

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

cfg.Fleet.Client.Hosts = []string{cfg.Fleet.Client.Host} is a bit weird. This is perhaps something else that could be encapsulated into the output of an RPC call to the running agent service since agent itself must know that these represent two different things.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should avoid using an RPC call because we don't want to introduce the requirement that an agent is running for the uninstall to succeed.

Copy link
Member

@pchila pchila Dec 11, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What I think is happening here:

I guess that the merged result of that is not quite right in this scenario and we end up with the discrepancy between Host and Hosts

From discussion with @nchaulet at the time, it was determined that Fleet will send back only Hosts field while the separate Host, Protocol, Path is only handled by agent when initializing config to store command line args. This refactor has been done in context of mTLS from policy support PR as an attempt to clean up the policy change handler.

I filed an issue to have a look and possible remote Host, Protocol and Path fields from the agent config at the time but maybe this needs further evaluation/discussion taking into account fleet bootstrap scenarios

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we link to that issue and add this explanation to the code? This is going to be really not obvious to someone reading this code without the context of this PR.

notifyFleetAuditUninstall(ctx, log, pt, cfg, &agentID) //nolint:errcheck // ignore the error as we can't act on it
}

// ensure service is stopped
status, err := EnsureStoppedService(topPath, pt)
if err != nil {
Expand All @@ -71,12 +118,6 @@ func Uninstall(ctx context.Context, cfgFile, topPath, uninstallToken string, log
return fmt.Errorf("failed trying to kill any running watcher: %w", err)
}

// check if the agent was installed using --unprivileged by checking the file vault for the agent secret (needed on darwin to correctly load the vault)
unprivileged, err := checkForUnprivilegedVault(ctx)
if err != nil {
return fmt.Errorf("error checking for unprivileged vault: %w", err)
}

// Uninstall components first
if err := uninstallComponents(ctx, cfgFile, uninstallToken, log, pt, unprivileged); err != nil {
// If service status was running it was stopped to uninstall the components.
Expand Down Expand Up @@ -111,27 +152,6 @@ func Uninstall(ctx context.Context, cfgFile, topPath, uninstallToken string, log
}
}

// will only notify fleet of the uninstall command if it can gather config and agentinfo, and is not a stand-alone install
notifyFleet := false
var ai *info.AgentInfo
c, err := operations.LoadFullAgentConfig(ctx, log, cfgFile, false, unprivileged)
if err != nil {
pt.Describe(fmt.Sprintf("unable to read agent config to determine if notifying Fleet is needed: %v", err))
}
cfg, err := configuration.NewFromConfig(c)
if err != nil {
pt.Describe(fmt.Sprintf("notify Fleet: unable to transform *config.Config to *configuration.Configuration: %v", err))
}

if cfg != nil && !configuration.IsStandalone(cfg.Fleet) {
ai, err = info.NewAgentInfo(ctx, false)
if err != nil {
pt.Describe(fmt.Sprintf("unable to read agent info, Fleet will not be notified of uninstall: %v", err))
} else {
notifyFleet = true
}
}

// remove existing directory
pt.Describe("Removing install directory")
err = RemovePath(topPath)
Expand All @@ -146,8 +166,8 @@ func Uninstall(ctx context.Context, cfgFile, topPath, uninstallToken string, log

// Skip on Windows because of https://github.com/elastic/elastic-agent/issues/5952
// Once the root-cause is identified then this can be re-enabled on Windows.
if notifyFleet && runtime.GOOS != "windows" {
notifyFleetAuditUninstall(ctx, log, pt, cfg, ai) //nolint:errcheck // ignore the error as we can't act on it
if notifyFleet && !localFleet && runtime.GOOS != "windows" {
notifyFleetAuditUninstall(ctx, log, pt, cfg, &agentID) //nolint:errcheck // ignore the error as we can't act on it
}

return nil
Expand All @@ -156,7 +176,7 @@ func Uninstall(ctx context.Context, cfgFile, topPath, uninstallToken string, log
// notifyFleetAuditUninstall will attempt to notify fleet-server of the agent's uninstall.
//
// There are retries for the attempt after a 10s wait, but it is a best-effort approach.
func notifyFleetAuditUninstall(ctx context.Context, log *logp.Logger, pt *progressbar.ProgressBar, cfg *configuration.Configuration, ai *info.AgentInfo) error {
func notifyFleetAuditUninstall(ctx context.Context, log *logp.Logger, pt *progressbar.ProgressBar, cfg *configuration.Configuration, ai fleetapi.AgentInfo) error {
ctx, cancel := context.WithCancel(ctx)
defer cancel()
pt.Describe("Attempting to notify Fleet of uninstall")
Expand Down
7 changes: 3 additions & 4 deletions internal/pkg/agent/install/uninstall_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@ import (
"go.uber.org/zap"

"github.com/elastic/elastic-agent-libs/logp"
"github.com/elastic/elastic-agent/internal/pkg/agent/application/info"
"github.com/elastic/elastic-agent/internal/pkg/agent/application/paths"
"github.com/elastic/elastic-agent/internal/pkg/agent/application/secret"
"github.com/elastic/elastic-agent/internal/pkg/agent/configuration"
Expand Down Expand Up @@ -178,7 +177,7 @@ func TestNotifyFleetAuditUnenroll(t *testing.T) {

log, _ := logp.NewInMemory("test", zap.NewDevelopmentEncoderConfig())
pt := progressbar.NewOptions(-1, progressbar.OptionSetWriter(io.Discard))
ai := &info.AgentInfo{}
var agentID agentInfo = "testID"

for _, tc := range tests {
t.Run(tc.name, func(t *testing.T) {
Expand All @@ -194,7 +193,7 @@ func TestNotifyFleetAuditUnenroll(t *testing.T) {
},
},
}
err := notifyFleetAuditUninstall(context.Background(), log, pt, cfg, ai)
err := notifyFleetAuditUninstall(context.Background(), log, pt, cfg, &agentID)
if tc.err == nil {
assert.NoError(t, err)
} else {
Expand Down Expand Up @@ -222,7 +221,7 @@ func TestNotifyFleetAuditUnenroll(t *testing.T) {
},
},
}
err := notifyFleetAuditUninstall(context.Background(), log, pt, cfg, ai)
err := notifyFleetAuditUninstall(context.Background(), log, pt, cfg, &agentID)
assert.EqualError(t, err, "notify Fleet: failed")

})
Expand Down
6 changes: 3 additions & 3 deletions internal/pkg/fleetapi/ack_cmd.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ const ackPath = "/api/fleet/agents/%s/acks"
type AckEvent struct {
EventType string `json:"type"` // 'STATE' | 'ERROR' | 'ACTION_RESULT' | 'ACTION'
SubType string `json:"subtype"` // 'RUNNING','STARTING','IN_PROGRESS','CONFIG','FAILED','STOPPING','STOPPED','DATA_DUMP','ACKNOWLEDGED','UNKNOWN';
Timestamp string `json:"timestamp"` // : '2019-01-05T14:32:03.36764-05:00',
Timestamp string `json:"timestamp"` // : '2019-01-05T14:32:03.36764-05:00'
ActionID string `json:"action_id"` // : '48cebde1-c906-4893-b89f-595d943b72a2',
AgentID string `json:"agent_id"` // : 'agent1',
Message string `json:"message,omitempty"` // : 'hello2',
Expand Down Expand Up @@ -84,11 +84,11 @@ func (e *AckResponse) Validate() error {
// AckCmd is a fleet API command.
type AckCmd struct {
client client.Sender
info agentInfo
info AgentInfo
}

// NewAckCmd creates a new api command.
func NewAckCmd(info agentInfo, client client.Sender) *AckCmd {
func NewAckCmd(info AgentInfo, client client.Sender) *AckCmd {
return &AckCmd{
client: client,
info: info,
Expand Down
4 changes: 2 additions & 2 deletions internal/pkg/fleetapi/audit_unenroll_cmd.go
Original file line number Diff line number Diff line change
Expand Up @@ -58,10 +58,10 @@ func (e *AuditUnenrollRequest) Validate() error {

type AuditUnenrollCmd struct {
client client.Sender
info agentInfo
info AgentInfo
}

func NewAuditUnenrollCmd(info agentInfo, client client.Sender) *AuditUnenrollCmd {
func NewAuditUnenrollCmd(info AgentInfo, client client.Sender) *AuditUnenrollCmd {
return &AuditUnenrollCmd{
client: client,
info: info,
Expand Down
6 changes: 3 additions & 3 deletions internal/pkg/fleetapi/checkin_cmd.go
Original file line number Diff line number Diff line change
Expand Up @@ -85,15 +85,15 @@ func (e *CheckinResponse) Validate() error {
// CheckinCmd is a fleet API command.
type CheckinCmd struct {
client client.Sender
info agentInfo
info AgentInfo
}

type agentInfo interface {
type AgentInfo interface {
AgentID() string
}

// NewCheckinCmd creates a new api command.
func NewCheckinCmd(info agentInfo, client client.Sender) *CheckinCmd {
func NewCheckinCmd(info AgentInfo, client client.Sender) *CheckinCmd {
return &CheckinCmd{
client: client,
info: info,
Expand Down
Loading
Loading