Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix audit/unenroll calls when agent runs fleet-server #6085

Merged
merged 26 commits into from
Dec 17, 2024
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
4fbee98
Fix audit/unenroll calls when agent runs fleet-server
michel-laterman Nov 19, 2024
8b036d7
Fix integration test
michel-laterman Nov 20, 2024
1bcf7f7
Increase output for failed test
michel-laterman Nov 20, 2024
9950ed5
move notification attempt
michel-laterman Nov 20, 2024
1b0f34c
move notification attempt
michel-laterman Nov 20, 2024
d8ef38e
Merge branch 'main' into fix-audit-unenroll-fleet-server
michel-laterman Nov 21, 2024
c349511
audit/uninstall will call localhost with certonly tls
michel-laterman Nov 21, 2024
9da083e
Fix linter
michel-laterman Nov 22, 2024
3710448
remove tls setting
michel-laterman Nov 29, 2024
7d68d68
Add PR link to changelog
michel-laterman Nov 29, 2024
1de9ae4
Merge branch 'main' into fix-audit-unenroll-fleet-server
michel-laterman Dec 2, 2024
e5cff30
stop checking if we need to notify in an error is encountered
michel-laterman Dec 2, 2024
63110ce
Merge branch 'main' into fix-audit-unenroll-fleet-server
michel-laterman Dec 3, 2024
79b3645
Merge branch 'main' into fix-audit-unenroll-fleet-server
michel-laterman Dec 4, 2024
8804ab8
Shorten non-fatal error messages to prevent output issues in terminals
michel-laterman Dec 9, 2024
62bb765
Don't load agentinfo, use ID value from config instead
michel-laterman Dec 9, 2024
933d6c8
Add unprivilged tests to audit unenroll integration tests
michel-laterman Dec 10, 2024
8596b93
Merge branch 'main' into fix-audit-unenroll-fleet-server
michel-laterman Dec 10, 2024
5913866
Add comments for setting hosts
michel-laterman Dec 11, 2024
71c9cf5
Merge remote-tracking branch 'origin/main' into fix-audit-unenroll-fl…
michel-laterman Dec 11, 2024
5b3fdbf
Merge branch 'main' into fix-audit-unenroll-fleet-server
michel-laterman Dec 12, 2024
8cc461b
Merge branch 'main' into fix-audit-unenroll-fleet-server
michel-laterman Dec 12, 2024
60ba11c
Merge branch 'main' into fix-audit-unenroll-fleet-server
michel-laterman Dec 13, 2024
6615db5
Merge branch 'main' into fix-audit-unenroll-fleet-server
michel-laterman Dec 13, 2024
f01bc6a
Merge branch 'main' into fix-audit-unenroll-fleet-server
michel-laterman Dec 16, 2024
0ad9069
Merge branch 'main' into fix-audit-unenroll-fleet-server
michel-laterman Dec 17, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# Kind can be one of:
# - breaking-change: a change to previously-documented behavior
# - deprecation: functionality that is being removed in a later release
# - bug-fix: fixes a problem in a previous version
# - enhancement: extends functionality but does not break or fix existing behavior
# - feature: new functionality
# - known-issue: problems that we are aware of in a given version
# - security: impacts on the security of a product or a user’s deployment.
# - upgrade: important information for someone upgrading from a prior version
# - other: does not fit into any of the other categories
kind: bug-fix

# Change summary; a 80ish characters long description of the change.
summary: Fix audit/unenroll call when running fleet-server

# Long description; in case the summary is not enough to describe the change
# this field accommodate a description without length limits.
# NOTE: This field will be rendered only for breaking-change and known-issue kinds at the moment.
description: Fix the call to the audit/unenroll endpoint that occurs on uninstall when the fleet-server is running locally.

# Affected component; usually one of "elastic-agent", "fleet-server", "filebeat", "metricbeat", "auditbeat", "all", etc.
component: fleet-server

# PR URL; optional; the PR number that added the changeset.
# If not present is automatically filled by the tooling finding the PR where this changelog fragment has been added.
# NOTE: the tooling supports backports, so it's able to fill the original PR number instead of the backport PR number.
# Please provide it if you are adding a fragment for a different PR.
#pr: https://github.com/owner/repo/1234

# Issue URL; optional; the GitHub issue related to this changeset (either closes or is part of).
# If not present is automatically filled by the tooling with the issue linked to the PR number.
issue: https://github.com/elastic/elastic-agent/issues/5752
68 changes: 40 additions & 28 deletions internal/pkg/agent/install/uninstall.go
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,45 @@ func Uninstall(ctx context.Context, cfgFile, topPath, uninstallToken string, log
return fmt.Errorf("uninstall must be run from outside the installed path '%s'", topPath)
}

// check if the agent was installed using --unprivileged by checking the file vault for the agent secret (needed on darwin to correctly load the vault)
unprivileged, err := checkForUnprivilegedVault(ctx)
cmacknz marked this conversation as resolved.
Show resolved Hide resolved
if err != nil {
return fmt.Errorf("error checking for unprivileged vault: %w", err)
}

// will only notify fleet of the uninstall command if it can gather config and agentinfo, and is not a stand-alone install
localFleet := false
notifyFleet := false
var ai *info.AgentInfo
c, err := operations.LoadFullAgentConfig(ctx, log, cfgFile, false, unprivileged)
if err != nil {
pt.Describe(fmt.Sprintf("unable to read agent config to determine if notifying Fleet is needed: %v", err))
}
cfg, err := configuration.NewFromConfig(c)
if err != nil {
pt.Describe(fmt.Sprintf("notify Fleet: unable to transform *config.Config to *configuration.Configuration: %v", err))
}

if cfg != nil && !configuration.IsStandalone(cfg.Fleet) {
ai, err = info.NewAgentInfo(ctx, false)
if err != nil {
pt.Describe(fmt.Sprintf("unable to read agent info, Fleet will not be notified of uninstall: %v", err))
} else {
notifyFleet = true
}
if cfg.Fleet != nil && cfg.Fleet.Server != nil {
localFleet = true
}
}

// Skip on Windows because of https://github.com/elastic/elastic-agent/issues/5952
swiatekm marked this conversation as resolved.
Show resolved Hide resolved
// Once the root-cause is identified then this can be re-enabled on Windows.
// Notify fleet-server while it is still running if it's running locally
if notifyFleet && localFleet && runtime.GOOS != "windows" {
cfg.Fleet.Client.Hosts = []string{cfg.Fleet.Client.Host}
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure what's going on here; but Client.Hosts had the remote (cloud) fleet-server address, and Client.Host was the internal port.
I'm going to take some time tomorrow to verify that our normal agent operations still use the internal port when the agent is running fleet-server.

Copy link
Contributor Author

@michel-laterman michel-laterman Nov 22, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

agent does connect to localhost:8221; but strangely enough fleet-server does not populate the server.address attribute in HTTP logs; i'll open a fleet-server PR to fix this
pr here: elastic/fleet-server#4142

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

cfg.Fleet.Client.Hosts = []string{cfg.Fleet.Client.Host} is a bit weird. This is perhaps something else that could be encapsulated into the output of an RPC call to the running agent service since agent itself must know that these represent two different things.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should avoid using an RPC call because we don't want to introduce the requirement that an agent is running for the uninstall to succeed.

Copy link
Member

@pchila pchila Dec 11, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What I think is happening here:

I guess that the merged result of that is not quite right in this scenario and we end up with the discrepancy between Host and Hosts

From discussion with @nchaulet at the time, it was determined that Fleet will send back only Hosts field while the separate Host, Protocol, Path is only handled by agent when initializing config to store command line args. This refactor has been done in context of mTLS from policy support PR as an attempt to clean up the policy change handler.

I filed an issue to have a look and possible remote Host, Protocol and Path fields from the agent config at the time but maybe this needs further evaluation/discussion taking into account fleet bootstrap scenarios

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we link to that issue and add this explanation to the code? This is going to be really not obvious to someone reading this code without the context of this PR.

notifyFleetAuditUninstall(ctx, log, pt, cfg, ai) //nolint:errcheck // ignore the error as we can't act on it
cmacknz marked this conversation as resolved.
Show resolved Hide resolved
}

// ensure service is stopped
status, err := EnsureStoppedService(topPath, pt)
if err != nil {
Expand All @@ -71,12 +110,6 @@ func Uninstall(ctx context.Context, cfgFile, topPath, uninstallToken string, log
return fmt.Errorf("failed trying to kill any running watcher: %w", err)
}

// check if the agent was installed using --unprivileged by checking the file vault for the agent secret (needed on darwin to correctly load the vault)
unprivileged, err := checkForUnprivilegedVault(ctx)
if err != nil {
return fmt.Errorf("error checking for unprivileged vault: %w", err)
}

// Uninstall components first
if err := uninstallComponents(ctx, cfgFile, uninstallToken, log, pt, unprivileged); err != nil {
// If service status was running it was stopped to uninstall the components.
Expand Down Expand Up @@ -111,27 +144,6 @@ func Uninstall(ctx context.Context, cfgFile, topPath, uninstallToken string, log
}
}

// will only notify fleet of the uninstall command if it can gather config and agentinfo, and is not a stand-alone install
notifyFleet := false
var ai *info.AgentInfo
c, err := operations.LoadFullAgentConfig(ctx, log, cfgFile, false, unprivileged)
if err != nil {
pt.Describe(fmt.Sprintf("unable to read agent config to determine if notifying Fleet is needed: %v", err))
}
cfg, err := configuration.NewFromConfig(c)
if err != nil {
pt.Describe(fmt.Sprintf("notify Fleet: unable to transform *config.Config to *configuration.Configuration: %v", err))
}

if cfg != nil && !configuration.IsStandalone(cfg.Fleet) {
ai, err = info.NewAgentInfo(ctx, false)
if err != nil {
pt.Describe(fmt.Sprintf("unable to read agent info, Fleet will not be notified of uninstall: %v", err))
} else {
notifyFleet = true
}
}

// remove existing directory
pt.Describe("Removing install directory")
err = RemovePath(topPath)
Expand All @@ -146,7 +158,7 @@ func Uninstall(ctx context.Context, cfgFile, topPath, uninstallToken string, log

// Skip on Windows because of https://github.com/elastic/elastic-agent/issues/5952
// Once the root-cause is identified then this can be re-enabled on Windows.
if notifyFleet && runtime.GOOS != "windows" {
if notifyFleet && !localFleet && runtime.GOOS != "windows" {
notifyFleetAuditUninstall(ctx, log, pt, cfg, ai) //nolint:errcheck // ignore the error as we can't act on it
}

Expand Down
4 changes: 2 additions & 2 deletions testing/integration/fleetserver_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,8 +47,6 @@ func TestInstallFleetServerBootstrap(t *testing.T) {
Local: false,
})

t.Skip("Skip until the first 8.16.0-SNAPSHOT is available")

ctx, cancel := testcontext.WithDeadline(t, context.Background(), time.Now().Add(10*time.Minute))
defer cancel()

Expand Down Expand Up @@ -165,4 +163,6 @@ func TestInstallFleetServerBootstrap(t *testing.T) {
require.Error(t, err, "uninstall should have failed")
require.Containsf(t, string(out), "uninstall must be run from outside the installed path", "expected error string not found in: %s err: %s", out, err)
}

t.Run("Test audit/unenroll", testUninstallAuditUnenroll(ctx, fixture, info))
cmacknz marked this conversation as resolved.
Show resolved Hide resolved
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This test only runs for privileged agents, because of this block:

	// Run `elastic-agent install` with fleet-server bootstrap options.
	// We use `--force` to prevent interactive execution.
	opts := &atesting.InstallOpts{
		Force:      true,
		Privileged: true,

There shouldn't be a reason for this test to only run privileged, tests should ideally be unprivileged by default as that is the harder case to get right.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For Macs at least, you can't get the agent ID or determine if there is a local Fleet server if the agent is unprivileged. This could be the case on other platforms as well, I just confirmed for Mac by local testing.

If you make this test unprivileged you'll find out.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think i've fixed it; I no longer try to load the agent info through the info package; i just use the agent ID from config. With these changes my local testing (on a mac) shows the uninstalled audit reason, and no more wall of whitespace. I'll see about adding a test that does not run in privileged mode

}
55 changes: 32 additions & 23 deletions testing/integration/install_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -338,32 +338,41 @@ func TestInstallUninstallAudit(t *testing.T) {
return waitForAgentAndFleetHealthy(ctx, t, fixture)
}, time.Minute, time.Second, "agent never became healthy or connected to Fleet")

agentID, err := getAgentID(ctx, fixture)
require.NoError(t, err, "error getting the agent inspect output")
require.NotEmpty(t, agentID, "agent ID empty")
t.Run("run uninstall", testUninstallAuditUnenroll(ctx, fixture, info))
cmacknz marked this conversation as resolved.
Show resolved Hide resolved
}

out, err = fixture.Uninstall(ctx, &atesting.UninstallOpts{Force: true})
if err != nil {
t.Logf("uninstall output: %s", out)
require.NoError(t, err)
}
func testUninstallAuditUnenroll(ctx context.Context, fixture *atesting.Fixture, info *define.Info) func(t *testing.T) {
return func(t *testing.T) {
if runtime.GOOS == "windows" {
t.Skip("Skip Windows as it has been disabled because of https://github.com/elastic/elastic-agent/issues/5952")
}
agentID, err := getAgentID(ctx, fixture)
require.NoError(t, err, "error getting the agent inspect output")
require.NotEmpty(t, agentID, "agent ID empty")

// TODO: replace direct query to ES index with API call to Fleet
// Blocked on https://github.com/elastic/kibana/issues/194884
response, err := info.ESClient.Get(".fleet-agents", agentID, info.ESClient.Get.WithContext(ctx))
require.NoError(t, err)
defer response.Body.Close()
p, err := io.ReadAll(response.Body)
require.NoError(t, err)
require.Equalf(t, http.StatusOK, response.StatusCode, "ES status code expected 200, body: %s", p)
var res struct {
Source struct {
AuditUnenrolledReason string `json:"audit_unenrolled_reason"`
} `json:"_source"`
out, err := fixture.Uninstall(ctx, &atesting.UninstallOpts{Force: true})
if err != nil {
t.Logf("uninstall output: %s", out)
require.NoError(t, err)
}

// TODO: replace direct query to ES index with API call to Fleet
// Blocked on https://github.com/elastic/kibana/issues/194884
response, err := info.ESClient.Get(".fleet-agents", agentID, info.ESClient.Get.WithContext(ctx))
require.NoError(t, err)
defer response.Body.Close()
p, err := io.ReadAll(response.Body)
require.NoError(t, err)
require.Equalf(t, http.StatusOK, response.StatusCode, "ES status code expected 200, body: %s", p)
var res struct {
Source struct {
AuditUnenrolledReason string `json:"audit_unenrolled_reason"`
} `json:"_source"`
}
err = json.Unmarshal(p, &res)
require.NoError(t, err)
require.Equalf(t, "uninstall", res.Source.AuditUnenrolledReason, "uninstall output: %s", out)
}
err = json.Unmarshal(p, &res)
require.NoError(t, err)
require.Equal(t, "uninstall", res.Source.AuditUnenrolledReason)
}

// TestRepeatedInstallUninstall will install then uninstall the agent
Expand Down