From 58c94d2bd33d0fd03336b8f2b56f63953c44abaa Mon Sep 17 00:00:00 2001 From: Kristoffer Dalby Date: Fri, 23 Feb 2024 10:59:24 +0100 Subject: [PATCH] Rework map session This commit restructures the map session in to a struct holding the state of what is needed during its lifetime. For streaming sessions, the event loop is structured a bit differently not hammering the clients with updates but rather batching them over a short, configurable time which should significantly improve cpu usage, and potentially flakyness. The use of Patch updates has been dialed back a little as it does not look like its a 100% ready for prime time. Nodes are now updated with full changes, except for a few things like online status. Signed-off-by: Kristoffer Dalby --- .github/workflows/test-integration.yaml | 3 +- go.mod | 2 + go.sum | 4 + hscontrol/app.go | 61 +- hscontrol/auth.go | 33 +- hscontrol/db/node.go | 101 +- hscontrol/db/node_test.go | 18 +- hscontrol/db/preauth_keys.go | 4 - hscontrol/db/preauth_keys_test.go | 4 +- hscontrol/db/routes.go | 276 +++--- hscontrol/db/routes_test.go | 506 +++++------ hscontrol/grpcv1.go | 113 ++- hscontrol/handlers.go | 19 - hscontrol/mapper/mapper.go | 306 +++---- hscontrol/mapper/mapper_test.go | 65 +- hscontrol/mapper/tail.go | 30 +- hscontrol/mapper/tail_test.go | 9 +- hscontrol/noise.go | 134 +++ hscontrol/notifier/notifier.go | 87 +- hscontrol/oidc.go | 7 +- hscontrol/policy/acls.go | 41 +- hscontrol/policy/acls_test.go | 73 +- hscontrol/poll.go | 1112 +++++++++++++---------- hscontrol/poll_noise.go | 96 -- hscontrol/types/common.go | 60 +- hscontrol/types/config.go | 16 + hscontrol/types/node.go | 27 +- integration/auth_oidc_test.go | 4 +- integration/auth_web_flow_test.go | 4 +- integration/general_test.go | 132 ++- integration/route_test.go | 68 +- integration/scenario.go | 2 + integration/tailscale.go | 2 +- integration/tsic/tsic.go | 83 +- integration/utils.go | 17 +- 35 files changed, 1803 insertions(+), 1716 deletions(-) delete mode 100644 hscontrol/poll_noise.go diff --git a/.github/workflows/test-integration.yaml b/.github/workflows/test-integration.yaml index 06a99db..d93aaca 100644 --- a/.github/workflows/test-integration.yaml +++ b/.github/workflows/test-integration.yaml @@ -43,7 +43,8 @@ jobs: - TestTaildrop - TestResolveMagicDNS - TestExpireNode - - TestNodeOnlineLastSeenStatus + - TestNodeOnlineStatus + - TestPingAllByIPManyUpDown - TestEnablingRoutes - TestHASubnetRouterFailover - TestEnableDisableAutoApprovedRoute diff --git a/go.mod b/go.mod index 20bd86b..bf7e61b 100644 --- a/go.mod +++ b/go.mod @@ -150,6 +150,7 @@ require ( github.com/opencontainers/image-spec v1.1.0-rc6 // indirect github.com/opencontainers/runc v1.1.12 // indirect github.com/pelletier/go-toml/v2 v2.1.1 // indirect + github.com/petermattis/goid v0.0.0-20180202154549-b0b1615b78e5 // indirect github.com/pierrec/lz4/v4 v4.1.21 // indirect github.com/pkg/errors v0.9.1 // indirect github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect @@ -161,6 +162,7 @@ require ( github.com/safchain/ethtool v0.3.0 // indirect github.com/sagikazarmark/locafero v0.4.0 // indirect github.com/sagikazarmark/slog-shim v0.1.0 // indirect + github.com/sasha-s/go-deadlock v0.3.1 // indirect github.com/sirupsen/logrus v1.9.3 // indirect github.com/sourcegraph/conc v0.3.0 // indirect github.com/spf13/afero v1.11.0 // indirect diff --git a/go.sum b/go.sum index 63876d1..703fa08 100644 --- a/go.sum +++ b/go.sum @@ -336,6 +336,8 @@ github.com/patrickmn/go-cache v2.1.0+incompatible h1:HRMgzkcYKYpi3C8ajMPV8OFXaaR github.com/patrickmn/go-cache v2.1.0+incompatible/go.mod h1:3Qf8kWWT7OJRJbdiICTKqZju1ZixQ/KpMGzzAfe6+WQ= github.com/pelletier/go-toml/v2 v2.1.1 h1:LWAJwfNvjQZCFIDKWYQaM62NcYeYViCmWIwmOStowAI= github.com/pelletier/go-toml/v2 v2.1.1/go.mod h1:tJU2Z3ZkXwnxa4DPO899bsyIoywizdUvyaeZurnPPDc= +github.com/petermattis/goid v0.0.0-20180202154549-b0b1615b78e5 h1:q2e307iGHPdTGp0hoxKjt1H5pDo6utceo3dQVK3I5XQ= +github.com/petermattis/goid v0.0.0-20180202154549-b0b1615b78e5/go.mod h1:jvVRKCrJTQWu0XVbaOlby/2lO20uSCHEMzzplHXte1o= github.com/philip-bui/grpc-zerolog v1.0.1 h1:EMacvLRUd2O1K0eWod27ZP5CY1iTNkhBDLSN+Q4JEvA= github.com/philip-bui/grpc-zerolog v1.0.1/go.mod h1:qXbiq/2X4ZUMMshsqlWyTHOcw7ns+GZmlqZZN05ZHcQ= github.com/pierrec/lz4/v4 v4.1.14/go.mod h1:gZWDp/Ze/IJXGXf23ltt2EXimqmTUXEy0GFuRQyBid4= @@ -392,6 +394,8 @@ github.com/sagikazarmark/slog-shim v0.1.0 h1:diDBnUNK9N/354PgrxMywXnAwEr1QZcOr6g github.com/sagikazarmark/slog-shim v0.1.0/go.mod h1:SrcSrq8aKtyuqEI1uvTDTK1arOWRIczQRv+GVI1AkeQ= github.com/samber/lo v1.39.0 h1:4gTz1wUhNYLhFSKl6O+8peW0v2F4BCY034GRpU9WnuA= github.com/samber/lo v1.39.0/go.mod h1:+m/ZKRl6ClXCE2Lgf3MsQlWfh4bn1bz6CXEOxnEXnEA= +github.com/sasha-s/go-deadlock v0.3.1 h1:sqv7fDNShgjcaxkO0JNcOAlr8B9+cV5Ey/OB71efZx0= +github.com/sasha-s/go-deadlock v0.3.1/go.mod h1:F73l+cr82YSh10GxyRI6qZiCgK64VaZjwesgfQ1/iLM= github.com/sergi/go-diff v1.2.0/go.mod h1:STckp+ISIX8hZLjrqAeVduY0gWCT9IjLuqbuNXdaHfM= github.com/sergi/go-diff v1.3.1 h1:xkr+Oxo4BOQKmkn/B9eMK0g5Kg/983T9DqqPHwYqD+8= github.com/sergi/go-diff v1.3.1/go.mod h1:aMJSSKb2lpPvRNec0+w3fl7LP9IOFzdc9Pa4NFbPK1I= diff --git a/hscontrol/app.go b/hscontrol/app.go index a29e53d..bdb5c1d 100644 --- a/hscontrol/app.go +++ b/hscontrol/app.go @@ -28,6 +28,7 @@ import ( "github.com/juanfont/headscale/hscontrol/db" "github.com/juanfont/headscale/hscontrol/derp" derpServer "github.com/juanfont/headscale/hscontrol/derp/server" + "github.com/juanfont/headscale/hscontrol/mapper" "github.com/juanfont/headscale/hscontrol/notifier" "github.com/juanfont/headscale/hscontrol/policy" "github.com/juanfont/headscale/hscontrol/types" @@ -38,6 +39,7 @@ import ( "github.com/prometheus/client_golang/prometheus/promhttp" zl "github.com/rs/zerolog" "github.com/rs/zerolog/log" + "github.com/sasha-s/go-deadlock" "golang.org/x/crypto/acme" "golang.org/x/crypto/acme/autocert" "golang.org/x/oauth2" @@ -77,6 +79,11 @@ const ( registerCacheCleanup = time.Minute * 20 ) +func init() { + deadlock.Opts.DeadlockTimeout = 15 * time.Second + deadlock.Opts.PrintAllCurrentGoroutines = true +} + // Headscale represents the base app of the service. type Headscale struct { cfg *types.Config @@ -89,6 +96,7 @@ type Headscale struct { ACLPolicy *policy.ACLPolicy + mapper *mapper.Mapper nodeNotifier *notifier.Notifier oidcProvider *oidc.Provider @@ -96,8 +104,10 @@ type Headscale struct { registrationCache *cache.Cache - shutdownChan chan struct{} pollNetMapStreamWG sync.WaitGroup + + mapSessions map[types.NodeID]*mapSession + mapSessionMu deadlock.Mutex } var ( @@ -129,6 +139,7 @@ func NewHeadscale(cfg *types.Config) (*Headscale, error) { registrationCache: registrationCache, pollNetMapStreamWG: sync.WaitGroup{}, nodeNotifier: notifier.NewNotifier(), + mapSessions: make(map[types.NodeID]*mapSession), } app.db, err = db.NewHeadscaleDatabase( @@ -199,16 +210,16 @@ func (h *Headscale) redirect(w http.ResponseWriter, req *http.Request) { http.Redirect(w, req, target, http.StatusFound) } -// expireEphemeralNodes deletes ephemeral node records that have not been +// deleteExpireEphemeralNodes deletes ephemeral node records that have not been // seen for longer than h.cfg.EphemeralNodeInactivityTimeout. -func (h *Headscale) expireEphemeralNodes(milliSeconds int64) { +func (h *Headscale) deleteExpireEphemeralNodes(milliSeconds int64) { ticker := time.NewTicker(time.Duration(milliSeconds) * time.Millisecond) - var update types.StateUpdate - var changed bool for range ticker.C { + var removed []types.NodeID + var changed []types.NodeID if err := h.db.DB.Transaction(func(tx *gorm.DB) error { - update, changed = db.ExpireEphemeralNodes(tx, h.cfg.EphemeralNodeInactivityTimeout) + removed, changed = db.DeleteExpiredEphemeralNodes(tx, h.cfg.EphemeralNodeInactivityTimeout) return nil }); err != nil { @@ -216,9 +227,20 @@ func (h *Headscale) expireEphemeralNodes(milliSeconds int64) { continue } - if changed && update.Valid() { + if removed != nil { ctx := types.NotifyCtx(context.Background(), "expire-ephemeral", "na") - h.nodeNotifier.NotifyAll(ctx, update) + h.nodeNotifier.NotifyAll(ctx, types.StateUpdate{ + Type: types.StatePeerRemoved, + Removed: removed, + }) + } + + if changed != nil { + ctx := types.NotifyCtx(context.Background(), "expire-ephemeral", "na") + h.nodeNotifier.NotifyAll(ctx, types.StateUpdate{ + Type: types.StatePeerChanged, + ChangeNodes: changed, + }) } } } @@ -243,8 +265,9 @@ func (h *Headscale) expireExpiredMachines(intervalMs int64) { continue } - log.Trace().Str("nodes", update.ChangeNodes.String()).Msgf("expiring nodes") - if changed && update.Valid() { + if changed { + log.Trace().Interface("nodes", update.ChangePatches).Msgf("expiring nodes") + ctx := types.NotifyCtx(context.Background(), "expire-expired", "na") h.nodeNotifier.NotifyAll(ctx, update) } @@ -272,14 +295,11 @@ func (h *Headscale) scheduledDERPMapUpdateWorker(cancelChan <-chan struct{}) { h.DERPMap.Regions[region.RegionID] = ®ion } - stateUpdate := types.StateUpdate{ + ctx := types.NotifyCtx(context.Background(), "derpmap-update", "na") + h.nodeNotifier.NotifyAll(ctx, types.StateUpdate{ Type: types.StateDERPUpdated, DERPMap: h.DERPMap, - } - if stateUpdate.Valid() { - ctx := types.NotifyCtx(context.Background(), "derpmap-update", "na") - h.nodeNotifier.NotifyAll(ctx, stateUpdate) - } + }) } } } @@ -502,6 +522,7 @@ func (h *Headscale) Serve() error { // Fetch an initial DERP Map before we start serving h.DERPMap = derp.GetDERPMap(h.cfg.DERP) + h.mapper = mapper.NewMapper(h.db, h.cfg, h.DERPMap, h.nodeNotifier.ConnectedMap()) if h.cfg.DERP.ServerEnabled { // When embedded DERP is enabled we always need a STUN server @@ -533,7 +554,7 @@ func (h *Headscale) Serve() error { // TODO(kradalby): These should have cancel channels and be cleaned // up on shutdown. - go h.expireEphemeralNodes(updateInterval) + go h.deleteExpireEphemeralNodes(updateInterval) go h.expireExpiredMachines(updateInterval) if zl.GlobalLevel() == zl.TraceLevel { @@ -686,6 +707,9 @@ func (h *Headscale) Serve() error { // no good way to handle streaming timeouts, therefore we need to // keep this at unlimited and be careful to clean up connections // https://blog.cloudflare.com/the-complete-guide-to-golang-net-http-timeouts/#aboutstreaming + // TODO(kradalby): this timeout can now be set per handler with http.ResponseController: + // https://www.alexedwards.net/blog/how-to-use-the-http-responsecontroller-type + // replace this so only the longpoller has no timeout. WriteTimeout: 0, } @@ -742,7 +766,6 @@ func (h *Headscale) Serve() error { } // Handle common process-killing signals so we can gracefully shut down: - h.shutdownChan = make(chan struct{}) sigc := make(chan os.Signal, 1) signal.Notify(sigc, syscall.SIGHUP, @@ -785,8 +808,6 @@ func (h *Headscale) Serve() error { Str("signal", sig.String()). Msg("Received signal to stop, shutting down gracefully") - close(h.shutdownChan) - h.pollNetMapStreamWG.Wait() // Gracefully shut down servers diff --git a/hscontrol/auth.go b/hscontrol/auth.go index b199fa5..8271038 100644 --- a/hscontrol/auth.go +++ b/hscontrol/auth.go @@ -352,13 +352,8 @@ func (h *Headscale) handleAuthKey( } } - mkey := node.MachineKey - update := types.StateUpdateExpire(node.ID, registerRequest.Expiry) - - if update.Valid() { - ctx := types.NotifyCtx(context.Background(), "handle-authkey", "na") - h.nodeNotifier.NotifyWithIgnore(ctx, update, mkey.String()) - } + ctx := types.NotifyCtx(context.Background(), "handle-authkey", "na") + h.nodeNotifier.NotifyWithIgnore(ctx, types.StateUpdateExpire(node.ID, registerRequest.Expiry), node.ID) } else { now := time.Now().UTC() @@ -538,11 +533,8 @@ func (h *Headscale) handleNodeLogOut( return } - stateUpdate := types.StateUpdateExpire(node.ID, now) - if stateUpdate.Valid() { - ctx := types.NotifyCtx(context.Background(), "logout-expiry", "na") - h.nodeNotifier.NotifyWithIgnore(ctx, stateUpdate, node.MachineKey.String()) - } + ctx := types.NotifyCtx(context.Background(), "logout-expiry", "na") + h.nodeNotifier.NotifyWithIgnore(ctx, types.StateUpdateExpire(node.ID, now), node.ID) resp.AuthURL = "" resp.MachineAuthorized = false @@ -572,7 +564,7 @@ func (h *Headscale) handleNodeLogOut( } if node.IsEphemeral() { - err = h.db.DeleteNode(&node, h.nodeNotifier.ConnectedMap()) + changedNodes, err := h.db.DeleteNode(&node, h.nodeNotifier.ConnectedMap()) if err != nil { log.Error(). Err(err). @@ -580,13 +572,16 @@ func (h *Headscale) handleNodeLogOut( Msg("Cannot delete ephemeral node from the database") } - stateUpdate := types.StateUpdate{ + ctx := types.NotifyCtx(context.Background(), "logout-ephemeral", "na") + h.nodeNotifier.NotifyAll(ctx, types.StateUpdate{ Type: types.StatePeerRemoved, - Removed: []tailcfg.NodeID{tailcfg.NodeID(node.ID)}, - } - if stateUpdate.Valid() { - ctx := types.NotifyCtx(context.Background(), "logout-ephemeral", "na") - h.nodeNotifier.NotifyAll(ctx, stateUpdate) + Removed: []types.NodeID{node.ID}, + }) + if changedNodes != nil { + h.nodeNotifier.NotifyAll(ctx, types.StateUpdate{ + Type: types.StatePeerChanged, + ChangeNodes: changedNodes, + }) } return diff --git a/hscontrol/db/node.go b/hscontrol/db/node.go index d02c2d3..61c952a 100644 --- a/hscontrol/db/node.go +++ b/hscontrol/db/node.go @@ -34,27 +34,22 @@ var ( ) ) -func (hsdb *HSDatabase) ListPeers(node *types.Node) (types.Nodes, error) { +func (hsdb *HSDatabase) ListPeers(nodeID types.NodeID) (types.Nodes, error) { return Read(hsdb.DB, func(rx *gorm.DB) (types.Nodes, error) { - return ListPeers(rx, node) + return ListPeers(rx, nodeID) }) } // ListPeers returns all peers of node, regardless of any Policy or if the node is expired. -func ListPeers(tx *gorm.DB, node *types.Node) (types.Nodes, error) { - log.Trace(). - Caller(). - Str("node", node.Hostname). - Msg("Finding direct peers") - +func ListPeers(tx *gorm.DB, nodeID types.NodeID) (types.Nodes, error) { nodes := types.Nodes{} if err := tx. Preload("AuthKey"). Preload("AuthKey.User"). Preload("User"). Preload("Routes"). - Where("node_key <> ?", - node.NodeKey.String()).Find(&nodes).Error; err != nil { + Where("id <> ?", + nodeID).Find(&nodes).Error; err != nil { return types.Nodes{}, err } @@ -119,14 +114,14 @@ func getNode(tx *gorm.DB, user string, name string) (*types.Node, error) { return nil, ErrNodeNotFound } -func (hsdb *HSDatabase) GetNodeByID(id uint64) (*types.Node, error) { +func (hsdb *HSDatabase) GetNodeByID(id types.NodeID) (*types.Node, error) { return Read(hsdb.DB, func(rx *gorm.DB) (*types.Node, error) { return GetNodeByID(rx, id) }) } // GetNodeByID finds a Node by ID and returns the Node struct. -func GetNodeByID(tx *gorm.DB, id uint64) (*types.Node, error) { +func GetNodeByID(tx *gorm.DB, id types.NodeID) (*types.Node, error) { mach := types.Node{} if result := tx. Preload("AuthKey"). @@ -197,7 +192,7 @@ func GetNodeByAnyKey( } func (hsdb *HSDatabase) SetTags( - nodeID uint64, + nodeID types.NodeID, tags []string, ) error { return hsdb.Write(func(tx *gorm.DB) error { @@ -208,7 +203,7 @@ func (hsdb *HSDatabase) SetTags( // SetTags takes a Node struct pointer and update the forced tags. func SetTags( tx *gorm.DB, - nodeID uint64, + nodeID types.NodeID, tags []string, ) error { if len(tags) == 0 { @@ -256,7 +251,7 @@ func RenameNode(tx *gorm.DB, return nil } -func (hsdb *HSDatabase) NodeSetExpiry(nodeID uint64, expiry time.Time) error { +func (hsdb *HSDatabase) NodeSetExpiry(nodeID types.NodeID, expiry time.Time) error { return hsdb.Write(func(tx *gorm.DB) error { return NodeSetExpiry(tx, nodeID, expiry) }) @@ -264,13 +259,13 @@ func (hsdb *HSDatabase) NodeSetExpiry(nodeID uint64, expiry time.Time) error { // NodeSetExpiry takes a Node struct and a new expiry time. func NodeSetExpiry(tx *gorm.DB, - nodeID uint64, expiry time.Time, + nodeID types.NodeID, expiry time.Time, ) error { return tx.Model(&types.Node{}).Where("id = ?", nodeID).Update("expiry", expiry).Error } -func (hsdb *HSDatabase) DeleteNode(node *types.Node, isConnected map[key.MachinePublic]bool) error { - return hsdb.Write(func(tx *gorm.DB) error { +func (hsdb *HSDatabase) DeleteNode(node *types.Node, isConnected types.NodeConnectedMap) ([]types.NodeID, error) { + return Write(hsdb.DB, func(tx *gorm.DB) ([]types.NodeID, error) { return DeleteNode(tx, node, isConnected) }) } @@ -279,24 +274,24 @@ func (hsdb *HSDatabase) DeleteNode(node *types.Node, isConnected map[key.Machine // Caller is responsible for notifying all of change. func DeleteNode(tx *gorm.DB, node *types.Node, - isConnected map[key.MachinePublic]bool, -) error { - err := deleteNodeRoutes(tx, node, map[key.MachinePublic]bool{}) + isConnected types.NodeConnectedMap, +) ([]types.NodeID, error) { + changed, err := deleteNodeRoutes(tx, node, isConnected) if err != nil { - return err + return changed, err } // Unscoped causes the node to be fully removed from the database. if err := tx.Unscoped().Delete(&node).Error; err != nil { - return err + return changed, err } - return nil + return changed, nil } -// UpdateLastSeen sets a node's last seen field indicating that we +// SetLastSeen sets a node's last seen field indicating that we // have recently communicating with this node. -func UpdateLastSeen(tx *gorm.DB, nodeID uint64, lastSeen time.Time) error { +func SetLastSeen(tx *gorm.DB, nodeID types.NodeID, lastSeen time.Time) error { return tx.Model(&types.Node{}).Where("id = ?", nodeID).Update("last_seen", lastSeen).Error } @@ -606,7 +601,7 @@ func enableRoutes(tx *gorm.DB, return &types.StateUpdate{ Type: types.StatePeerChanged, - ChangeNodes: types.Nodes{node}, + ChangeNodes: []types.NodeID{node.ID}, Message: "created in db.enableRoutes", }, nil } @@ -681,17 +676,18 @@ func GenerateGivenName( return givenName, nil } -func ExpireEphemeralNodes(tx *gorm.DB, +func DeleteExpiredEphemeralNodes(tx *gorm.DB, inactivityThreshhold time.Duration, -) (types.StateUpdate, bool) { +) ([]types.NodeID, []types.NodeID) { users, err := ListUsers(tx) if err != nil { log.Error().Err(err).Msg("Error listing users") - return types.StateUpdate{}, false + return nil, nil } - expired := make([]tailcfg.NodeID, 0) + var expired []types.NodeID + var changedNodes []types.NodeID for _, user := range users { nodes, err := ListNodesByUser(tx, user.Name) if err != nil { @@ -700,40 +696,36 @@ func ExpireEphemeralNodes(tx *gorm.DB, Str("user", user.Name). Msg("Error listing nodes in user") - return types.StateUpdate{}, false + return nil, nil } for idx, node := range nodes { if node.IsEphemeral() && node.LastSeen != nil && time.Now(). After(node.LastSeen.Add(inactivityThreshhold)) { - expired = append(expired, tailcfg.NodeID(node.ID)) + expired = append(expired, node.ID) log.Info(). Str("node", node.Hostname). Msg("Ephemeral client removed from database") // empty isConnected map as ephemeral nodes are not routes - err = DeleteNode(tx, nodes[idx], map[key.MachinePublic]bool{}) + changed, err := DeleteNode(tx, nodes[idx], nil) if err != nil { log.Error(). Err(err). Str("node", node.Hostname). Msg("🤮 Cannot delete ephemeral node from the database") } + + changedNodes = append(changedNodes, changed...) } } // TODO(kradalby): needs to be moved out of transaction } - if len(expired) > 0 { - return types.StateUpdate{ - Type: types.StatePeerRemoved, - Removed: expired, - }, true - } - return types.StateUpdate{}, false + return expired, changedNodes } func ExpireExpiredNodes(tx *gorm.DB, @@ -754,35 +746,12 @@ func ExpireExpiredNodes(tx *gorm.DB, return time.Unix(0, 0), types.StateUpdate{}, false } - for index, node := range nodes { - if node.IsExpired() && - // TODO(kradalby): Replace this, it is very spammy - // It will notify about all nodes that has been expired. - // It should only notify about expired nodes since _last check_. - node.Expiry.After(lastCheck) { + for _, node := range nodes { + if node.IsExpired() && node.Expiry.After(lastCheck) { expired = append(expired, &tailcfg.PeerChange{ NodeID: tailcfg.NodeID(node.ID), KeyExpiry: node.Expiry, }) - - now := time.Now() - // Do not use setNodeExpiry as that has a notifier hook, which - // can cause a deadlock, we are updating all changed nodes later - // and there is no point in notifiying twice. - if err := tx.Model(&nodes[index]).Updates(types.Node{ - Expiry: &now, - }).Error; err != nil { - log.Error(). - Err(err). - Str("node", node.Hostname). - Str("name", node.GivenName). - Msg("🤮 Cannot expire node") - } else { - log.Info(). - Str("node", node.Hostname). - Str("name", node.GivenName). - Msg("Node successfully expired") - } } } diff --git a/hscontrol/db/node_test.go b/hscontrol/db/node_test.go index 5e8eb29..0dbe768 100644 --- a/hscontrol/db/node_test.go +++ b/hscontrol/db/node_test.go @@ -120,7 +120,7 @@ func (s *Suite) TestHardDeleteNode(c *check.C) { } db.DB.Save(&node) - err = db.DeleteNode(&node, map[key.MachinePublic]bool{}) + _, err = db.DeleteNode(&node, types.NodeConnectedMap{}) c.Assert(err, check.IsNil) _, err = db.getNode(user.Name, "testnode3") @@ -142,7 +142,7 @@ func (s *Suite) TestListPeers(c *check.C) { machineKey := key.NewMachine() node := types.Node{ - ID: uint64(index), + ID: types.NodeID(index), MachineKey: machineKey.Public(), NodeKey: nodeKey.Public(), Hostname: "testnode" + strconv.Itoa(index), @@ -156,7 +156,7 @@ func (s *Suite) TestListPeers(c *check.C) { node0ByID, err := db.GetNodeByID(0) c.Assert(err, check.IsNil) - peersOfNode0, err := db.ListPeers(node0ByID) + peersOfNode0, err := db.ListPeers(node0ByID.ID) c.Assert(err, check.IsNil) c.Assert(len(peersOfNode0), check.Equals, 9) @@ -189,7 +189,7 @@ func (s *Suite) TestGetACLFilteredPeers(c *check.C) { machineKey := key.NewMachine() node := types.Node{ - ID: uint64(index), + ID: types.NodeID(index), MachineKey: machineKey.Public(), NodeKey: nodeKey.Public(), IPAddresses: types.NodeAddresses{ @@ -232,16 +232,16 @@ func (s *Suite) TestGetACLFilteredPeers(c *check.C) { c.Logf("Node(%v), user: %v", testNode.Hostname, testNode.User) c.Assert(err, check.IsNil) - adminPeers, err := db.ListPeers(adminNode) + adminPeers, err := db.ListPeers(adminNode.ID) c.Assert(err, check.IsNil) - testPeers, err := db.ListPeers(testNode) + testPeers, err := db.ListPeers(testNode.ID) c.Assert(err, check.IsNil) - adminRules, _, err := policy.GenerateFilterAndSSHRules(aclPolicy, adminNode, adminPeers) + adminRules, _, err := policy.GenerateFilterAndSSHRulesForTests(aclPolicy, adminNode, adminPeers) c.Assert(err, check.IsNil) - testRules, _, err := policy.GenerateFilterAndSSHRules(aclPolicy, testNode, testPeers) + testRules, _, err := policy.GenerateFilterAndSSHRulesForTests(aclPolicy, testNode, testPeers) c.Assert(err, check.IsNil) peersOfAdminNode := policy.FilterNodesByACL(adminNode, adminPeers, adminRules) @@ -586,7 +586,7 @@ func (s *Suite) TestAutoApproveRoutes(c *check.C) { c.Assert(err, check.IsNil) // TODO(kradalby): Check state update - _, err = db.EnableAutoApprovedRoutes(pol, node0ByID) + err = db.EnableAutoApprovedRoutes(pol, node0ByID) c.Assert(err, check.IsNil) enabledRoutes, err := db.GetEnabledRoutes(node0ByID) diff --git a/hscontrol/db/preauth_keys.go b/hscontrol/db/preauth_keys.go index d1d94bb..5d38de2 100644 --- a/hscontrol/db/preauth_keys.go +++ b/hscontrol/db/preauth_keys.go @@ -92,10 +92,6 @@ func CreatePreAuthKey( } } - if err != nil { - return nil, err - } - return &key, nil } diff --git a/hscontrol/db/preauth_keys_test.go b/hscontrol/db/preauth_keys_test.go index 53cf37c..2cd59c4 100644 --- a/hscontrol/db/preauth_keys_test.go +++ b/hscontrol/db/preauth_keys_test.go @@ -148,7 +148,7 @@ func (*Suite) TestEphemeralKeyReusable(c *check.C) { c.Assert(err, check.IsNil) db.DB.Transaction(func(tx *gorm.DB) error { - ExpireEphemeralNodes(tx, time.Second*20) + DeleteExpiredEphemeralNodes(tx, time.Second*20) return nil }) @@ -182,7 +182,7 @@ func (*Suite) TestEphemeralKeyNotReusable(c *check.C) { c.Assert(err, check.IsNil) db.DB.Transaction(func(tx *gorm.DB) error { - ExpireEphemeralNodes(tx, time.Second*20) + DeleteExpiredEphemeralNodes(tx, time.Second*20) return nil }) diff --git a/hscontrol/db/routes.go b/hscontrol/db/routes.go index 1ee144a..9498bc6 100644 --- a/hscontrol/db/routes.go +++ b/hscontrol/db/routes.go @@ -8,7 +8,6 @@ import ( "github.com/juanfont/headscale/hscontrol/types" "github.com/rs/zerolog/log" "gorm.io/gorm" - "tailscale.com/types/key" ) var ErrRouteIsNotAvailable = errors.New("route is not available") @@ -124,8 +123,8 @@ func EnableRoute(tx *gorm.DB, id uint64) (*types.StateUpdate, error) { func DisableRoute(tx *gorm.DB, id uint64, - isConnected map[key.MachinePublic]bool, -) (*types.StateUpdate, error) { + isConnected types.NodeConnectedMap, +) ([]types.NodeID, error) { route, err := GetRoute(tx, id) if err != nil { return nil, err @@ -137,16 +136,15 @@ func DisableRoute(tx *gorm.DB, // Tailscale requires both IPv4 and IPv6 exit routes to // be enabled at the same time, as per // https://github.com/juanfont/headscale/issues/804#issuecomment-1399314002 - var update *types.StateUpdate + var update []types.NodeID if !route.IsExitRoute() { - update, err = failoverRouteReturnUpdate(tx, isConnected, route) + route.Enabled = false + err = tx.Save(route).Error if err != nil { return nil, err } - route.Enabled = false - route.IsPrimary = false - err = tx.Save(route).Error + update, err = failoverRouteTx(tx, isConnected, route) if err != nil { return nil, err } @@ -160,6 +158,7 @@ func DisableRoute(tx *gorm.DB, if routes[i].IsExitRoute() { routes[i].Enabled = false routes[i].IsPrimary = false + err = tx.Save(&routes[i]).Error if err != nil { return nil, err @@ -168,26 +167,11 @@ func DisableRoute(tx *gorm.DB, } } - if routes == nil { - routes, err = GetNodeRoutes(tx, &node) - if err != nil { - return nil, err - } - } - - node.Routes = routes - // If update is empty, it means that one was not created // by failover (as a failover was not necessary), create // one and return to the caller. if update == nil { - update = &types.StateUpdate{ - Type: types.StatePeerChanged, - ChangeNodes: types.Nodes{ - &node, - }, - Message: "called from db.DisableRoute", - } + update = []types.NodeID{node.ID} } return update, nil @@ -195,9 +179,9 @@ func DisableRoute(tx *gorm.DB, func (hsdb *HSDatabase) DeleteRoute( id uint64, - isConnected map[key.MachinePublic]bool, -) (*types.StateUpdate, error) { - return Write(hsdb.DB, func(tx *gorm.DB) (*types.StateUpdate, error) { + isConnected types.NodeConnectedMap, +) ([]types.NodeID, error) { + return Write(hsdb.DB, func(tx *gorm.DB) ([]types.NodeID, error) { return DeleteRoute(tx, id, isConnected) }) } @@ -205,8 +189,8 @@ func (hsdb *HSDatabase) DeleteRoute( func DeleteRoute( tx *gorm.DB, id uint64, - isConnected map[key.MachinePublic]bool, -) (*types.StateUpdate, error) { + isConnected types.NodeConnectedMap, +) ([]types.NodeID, error) { route, err := GetRoute(tx, id) if err != nil { return nil, err @@ -218,9 +202,9 @@ func DeleteRoute( // Tailscale requires both IPv4 and IPv6 exit routes to // be enabled at the same time, as per // https://github.com/juanfont/headscale/issues/804#issuecomment-1399314002 - var update *types.StateUpdate + var update []types.NodeID if !route.IsExitRoute() { - update, err = failoverRouteReturnUpdate(tx, isConnected, route) + update, err = failoverRouteTx(tx, isConnected, route) if err != nil { return nil, nil } @@ -229,7 +213,7 @@ func DeleteRoute( return nil, err } } else { - routes, err := GetNodeRoutes(tx, &node) + routes, err = GetNodeRoutes(tx, &node) if err != nil { return nil, err } @@ -259,35 +243,37 @@ func DeleteRoute( node.Routes = routes if update == nil { - update = &types.StateUpdate{ - Type: types.StatePeerChanged, - ChangeNodes: types.Nodes{ - &node, - }, - Message: "called from db.DeleteRoute", - } + update = []types.NodeID{node.ID} } return update, nil } -func deleteNodeRoutes(tx *gorm.DB, node *types.Node, isConnected map[key.MachinePublic]bool) error { +func deleteNodeRoutes(tx *gorm.DB, node *types.Node, isConnected types.NodeConnectedMap) ([]types.NodeID, error) { routes, err := GetNodeRoutes(tx, node) if err != nil { - return err + return nil, err } + var changed []types.NodeID for i := range routes { if err := tx.Unscoped().Delete(&routes[i]).Error; err != nil { - return err + return nil, err } // TODO(kradalby): This is a bit too aggressive, we could probably // figure out which routes needs to be failed over rather than all. - failoverRouteReturnUpdate(tx, isConnected, &routes[i]) + chn, err := failoverRouteTx(tx, isConnected, &routes[i]) + if err != nil { + return changed, err + } + + if chn != nil { + changed = append(changed, chn...) + } } - return nil + return changed, nil } // isUniquePrefix returns if there is another node providing the same route already. @@ -400,7 +386,7 @@ func SaveNodeRoutes(tx *gorm.DB, node *types.Node) (bool, error) { for prefix, exists := range advertisedRoutes { if !exists { route := types.Route{ - NodeID: node.ID, + NodeID: node.ID.Uint64(), Prefix: types.IPPrefix(prefix), Advertised: true, Enabled: false, @@ -415,19 +401,23 @@ func SaveNodeRoutes(tx *gorm.DB, node *types.Node) (bool, error) { return sendUpdate, nil } -// EnsureFailoverRouteIsAvailable takes a node and checks if the node's route +// FailoverRouteIfAvailable takes a node and checks if the node's route // currently have a functioning host that exposes the network. -func EnsureFailoverRouteIsAvailable( +// If it does not, it is failed over to another suitable route if there +// is one. +func FailoverRouteIfAvailable( tx *gorm.DB, - isConnected map[key.MachinePublic]bool, + isConnected types.NodeConnectedMap, node *types.Node, ) (*types.StateUpdate, error) { + log.Debug().Caller().Uint64("node.id", node.ID.Uint64()).Msgf("ROUTE DEBUG ENTERED FAILOVER") nodeRoutes, err := GetNodeRoutes(tx, node) if err != nil { + log.Debug().Caller().Uint64("node.id", node.ID.Uint64()).Interface("nodeRoutes", nodeRoutes).Msgf("ROUTE DEBUG NO ROUTES") return nil, nil } - var changedNodes types.Nodes + var changedNodes []types.NodeID for _, nodeRoute := range nodeRoutes { routes, err := getRoutesByPrefix(tx, netip.Prefix(nodeRoute.Prefix)) if err != nil { @@ -438,71 +428,39 @@ func EnsureFailoverRouteIsAvailable( if route.IsPrimary { // if we have a primary route, and the node is connected // nothing needs to be done. - if isConnected[route.Node.MachineKey] { - continue + log.Debug().Caller().Uint64("node.id", node.ID.Uint64()).Uint64("route.node.id", route.Node.ID.Uint64()).Msgf("ROUTE DEBUG CHECKING IF ONLINE") + if isConnected[route.Node.ID] { + log.Debug().Caller().Uint64("node.id", node.ID.Uint64()).Uint64("route.node.id", route.Node.ID.Uint64()).Msgf("ROUTE DEBUG IS ONLINE") + return nil, nil } + log.Debug().Caller().Uint64("node.id", node.ID.Uint64()).Uint64("route.node.id", route.Node.ID.Uint64()).Msgf("ROUTE DEBUG NOT ONLINE, FAILING OVER") // if not, we need to failover the route - update, err := failoverRouteReturnUpdate(tx, isConnected, &route) + changedIDs, err := failoverRouteTx(tx, isConnected, &route) if err != nil { return nil, err } - if update != nil { - changedNodes = append(changedNodes, update.ChangeNodes...) + if changedIDs != nil { + changedNodes = append(changedNodes, changedIDs...) } } } } + log.Debug().Caller().Uint64("node.id", node.ID.Uint64()).Interface("changedNodes", changedNodes).Msgf("ROUTE DEBUG") if len(changedNodes) != 0 { return &types.StateUpdate{ Type: types.StatePeerChanged, ChangeNodes: changedNodes, - Message: "called from db.EnsureFailoverRouteIsAvailable", + Message: "called from db.FailoverRouteIfAvailable", }, nil } return nil, nil } -func failoverRouteReturnUpdate( - tx *gorm.DB, - isConnected map[key.MachinePublic]bool, - r *types.Route, -) (*types.StateUpdate, error) { - changedKeys, err := failoverRoute(tx, isConnected, r) - if err != nil { - return nil, err - } - - log.Trace(). - Interface("isConnected", isConnected). - Interface("changedKeys", changedKeys). - Msg("building route failover") - - if len(changedKeys) == 0 { - return nil, nil - } - - var nodes types.Nodes - for _, key := range changedKeys { - node, err := GetNodeByMachineKey(tx, key) - if err != nil { - return nil, err - } - - nodes = append(nodes, node) - } - - return &types.StateUpdate{ - Type: types.StatePeerChanged, - ChangeNodes: nodes, - Message: "called from db.failoverRouteReturnUpdate", - }, nil -} - -// failoverRoute takes a route that is no longer available, +// failoverRouteTx takes a route that is no longer available, // this can be either from: // - being disabled // - being deleted @@ -510,11 +468,11 @@ func failoverRouteReturnUpdate( // // and tries to find a new route to take over its place. // If the given route was not primary, it returns early. -func failoverRoute( +func failoverRouteTx( tx *gorm.DB, - isConnected map[key.MachinePublic]bool, + isConnected types.NodeConnectedMap, r *types.Route, -) ([]key.MachinePublic, error) { +) ([]types.NodeID, error) { if r == nil { return nil, nil } @@ -535,11 +493,64 @@ func failoverRoute( return nil, err } + fo := failoverRoute(isConnected, r, routes) + if fo == nil { + return nil, nil + } + + err = tx.Save(fo.old).Error + if err != nil { + log.Error().Err(err).Msg("disabling old primary route") + + return nil, err + } + + err = tx.Save(fo.new).Error + if err != nil { + log.Error().Err(err).Msg("saving new primary route") + + return nil, err + } + + log.Trace(). + Str("hostname", fo.new.Node.Hostname). + Msgf("set primary to new route, was: id(%d), host(%s), now: id(%d), host(%s)", fo.old.ID, fo.old.Node.Hostname, fo.new.ID, fo.new.Node.Hostname) + + // Return a list of the machinekeys of the changed nodes. + return []types.NodeID{fo.old.Node.ID, fo.new.Node.ID}, nil +} + +type failover struct { + old *types.Route + new *types.Route +} + +func failoverRoute( + isConnected types.NodeConnectedMap, + routeToReplace *types.Route, + altRoutes types.Routes, + +) *failover { + if routeToReplace == nil { + return nil + } + + // This route is not a primary route, and it is not + // being served to nodes. + if !routeToReplace.IsPrimary { + return nil + } + + // We do not have to failover exit nodes + if routeToReplace.IsExitRoute() { + return nil + } + var newPrimary *types.Route // Find a new suitable route - for idx, route := range routes { - if r.ID == route.ID { + for idx, route := range altRoutes { + if routeToReplace.ID == route.ID { continue } @@ -547,8 +558,8 @@ func failoverRoute( continue } - if isConnected[route.Node.MachineKey] { - newPrimary = &routes[idx] + if isConnected != nil && isConnected[route.Node.ID] { + newPrimary = &altRoutes[idx] break } } @@ -559,48 +570,23 @@ func failoverRoute( // the one currently marked as primary is the // best we got. if newPrimary == nil { - return nil, nil + return nil } - log.Trace(). - Str("hostname", newPrimary.Node.Hostname). - Msg("found new primary, updating db") - - // Remove primary from the old route - r.IsPrimary = false - err = tx.Save(&r).Error - if err != nil { - log.Error().Err(err).Msg("error disabling new primary route") - - return nil, err - } - - log.Trace(). - Str("hostname", newPrimary.Node.Hostname). - Msg("removed primary from old route") - - // Set primary for the new primary + routeToReplace.IsPrimary = false newPrimary.IsPrimary = true - err = tx.Save(&newPrimary).Error - if err != nil { - log.Error().Err(err).Msg("error enabling new primary route") - return nil, err + return &failover{ + old: routeToReplace, + new: newPrimary, } - - log.Trace(). - Str("hostname", newPrimary.Node.Hostname). - Msg("set primary to new route") - - // Return a list of the machinekeys of the changed nodes. - return []key.MachinePublic{r.Node.MachineKey, newPrimary.Node.MachineKey}, nil } func (hsdb *HSDatabase) EnableAutoApprovedRoutes( aclPolicy *policy.ACLPolicy, node *types.Node, -) (*types.StateUpdate, error) { - return Write(hsdb.DB, func(tx *gorm.DB) (*types.StateUpdate, error) { +) error { + return hsdb.Write(func(tx *gorm.DB) error { return EnableAutoApprovedRoutes(tx, aclPolicy, node) }) } @@ -610,9 +596,9 @@ func EnableAutoApprovedRoutes( tx *gorm.DB, aclPolicy *policy.ACLPolicy, node *types.Node, -) (*types.StateUpdate, error) { +) error { if len(node.IPAddresses) == 0 { - return nil, nil // This node has no IPAddresses, so can't possibly match any autoApprovers ACLs + return nil // This node has no IPAddresses, so can't possibly match any autoApprovers ACLs } routes, err := GetNodeAdvertisedRoutes(tx, node) @@ -623,7 +609,7 @@ func EnableAutoApprovedRoutes( Str("node", node.Hostname). Msg("Could not get advertised routes for node") - return nil, err + return err } log.Trace().Interface("routes", routes).Msg("routes for autoapproving") @@ -641,10 +627,10 @@ func EnableAutoApprovedRoutes( if err != nil { log.Err(err). Str("advertisedRoute", advertisedRoute.String()). - Uint64("nodeId", node.ID). + Uint64("nodeId", node.ID.Uint64()). Msg("Failed to resolve autoApprovers for advertised route") - return nil, err + return err } log.Trace(). @@ -665,7 +651,7 @@ func EnableAutoApprovedRoutes( Str("alias", approvedAlias). Msg("Failed to expand alias when processing autoApprovers policy") - return nil, err + return err } // approvedIPs should contain all of node's IPs if it matches the rule, so check for first @@ -676,25 +662,17 @@ func EnableAutoApprovedRoutes( } } - update := &types.StateUpdate{ - Type: types.StatePeerChanged, - ChangeNodes: types.Nodes{}, - Message: "created in db.EnableAutoApprovedRoutes", - } - for _, approvedRoute := range approvedRoutes { - perHostUpdate, err := EnableRoute(tx, uint64(approvedRoute.ID)) + _, err := EnableRoute(tx, uint64(approvedRoute.ID)) if err != nil { log.Err(err). Str("approvedRoute", approvedRoute.String()). - Uint64("nodeId", node.ID). + Uint64("nodeId", node.ID.Uint64()). Msg("Failed to enable approved route") - return nil, err + return err } - - update.ChangeNodes = append(update.ChangeNodes, perHostUpdate.ChangeNodes...) } - return update, nil + return nil } diff --git a/hscontrol/db/routes_test.go b/hscontrol/db/routes_test.go index f3357e2..390cf70 100644 --- a/hscontrol/db/routes_test.go +++ b/hscontrol/db/routes_test.go @@ -13,7 +13,6 @@ import ( "gopkg.in/check.v1" "gorm.io/gorm" "tailscale.com/tailcfg" - "tailscale.com/types/key" ) func (s *Suite) TestGetRoutes(c *check.C) { @@ -262,7 +261,7 @@ func (s *Suite) TestDeleteRoutes(c *check.C) { c.Assert(err, check.IsNil) // TODO(kradalby): check stateupdate - _, err = db.DeleteRoute(uint64(routes[0].ID), map[key.MachinePublic]bool{}) + _, err = db.DeleteRoute(uint64(routes[0].ID), nil) c.Assert(err, check.IsNil) enabledRoutes1, err := db.GetEnabledRoutes(&node1) @@ -272,20 +271,13 @@ func (s *Suite) TestDeleteRoutes(c *check.C) { var ipp = func(s string) types.IPPrefix { return types.IPPrefix(netip.MustParsePrefix(s)) } -func TestFailoverRoute(t *testing.T) { - machineKeys := []key.MachinePublic{ - key.NewMachine().Public(), - key.NewMachine().Public(), - key.NewMachine().Public(), - key.NewMachine().Public(), - } - +func TestFailoverRouteTx(t *testing.T) { tests := []struct { name string failingRoute types.Route routes types.Routes - isConnected map[key.MachinePublic]bool - want []key.MachinePublic + isConnected types.NodeConnectedMap + want []types.NodeID wantErr bool }{ { @@ -301,10 +293,8 @@ func TestFailoverRoute(t *testing.T) { Model: gorm.Model{ ID: 1, }, - Prefix: ipp("10.0.0.0/24"), - Node: types.Node{ - MachineKey: machineKeys[0], - }, + Prefix: ipp("10.0.0.0/24"), + Node: types.Node{}, IsPrimary: false, }, routes: types.Routes{}, @@ -317,10 +307,8 @@ func TestFailoverRoute(t *testing.T) { Model: gorm.Model{ ID: 1, }, - Prefix: ipp("0.0.0.0/0"), - Node: types.Node{ - MachineKey: machineKeys[0], - }, + Prefix: ipp("0.0.0.0/0"), + Node: types.Node{}, IsPrimary: true, }, routes: types.Routes{}, @@ -335,7 +323,7 @@ func TestFailoverRoute(t *testing.T) { }, Prefix: ipp("10.0.0.0/24"), Node: types.Node{ - MachineKey: machineKeys[0], + ID: 1, }, IsPrimary: true, }, @@ -346,7 +334,7 @@ func TestFailoverRoute(t *testing.T) { }, Prefix: ipp("10.0.0.0/24"), Node: types.Node{ - MachineKey: machineKeys[0], + ID: 1, }, IsPrimary: true, }, @@ -362,7 +350,7 @@ func TestFailoverRoute(t *testing.T) { }, Prefix: ipp("10.0.0.0/24"), Node: types.Node{ - MachineKey: machineKeys[0], + ID: 1, }, IsPrimary: true, Enabled: true, @@ -374,7 +362,7 @@ func TestFailoverRoute(t *testing.T) { }, Prefix: ipp("10.0.0.0/24"), Node: types.Node{ - MachineKey: machineKeys[0], + ID: 1, }, IsPrimary: true, Enabled: true, @@ -385,19 +373,19 @@ func TestFailoverRoute(t *testing.T) { }, Prefix: ipp("10.0.0.0/24"), Node: types.Node{ - MachineKey: machineKeys[1], + ID: 2, }, IsPrimary: false, Enabled: true, }, }, - isConnected: map[key.MachinePublic]bool{ - machineKeys[0]: false, - machineKeys[1]: true, + isConnected: types.NodeConnectedMap{ + 1: false, + 2: true, }, - want: []key.MachinePublic{ - machineKeys[0], - machineKeys[1], + want: []types.NodeID{ + 1, + 2, }, wantErr: false, }, @@ -409,7 +397,7 @@ func TestFailoverRoute(t *testing.T) { }, Prefix: ipp("10.0.0.0/24"), Node: types.Node{ - MachineKey: machineKeys[0], + ID: 1, }, IsPrimary: false, Enabled: true, @@ -421,7 +409,7 @@ func TestFailoverRoute(t *testing.T) { }, Prefix: ipp("10.0.0.0/24"), Node: types.Node{ - MachineKey: machineKeys[0], + ID: 1, }, IsPrimary: true, Enabled: true, @@ -432,7 +420,7 @@ func TestFailoverRoute(t *testing.T) { }, Prefix: ipp("10.0.0.0/24"), Node: types.Node{ - MachineKey: machineKeys[1], + ID: 2, }, IsPrimary: false, Enabled: true, @@ -449,7 +437,7 @@ func TestFailoverRoute(t *testing.T) { }, Prefix: ipp("10.0.0.0/24"), Node: types.Node{ - MachineKey: machineKeys[1], + ID: 2, }, IsPrimary: true, Enabled: true, @@ -461,7 +449,7 @@ func TestFailoverRoute(t *testing.T) { }, Prefix: ipp("10.0.0.0/24"), Node: types.Node{ - MachineKey: machineKeys[0], + ID: 1, }, IsPrimary: false, Enabled: true, @@ -472,7 +460,7 @@ func TestFailoverRoute(t *testing.T) { }, Prefix: ipp("10.0.0.0/24"), Node: types.Node{ - MachineKey: machineKeys[1], + ID: 2, }, IsPrimary: true, Enabled: true, @@ -483,20 +471,19 @@ func TestFailoverRoute(t *testing.T) { }, Prefix: ipp("10.0.0.0/24"), Node: types.Node{ - MachineKey: machineKeys[2], + ID: 3, }, IsPrimary: false, Enabled: true, }, }, - isConnected: map[key.MachinePublic]bool{ - machineKeys[0]: true, - machineKeys[1]: true, - machineKeys[2]: true, + isConnected: types.NodeConnectedMap{ + 1: true, + 2: true, + 3: true, }, - want: []key.MachinePublic{ - machineKeys[1], - machineKeys[0], + want: []types.NodeID{ + 2, 1, }, wantErr: false, }, @@ -508,7 +495,7 @@ func TestFailoverRoute(t *testing.T) { }, Prefix: ipp("10.0.0.0/24"), Node: types.Node{ - MachineKey: machineKeys[0], + ID: 1, }, IsPrimary: true, Enabled: true, @@ -520,7 +507,7 @@ func TestFailoverRoute(t *testing.T) { }, Prefix: ipp("10.0.0.0/24"), Node: types.Node{ - MachineKey: machineKeys[0], + ID: 1, }, IsPrimary: true, Enabled: true, @@ -532,15 +519,15 @@ func TestFailoverRoute(t *testing.T) { }, Prefix: ipp("10.0.0.0/24"), Node: types.Node{ - MachineKey: machineKeys[3], + ID: 4, }, IsPrimary: false, Enabled: true, }, }, - isConnected: map[key.MachinePublic]bool{ - machineKeys[0]: true, - machineKeys[3]: false, + isConnected: types.NodeConnectedMap{ + 1: true, + 4: false, }, want: nil, wantErr: false, @@ -553,7 +540,7 @@ func TestFailoverRoute(t *testing.T) { }, Prefix: ipp("10.0.0.0/24"), Node: types.Node{ - MachineKey: machineKeys[0], + ID: 1, }, IsPrimary: true, Enabled: true, @@ -565,7 +552,7 @@ func TestFailoverRoute(t *testing.T) { }, Prefix: ipp("10.0.0.0/24"), Node: types.Node{ - MachineKey: machineKeys[0], + ID: 1, }, IsPrimary: true, Enabled: true, @@ -577,7 +564,7 @@ func TestFailoverRoute(t *testing.T) { }, Prefix: ipp("10.0.0.0/24"), Node: types.Node{ - MachineKey: machineKeys[3], + ID: 4, }, IsPrimary: false, Enabled: true, @@ -588,20 +575,20 @@ func TestFailoverRoute(t *testing.T) { }, Prefix: ipp("10.0.0.0/24"), Node: types.Node{ - MachineKey: machineKeys[1], + ID: 2, }, IsPrimary: true, Enabled: true, }, }, - isConnected: map[key.MachinePublic]bool{ - machineKeys[0]: false, - machineKeys[1]: true, - machineKeys[3]: false, + isConnected: types.NodeConnectedMap{ + 1: false, + 2: true, + 4: false, }, - want: []key.MachinePublic{ - machineKeys[0], - machineKeys[1], + want: []types.NodeID{ + 1, + 2, }, wantErr: false, }, @@ -613,7 +600,7 @@ func TestFailoverRoute(t *testing.T) { }, Prefix: ipp("10.0.0.0/24"), Node: types.Node{ - MachineKey: machineKeys[0], + ID: 1, }, IsPrimary: true, Enabled: true, @@ -625,7 +612,7 @@ func TestFailoverRoute(t *testing.T) { }, Prefix: ipp("10.0.0.0/24"), Node: types.Node{ - MachineKey: machineKeys[0], + ID: 1, }, IsPrimary: true, Enabled: true, @@ -637,7 +624,7 @@ func TestFailoverRoute(t *testing.T) { }, Prefix: ipp("10.0.0.0/24"), Node: types.Node{ - MachineKey: machineKeys[1], + ID: 2, }, IsPrimary: false, Enabled: false, @@ -670,8 +657,8 @@ func TestFailoverRoute(t *testing.T) { } } - got, err := Write(db.DB, func(tx *gorm.DB) ([]key.MachinePublic, error) { - return failoverRoute(tx, tt.isConnected, &tt.failingRoute) + got, err := Write(db.DB, func(tx *gorm.DB) ([]types.NodeID, error) { + return failoverRouteTx(tx, tt.isConnected, &tt.failingRoute) }) if (err != nil) != tt.wantErr { @@ -687,230 +674,177 @@ func TestFailoverRoute(t *testing.T) { } } -// func TestDisableRouteFailover(t *testing.T) { -// machineKeys := []key.MachinePublic{ -// key.NewMachine().Public(), -// key.NewMachine().Public(), -// key.NewMachine().Public(), -// key.NewMachine().Public(), -// } +func TestFailoverRoute(t *testing.T) { + r := func(id uint, nid types.NodeID, prefix types.IPPrefix, enabled, primary bool) types.Route { + return types.Route{ + Model: gorm.Model{ + ID: id, + }, + Node: types.Node{ + ID: nid, + }, + Prefix: prefix, + Enabled: enabled, + IsPrimary: primary, + } + } + rp := func(id uint, nid types.NodeID, prefix types.IPPrefix, enabled, primary bool) *types.Route { + ro := r(id, nid, prefix, enabled, primary) + return &ro + } + tests := []struct { + name string + failingRoute types.Route + routes types.Routes + isConnected types.NodeConnectedMap + want *failover + }{ + { + name: "no-route", + failingRoute: types.Route{}, + routes: types.Routes{}, + want: nil, + }, + { + name: "no-prime", + failingRoute: r(1, 1, ipp("10.0.0.0/24"), false, false), -// tests := []struct { -// name string -// nodes types.Nodes + routes: types.Routes{}, + want: nil, + }, + { + name: "exit-node", + failingRoute: r(1, 1, ipp("0.0.0.0/0"), false, true), + routes: types.Routes{}, + want: nil, + }, + { + name: "no-failover-single-route", + failingRoute: r(1, 1, ipp("10.0.0.0/24"), false, true), + routes: types.Routes{ + r(1, 1, ipp("10.0.0.0/24"), false, true), + }, + want: nil, + }, + { + name: "failover-primary", + failingRoute: r(1, 1, ipp("10.0.0.0/24"), true, true), + routes: types.Routes{ + r(1, 1, ipp("10.0.0.0/24"), true, true), + r(2, 2, ipp("10.0.0.0/24"), true, false), + }, + isConnected: types.NodeConnectedMap{ + 1: false, + 2: true, + }, + want: &failover{ + old: rp(1, 1, ipp("10.0.0.0/24"), true, false), + new: rp(2, 2, ipp("10.0.0.0/24"), true, true), + }, + }, + { + name: "failover-none-primary", + failingRoute: r(1, 1, ipp("10.0.0.0/24"), true, false), + routes: types.Routes{ + r(1, 1, ipp("10.0.0.0/24"), true, true), + r(2, 2, ipp("10.0.0.0/24"), true, false), + }, + want: nil, + }, + { + name: "failover-primary-multi-route", + failingRoute: r(2, 2, ipp("10.0.0.0/24"), true, true), + routes: types.Routes{ + r(1, 1, ipp("10.0.0.0/24"), true, false), + r(2, 2, ipp("10.0.0.0/24"), true, true), + r(3, 3, ipp("10.0.0.0/24"), true, false), + }, + isConnected: types.NodeConnectedMap{ + 1: true, + 2: true, + 3: true, + }, + want: &failover{ + old: rp(2, 2, ipp("10.0.0.0/24"), true, false), + new: rp(1, 1, ipp("10.0.0.0/24"), true, true), + }, + }, + { + name: "failover-primary-no-online", + failingRoute: r(1, 1, ipp("10.0.0.0/24"), true, true), + routes: types.Routes{ + r(1, 1, ipp("10.0.0.0/24"), true, true), + r(2, 4, ipp("10.0.0.0/24"), true, false), + }, + isConnected: types.NodeConnectedMap{ + 1: true, + 4: false, + }, + want: nil, + }, + { + name: "failover-primary-one-not-online", + failingRoute: r(1, 1, ipp("10.0.0.0/24"), true, true), + routes: types.Routes{ + r(1, 1, ipp("10.0.0.0/24"), true, true), + r(2, 4, ipp("10.0.0.0/24"), true, false), + r(3, 2, ipp("10.0.0.0/24"), true, false), + }, + isConnected: types.NodeConnectedMap{ + 1: false, + 2: true, + 4: false, + }, + want: &failover{ + old: rp(1, 1, ipp("10.0.0.0/24"), true, false), + new: rp(3, 2, ipp("10.0.0.0/24"), true, true), + }, + }, + { + name: "failover-primary-none-enabled", + failingRoute: r(1, 1, ipp("10.0.0.0/24"), true, true), + routes: types.Routes{ + r(1, 1, ipp("10.0.0.0/24"), true, false), + r(2, 2, ipp("10.0.0.0/24"), false, true), + }, + want: nil, + }, + } -// routeID uint64 -// isConnected map[key.MachinePublic]bool + cmps := append( + util.Comparers, + cmp.Comparer(func(x, y types.IPPrefix) bool { + return netip.Prefix(x) == netip.Prefix(y) + }), + ) -// wantMachineKey key.MachinePublic -// wantErr string -// }{ -// { -// name: "single-route", -// nodes: types.Nodes{ -// &types.Node{ -// ID: 0, -// MachineKey: machineKeys[0], -// Routes: []types.Route{ -// { -// Model: gorm.Model{ -// ID: 1, -// }, -// Prefix: ipp("10.0.0.0/24"), -// Node: types.Node{ -// MachineKey: machineKeys[0], -// }, -// IsPrimary: true, -// }, -// }, -// Hostinfo: &tailcfg.Hostinfo{ -// RoutableIPs: []netip.Prefix{ -// netip.MustParsePrefix("10.0.0.0/24"), -// }, -// }, -// }, -// }, -// routeID: 1, -// wantMachineKey: machineKeys[0], -// }, -// { -// name: "failover-simple", -// nodes: types.Nodes{ -// &types.Node{ -// ID: 0, -// MachineKey: machineKeys[0], -// Routes: []types.Route{ -// { -// Model: gorm.Model{ -// ID: 1, -// }, -// Prefix: ipp("10.0.0.0/24"), -// IsPrimary: true, -// }, -// }, -// Hostinfo: &tailcfg.Hostinfo{ -// RoutableIPs: []netip.Prefix{ -// netip.MustParsePrefix("10.0.0.0/24"), -// }, -// }, -// }, -// &types.Node{ -// ID: 1, -// MachineKey: machineKeys[1], -// Routes: []types.Route{ -// { -// Model: gorm.Model{ -// ID: 2, -// }, -// Prefix: ipp("10.0.0.0/24"), -// IsPrimary: false, -// }, -// }, -// Hostinfo: &tailcfg.Hostinfo{ -// RoutableIPs: []netip.Prefix{ -// netip.MustParsePrefix("10.0.0.0/24"), -// }, -// }, -// }, -// }, -// routeID: 1, -// wantMachineKey: machineKeys[1], -// }, -// { -// name: "no-failover-offline", -// nodes: types.Nodes{ -// &types.Node{ -// ID: 0, -// MachineKey: machineKeys[0], -// Routes: []types.Route{ -// { -// Model: gorm.Model{ -// ID: 1, -// }, -// Prefix: ipp("10.0.0.0/24"), -// IsPrimary: true, -// }, -// }, -// Hostinfo: &tailcfg.Hostinfo{ -// RoutableIPs: []netip.Prefix{ -// netip.MustParsePrefix("10.0.0.0/24"), -// }, -// }, -// }, -// &types.Node{ -// ID: 1, -// MachineKey: machineKeys[1], -// Routes: []types.Route{ -// { -// Model: gorm.Model{ -// ID: 2, -// }, -// Prefix: ipp("10.0.0.0/24"), -// IsPrimary: false, -// }, -// }, -// Hostinfo: &tailcfg.Hostinfo{ -// RoutableIPs: []netip.Prefix{ -// netip.MustParsePrefix("10.0.0.0/24"), -// }, -// }, -// }, -// }, -// isConnected: map[key.MachinePublic]bool{ -// machineKeys[0]: true, -// machineKeys[1]: false, -// }, -// routeID: 1, -// wantMachineKey: machineKeys[1], -// }, -// { -// name: "failover-to-online", -// nodes: types.Nodes{ -// &types.Node{ -// ID: 0, -// MachineKey: machineKeys[0], -// Routes: []types.Route{ -// { -// Model: gorm.Model{ -// ID: 1, -// }, -// Prefix: ipp("10.0.0.0/24"), -// IsPrimary: true, -// }, -// }, -// Hostinfo: &tailcfg.Hostinfo{ -// RoutableIPs: []netip.Prefix{ -// netip.MustParsePrefix("10.0.0.0/24"), -// }, -// }, -// }, -// &types.Node{ -// ID: 1, -// MachineKey: machineKeys[1], -// Routes: []types.Route{ -// { -// Model: gorm.Model{ -// ID: 2, -// }, -// Prefix: ipp("10.0.0.0/24"), -// IsPrimary: false, -// }, -// }, -// Hostinfo: &tailcfg.Hostinfo{ -// RoutableIPs: []netip.Prefix{ -// netip.MustParsePrefix("10.0.0.0/24"), -// }, -// }, -// }, -// }, -// isConnected: map[key.MachinePublic]bool{ -// machineKeys[0]: true, -// machineKeys[1]: true, -// }, -// routeID: 1, -// wantMachineKey: machineKeys[1], -// }, -// } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + gotf := failoverRoute(tt.isConnected, &tt.failingRoute, tt.routes) -// for _, tt := range tests { -// t.Run(tt.name, func(t *testing.T) { -// datab, err := NewHeadscaleDatabase("sqlite3", ":memory:", false, []netip.Prefix{}, "") -// assert.NoError(t, err) + if tt.want == nil && gotf != nil { + t.Fatalf("expected nil, got %+v", gotf) + } -// // bootstrap db -// datab.DB.Transaction(func(tx *gorm.DB) error { -// for _, node := range tt.nodes { -// err := tx.Save(node).Error -// if err != nil { -// return err -// } + if gotf == nil && tt.want != nil { + t.Fatalf("expected %+v, got nil", tt.want) + } -// _, err = SaveNodeRoutes(tx, node) -// if err != nil { -// return err -// } -// } + if tt.want != nil && gotf != nil { + want := map[string]*types.Route{ + "new": tt.want.new, + "old": tt.want.old, + } -// return nil -// }) + got := map[string]*types.Route{ + "new": gotf.new, + "old": gotf.old, + } -// got, err := Write(datab.DB, func(tx *gorm.DB) (*types.StateUpdate, error) { -// return DisableRoute(tx, tt.routeID, tt.isConnected) -// }) - -// // if (err.Error() != "") != tt.wantErr { -// // t.Errorf("failoverRoute() error = %v, wantErr %v", err, tt.wantErr) - -// // return -// // } - -// if len(got.ChangeNodes) != 1 { -// t.Errorf("expected update with one machine, got %d", len(got.ChangeNodes)) -// } - -// if diff := cmp.Diff(tt.wantMachineKey, got.ChangeNodes[0].MachineKey, util.Comparers...); diff != "" { -// t.Errorf("DisableRoute() unexpected result (-want +got):\n%s", diff) -// } -// }) -// } -// } + if diff := cmp.Diff(want, got, cmps...); diff != "" { + t.Fatalf("failoverRoute unexpected result (-want +got):\n%s", diff) + } + } + }) + } +} diff --git a/hscontrol/grpcv1.go b/hscontrol/grpcv1.go index 379502c..d5a1854 100644 --- a/hscontrol/grpcv1.go +++ b/hscontrol/grpcv1.go @@ -222,7 +222,7 @@ func (api headscaleV1APIServer) GetNode( ctx context.Context, request *v1.GetNodeRequest, ) (*v1.GetNodeResponse, error) { - node, err := api.h.db.GetNodeByID(request.GetNodeId()) + node, err := api.h.db.GetNodeByID(types.NodeID(request.GetNodeId())) if err != nil { return nil, err } @@ -231,7 +231,7 @@ func (api headscaleV1APIServer) GetNode( // Populate the online field based on // currently connected nodes. - resp.Online = api.h.nodeNotifier.IsConnected(node.MachineKey) + resp.Online = api.h.nodeNotifier.IsConnected(node.ID) return &v1.GetNodeResponse{Node: resp}, nil } @@ -248,12 +248,12 @@ func (api headscaleV1APIServer) SetTags( } node, err := db.Write(api.h.db.DB, func(tx *gorm.DB) (*types.Node, error) { - err := db.SetTags(tx, request.GetNodeId(), request.GetTags()) + err := db.SetTags(tx, types.NodeID(request.GetNodeId()), request.GetTags()) if err != nil { return nil, err } - return db.GetNodeByID(tx, request.GetNodeId()) + return db.GetNodeByID(tx, types.NodeID(request.GetNodeId())) }) if err != nil { return &v1.SetTagsResponse{ @@ -261,15 +261,12 @@ func (api headscaleV1APIServer) SetTags( }, status.Error(codes.InvalidArgument, err.Error()) } - stateUpdate := types.StateUpdate{ + ctx = types.NotifyCtx(ctx, "cli-settags", node.Hostname) + api.h.nodeNotifier.NotifyWithIgnore(ctx, types.StateUpdate{ Type: types.StatePeerChanged, - ChangeNodes: types.Nodes{node}, + ChangeNodes: []types.NodeID{node.ID}, Message: "called from api.SetTags", - } - if stateUpdate.Valid() { - ctx := types.NotifyCtx(ctx, "cli-settags", node.Hostname) - api.h.nodeNotifier.NotifyWithIgnore(ctx, stateUpdate, node.MachineKey.String()) - } + }, node.ID) log.Trace(). Str("node", node.Hostname). @@ -296,12 +293,12 @@ func (api headscaleV1APIServer) DeleteNode( ctx context.Context, request *v1.DeleteNodeRequest, ) (*v1.DeleteNodeResponse, error) { - node, err := api.h.db.GetNodeByID(request.GetNodeId()) + node, err := api.h.db.GetNodeByID(types.NodeID(request.GetNodeId())) if err != nil { return nil, err } - err = api.h.db.DeleteNode( + changedNodes, err := api.h.db.DeleteNode( node, api.h.nodeNotifier.ConnectedMap(), ) @@ -309,13 +306,17 @@ func (api headscaleV1APIServer) DeleteNode( return nil, err } - stateUpdate := types.StateUpdate{ + ctx = types.NotifyCtx(ctx, "cli-deletenode", node.Hostname) + api.h.nodeNotifier.NotifyAll(ctx, types.StateUpdate{ Type: types.StatePeerRemoved, - Removed: []tailcfg.NodeID{tailcfg.NodeID(node.ID)}, - } - if stateUpdate.Valid() { - ctx := types.NotifyCtx(ctx, "cli-deletenode", node.Hostname) - api.h.nodeNotifier.NotifyAll(ctx, stateUpdate) + Removed: []types.NodeID{node.ID}, + }) + + if changedNodes != nil { + api.h.nodeNotifier.NotifyAll(ctx, types.StateUpdate{ + Type: types.StatePeerChanged, + ChangeNodes: changedNodes, + }) } return &v1.DeleteNodeResponse{}, nil @@ -330,33 +331,27 @@ func (api headscaleV1APIServer) ExpireNode( node, err := db.Write(api.h.db.DB, func(tx *gorm.DB) (*types.Node, error) { db.NodeSetExpiry( tx, - request.GetNodeId(), + types.NodeID(request.GetNodeId()), now, ) - return db.GetNodeByID(tx, request.GetNodeId()) + return db.GetNodeByID(tx, types.NodeID(request.GetNodeId())) }) if err != nil { return nil, err } - selfUpdate := types.StateUpdate{ - Type: types.StateSelfUpdate, - ChangeNodes: types.Nodes{node}, - } - if selfUpdate.Valid() { - ctx := types.NotifyCtx(ctx, "cli-expirenode-self", node.Hostname) - api.h.nodeNotifier.NotifyByMachineKey( - ctx, - selfUpdate, - node.MachineKey) - } + ctx = types.NotifyCtx(ctx, "cli-expirenode-self", node.Hostname) + api.h.nodeNotifier.NotifyByMachineKey( + ctx, + types.StateUpdate{ + Type: types.StateSelfUpdate, + ChangeNodes: []types.NodeID{node.ID}, + }, + node.ID) - stateUpdate := types.StateUpdateExpire(node.ID, now) - if stateUpdate.Valid() { - ctx := types.NotifyCtx(ctx, "cli-expirenode-peers", node.Hostname) - api.h.nodeNotifier.NotifyWithIgnore(ctx, stateUpdate, node.MachineKey.String()) - } + ctx = types.NotifyCtx(ctx, "cli-expirenode-peers", node.Hostname) + api.h.nodeNotifier.NotifyWithIgnore(ctx, types.StateUpdateExpire(node.ID, now), node.ID) log.Trace(). Str("node", node.Hostname). @@ -380,21 +375,18 @@ func (api headscaleV1APIServer) RenameNode( return nil, err } - return db.GetNodeByID(tx, request.GetNodeId()) + return db.GetNodeByID(tx, types.NodeID(request.GetNodeId())) }) if err != nil { return nil, err } - stateUpdate := types.StateUpdate{ + ctx = types.NotifyCtx(ctx, "cli-renamenode", node.Hostname) + api.h.nodeNotifier.NotifyWithIgnore(ctx, types.StateUpdate{ Type: types.StatePeerChanged, - ChangeNodes: types.Nodes{node}, + ChangeNodes: []types.NodeID{node.ID}, Message: "called from api.RenameNode", - } - if stateUpdate.Valid() { - ctx := types.NotifyCtx(ctx, "cli-renamenode", node.Hostname) - api.h.nodeNotifier.NotifyWithIgnore(ctx, stateUpdate, node.MachineKey.String()) - } + }, node.ID) log.Trace(). Str("node", node.Hostname). @@ -423,7 +415,7 @@ func (api headscaleV1APIServer) ListNodes( // Populate the online field based on // currently connected nodes. - resp.Online = isConnected[node.MachineKey] + resp.Online = isConnected[node.ID] response[index] = resp } @@ -446,7 +438,7 @@ func (api headscaleV1APIServer) ListNodes( // Populate the online field based on // currently connected nodes. - resp.Online = isConnected[node.MachineKey] + resp.Online = isConnected[node.ID] validTags, invalidTags := api.h.ACLPolicy.TagsOfNode( node, @@ -463,7 +455,7 @@ func (api headscaleV1APIServer) MoveNode( ctx context.Context, request *v1.MoveNodeRequest, ) (*v1.MoveNodeResponse, error) { - node, err := api.h.db.GetNodeByID(request.GetNodeId()) + node, err := api.h.db.GetNodeByID(types.NodeID(request.GetNodeId())) if err != nil { return nil, err } @@ -503,7 +495,7 @@ func (api headscaleV1APIServer) EnableRoute( return nil, err } - if update != nil && update.Valid() { + if update != nil { ctx := types.NotifyCtx(ctx, "cli-enableroute", "unknown") api.h.nodeNotifier.NotifyAll( ctx, *update) @@ -516,17 +508,19 @@ func (api headscaleV1APIServer) DisableRoute( ctx context.Context, request *v1.DisableRouteRequest, ) (*v1.DisableRouteResponse, error) { - isConnected := api.h.nodeNotifier.ConnectedMap() - update, err := db.Write(api.h.db.DB, func(tx *gorm.DB) (*types.StateUpdate, error) { - return db.DisableRoute(tx, request.GetRouteId(), isConnected) + update, err := db.Write(api.h.db.DB, func(tx *gorm.DB) ([]types.NodeID, error) { + return db.DisableRoute(tx, request.GetRouteId(), api.h.nodeNotifier.ConnectedMap()) }) if err != nil { return nil, err } - if update != nil && update.Valid() { + if update != nil { ctx := types.NotifyCtx(ctx, "cli-disableroute", "unknown") - api.h.nodeNotifier.NotifyAll(ctx, *update) + api.h.nodeNotifier.NotifyAll(ctx, types.StateUpdate{ + Type: types.StatePeerChanged, + ChangeNodes: update, + }) } return &v1.DisableRouteResponse{}, nil @@ -536,7 +530,7 @@ func (api headscaleV1APIServer) GetNodeRoutes( ctx context.Context, request *v1.GetNodeRoutesRequest, ) (*v1.GetNodeRoutesResponse, error) { - node, err := api.h.db.GetNodeByID(request.GetNodeId()) + node, err := api.h.db.GetNodeByID(types.NodeID(request.GetNodeId())) if err != nil { return nil, err } @@ -556,16 +550,19 @@ func (api headscaleV1APIServer) DeleteRoute( request *v1.DeleteRouteRequest, ) (*v1.DeleteRouteResponse, error) { isConnected := api.h.nodeNotifier.ConnectedMap() - update, err := db.Write(api.h.db.DB, func(tx *gorm.DB) (*types.StateUpdate, error) { + update, err := db.Write(api.h.db.DB, func(tx *gorm.DB) ([]types.NodeID, error) { return db.DeleteRoute(tx, request.GetRouteId(), isConnected) }) if err != nil { return nil, err } - if update != nil && update.Valid() { + if update != nil { ctx := types.NotifyCtx(ctx, "cli-deleteroute", "unknown") - api.h.nodeNotifier.NotifyWithIgnore(ctx, *update) + api.h.nodeNotifier.NotifyAll(ctx, types.StateUpdate{ + Type: types.StatePeerChanged, + ChangeNodes: update, + }) } return &v1.DeleteRouteResponse{}, nil diff --git a/hscontrol/handlers.go b/hscontrol/handlers.go index ee67073..a6bbd1b 100644 --- a/hscontrol/handlers.go +++ b/hscontrol/handlers.go @@ -68,12 +68,6 @@ func (h *Headscale) KeyHandler( Msg("could not get capability version") writer.Header().Set("Content-Type", "text/plain; charset=utf-8") writer.WriteHeader(http.StatusInternalServerError) - if err != nil { - log.Error(). - Caller(). - Err(err). - Msg("Failed to write response") - } return } @@ -82,19 +76,6 @@ func (h *Headscale) KeyHandler( Str("handler", "/key"). Int("cap_ver", int(capVer)). Msg("New noise client") - if err != nil { - writer.Header().Set("Content-Type", "text/plain; charset=utf-8") - writer.WriteHeader(http.StatusBadRequest) - _, err := writer.Write([]byte("Wrong params")) - if err != nil { - log.Error(). - Caller(). - Err(err). - Msg("Failed to write response") - } - - return - } // TS2021 (Tailscale v2 protocol) requires to have a different key if capVer >= NoiseCapabilityVersion { diff --git a/hscontrol/mapper/mapper.go b/hscontrol/mapper/mapper.go index df0f4d9..3a92cae 100644 --- a/hscontrol/mapper/mapper.go +++ b/hscontrol/mapper/mapper.go @@ -16,12 +16,12 @@ import ( "time" mapset "github.com/deckarep/golang-set/v2" + "github.com/juanfont/headscale/hscontrol/db" "github.com/juanfont/headscale/hscontrol/policy" "github.com/juanfont/headscale/hscontrol/types" "github.com/juanfont/headscale/hscontrol/util" "github.com/klauspost/compress/zstd" "github.com/rs/zerolog/log" - "golang.org/x/exp/maps" "tailscale.com/envknob" "tailscale.com/smallzstd" "tailscale.com/tailcfg" @@ -51,21 +51,14 @@ var debugDumpMapResponsePath = envknob.String("HEADSCALE_DEBUG_DUMP_MAPRESPONSE_ type Mapper struct { // Configuration // TODO(kradalby): figure out if this is the format we want this in - derpMap *tailcfg.DERPMap - baseDomain string - dnsCfg *tailcfg.DNSConfig - logtail bool - randomClientPort bool + db *db.HSDatabase + cfg *types.Config + derpMap *tailcfg.DERPMap + isLikelyConnected types.NodeConnectedMap uid string created time.Time seq uint64 - - // Map isnt concurrency safe, so we need to ensure - // only one func is accessing it over time. - mu sync.Mutex - peers map[uint64]*types.Node - patches map[uint64][]patch } type patch struct { @@ -74,35 +67,22 @@ type patch struct { } func NewMapper( - node *types.Node, - peers types.Nodes, + db *db.HSDatabase, + cfg *types.Config, derpMap *tailcfg.DERPMap, - baseDomain string, - dnsCfg *tailcfg.DNSConfig, - logtail bool, - randomClientPort bool, + isLikelyConnected types.NodeConnectedMap, ) *Mapper { - log.Debug(). - Caller(). - Str("node", node.Hostname). - Msg("creating new mapper") - uid, _ := util.GenerateRandomStringDNSSafe(mapperIDLength) return &Mapper{ - derpMap: derpMap, - baseDomain: baseDomain, - dnsCfg: dnsCfg, - logtail: logtail, - randomClientPort: randomClientPort, + db: db, + cfg: cfg, + derpMap: derpMap, + isLikelyConnected: isLikelyConnected, uid: uid, created: time.Now(), seq: 0, - - // TODO: populate - peers: peers.IDMap(), - patches: make(map[uint64][]patch), } } @@ -207,11 +187,10 @@ func addNextDNSMetadata(resolvers []*dnstype.Resolver, node *types.Node) { // It is a separate function to make testing easier. func (m *Mapper) fullMapResponse( node *types.Node, + peers types.Nodes, pol *policy.ACLPolicy, capVer tailcfg.CapabilityVersion, ) (*tailcfg.MapResponse, error) { - peers := nodeMapToList(m.peers) - resp, err := m.baseWithConfigMapResponse(node, pol, capVer) if err != nil { return nil, err @@ -219,14 +198,13 @@ func (m *Mapper) fullMapResponse( err = appendPeerChanges( resp, + true, // full change pol, node, capVer, peers, peers, - m.baseDomain, - m.dnsCfg, - m.randomClientPort, + m.cfg, ) if err != nil { return nil, err @@ -240,35 +218,25 @@ func (m *Mapper) FullMapResponse( mapRequest tailcfg.MapRequest, node *types.Node, pol *policy.ACLPolicy, + messages ...string, ) ([]byte, error) { - m.mu.Lock() - defer m.mu.Unlock() - - peers := maps.Keys(m.peers) - peersWithPatches := maps.Keys(m.patches) - slices.Sort(peers) - slices.Sort(peersWithPatches) - - if len(peersWithPatches) > 0 { - log.Debug(). - Str("node", node.Hostname). - Uints64("peers", peers). - Uints64("pending_patches", peersWithPatches). - Msgf("node requested full map response, but has pending patches") - } - - resp, err := m.fullMapResponse(node, pol, mapRequest.Version) + peers, err := m.ListPeers(node.ID) if err != nil { return nil, err } - return m.marshalMapResponse(mapRequest, resp, node, mapRequest.Compress) + resp, err := m.fullMapResponse(node, peers, pol, mapRequest.Version) + if err != nil { + return nil, err + } + + return m.marshalMapResponse(mapRequest, resp, node, mapRequest.Compress, messages...) } -// LiteMapResponse returns a MapResponse for the given node. +// ReadOnlyResponse returns a MapResponse for the given node. // Lite means that the peers has been omitted, this is intended // to be used to answer MapRequests with OmitPeers set to true. -func (m *Mapper) LiteMapResponse( +func (m *Mapper) ReadOnlyMapResponse( mapRequest tailcfg.MapRequest, node *types.Node, pol *policy.ACLPolicy, @@ -279,18 +247,6 @@ func (m *Mapper) LiteMapResponse( return nil, err } - rules, sshPolicy, err := policy.GenerateFilterAndSSHRules( - pol, - node, - nodeMapToList(m.peers), - ) - if err != nil { - return nil, err - } - - resp.PacketFilter = policy.ReduceFilterRules(node, rules) - resp.SSHPolicy = sshPolicy - return m.marshalMapResponse(mapRequest, resp, node, mapRequest.Compress, messages...) } @@ -320,50 +276,74 @@ func (m *Mapper) DERPMapResponse( func (m *Mapper) PeerChangedResponse( mapRequest tailcfg.MapRequest, node *types.Node, - changed types.Nodes, + changed map[types.NodeID]bool, + patches []*tailcfg.PeerChange, pol *policy.ACLPolicy, messages ...string, ) ([]byte, error) { - m.mu.Lock() - defer m.mu.Unlock() - - // Update our internal map. - for _, node := range changed { - if patches, ok := m.patches[node.ID]; ok { - // preserve online status in case the patch has an outdated one - online := node.IsOnline - - for _, p := range patches { - // TODO(kradalby): Figure if this needs to be sorted by timestamp - node.ApplyPeerChange(p.change) - } - - // Ensure the patches are not applied again later - delete(m.patches, node.ID) - - node.IsOnline = online - } - - m.peers[node.ID] = node - } - resp := m.baseMapResponse() - err := appendPeerChanges( + peers, err := m.ListPeers(node.ID) + if err != nil { + return nil, err + } + + var removedIDs []tailcfg.NodeID + var changedIDs []types.NodeID + for nodeID, nodeChanged := range changed { + if nodeChanged { + changedIDs = append(changedIDs, nodeID) + } else { + removedIDs = append(removedIDs, nodeID.NodeID()) + } + } + + changedNodes := make(types.Nodes, 0, len(changedIDs)) + for _, peer := range peers { + if slices.Contains(changedIDs, peer.ID) { + changedNodes = append(changedNodes, peer) + } + } + + err = appendPeerChanges( &resp, + false, // partial change pol, node, mapRequest.Version, - nodeMapToList(m.peers), - changed, - m.baseDomain, - m.dnsCfg, - m.randomClientPort, + peers, + changedNodes, + m.cfg, ) if err != nil { return nil, err } + resp.PeersRemoved = removedIDs + + // Sending patches as a part of a PeersChanged response + // is technically not suppose to be done, but they are + // applied after the PeersChanged. The patch list + // should _only_ contain Nodes that are not in the + // PeersChanged or PeersRemoved list and the caller + // should filter them out. + // + // From tailcfg docs: + // These are applied after Peers* above, but in practice the + // control server should only send these on their own, without + // the Peers* fields also set. + if patches != nil { + resp.PeersChangedPatch = patches + } + + // Add the node itself, it might have changed, and particularly + // if there are no patches or changes, this is a self update. + tailnode, err := tailNode(node, mapRequest.Version, pol, m.cfg) + if err != nil { + return nil, err + } + resp.Node = tailnode + return m.marshalMapResponse(mapRequest, &resp, node, mapRequest.Compress, messages...) } @@ -375,71 +355,12 @@ func (m *Mapper) PeerChangedPatchResponse( changed []*tailcfg.PeerChange, pol *policy.ACLPolicy, ) ([]byte, error) { - m.mu.Lock() - defer m.mu.Unlock() - - sendUpdate := false - // patch the internal map - for _, change := range changed { - if peer, ok := m.peers[uint64(change.NodeID)]; ok { - peer.ApplyPeerChange(change) - sendUpdate = true - } else { - log.Trace().Str("node", node.Hostname).Msgf("Node with ID %s is missing from mapper for Node %s, saving patch for when node is available", change.NodeID, node.Hostname) - - p := patch{ - timestamp: time.Now(), - change: change, - } - - if patches, ok := m.patches[uint64(change.NodeID)]; ok { - m.patches[uint64(change.NodeID)] = append(patches, p) - } else { - m.patches[uint64(change.NodeID)] = []patch{p} - } - } - } - - if !sendUpdate { - return nil, nil - } - resp := m.baseMapResponse() resp.PeersChangedPatch = changed return m.marshalMapResponse(mapRequest, &resp, node, mapRequest.Compress) } -// TODO(kradalby): We need some integration tests for this. -func (m *Mapper) PeerRemovedResponse( - mapRequest tailcfg.MapRequest, - node *types.Node, - removed []tailcfg.NodeID, -) ([]byte, error) { - m.mu.Lock() - defer m.mu.Unlock() - - // Some nodes might have been removed already - // so we dont want to ask downstream to remove - // twice, than can cause a panic in tailscaled. - notYetRemoved := []tailcfg.NodeID{} - - // remove from our internal map - for _, id := range removed { - if _, ok := m.peers[uint64(id)]; ok { - notYetRemoved = append(notYetRemoved, id) - } - - delete(m.peers, uint64(id)) - delete(m.patches, uint64(id)) - } - - resp := m.baseMapResponse() - resp.PeersRemoved = notYetRemoved - - return m.marshalMapResponse(mapRequest, &resp, node, mapRequest.Compress) -} - func (m *Mapper) marshalMapResponse( mapRequest tailcfg.MapRequest, resp *tailcfg.MapResponse, @@ -469,10 +390,8 @@ func (m *Mapper) marshalMapResponse( switch { case resp.Peers != nil && len(resp.Peers) > 0: responseType = "full" - case isSelfUpdate(messages...): + case resp.Peers == nil && resp.PeersChanged == nil && resp.PeersChangedPatch == nil && resp.DERPMap == nil && !resp.KeepAlive: responseType = "self" - case resp.Peers == nil && resp.PeersChanged == nil && resp.PeersChangedPatch == nil: - responseType = "lite" case resp.PeersChanged != nil && len(resp.PeersChanged) > 0: responseType = "changed" case resp.PeersChangedPatch != nil && len(resp.PeersChangedPatch) > 0: @@ -496,11 +415,11 @@ func (m *Mapper) marshalMapResponse( panic(err) } - now := time.Now().UnixNano() + now := time.Now().Format("2006-01-02T15-04-05.999999999") mapResponsePath := path.Join( mPath, - fmt.Sprintf("%d-%s-%d-%s.json", now, m.uid, atomic.LoadUint64(&m.seq), responseType), + fmt.Sprintf("%s-%s-%d-%s.json", now, m.uid, atomic.LoadUint64(&m.seq), responseType), ) log.Trace().Msgf("Writing MapResponse to %s", mapResponsePath) @@ -574,7 +493,7 @@ func (m *Mapper) baseWithConfigMapResponse( ) (*tailcfg.MapResponse, error) { resp := m.baseMapResponse() - tailnode, err := tailNode(node, capVer, pol, m.dnsCfg, m.baseDomain, m.randomClientPort) + tailnode, err := tailNode(node, capVer, pol, m.cfg) if err != nil { return nil, err } @@ -582,7 +501,7 @@ func (m *Mapper) baseWithConfigMapResponse( resp.DERPMap = m.derpMap - resp.Domain = m.baseDomain + resp.Domain = m.cfg.BaseDomain // Do not instruct clients to collect services we do not // support or do anything with them @@ -591,12 +510,26 @@ func (m *Mapper) baseWithConfigMapResponse( resp.KeepAlive = false resp.Debug = &tailcfg.Debug{ - DisableLogTail: !m.logtail, + DisableLogTail: !m.cfg.LogTail.Enabled, } return &resp, nil } +func (m *Mapper) ListPeers(nodeID types.NodeID) (types.Nodes, error) { + peers, err := m.db.ListPeers(nodeID) + if err != nil { + return nil, err + } + + for _, peer := range peers { + online := m.isLikelyConnected[peer.ID] + peer.IsOnline = &online + } + + return peers, nil +} + func nodeMapToList(nodes map[uint64]*types.Node) types.Nodes { ret := make(types.Nodes, 0) @@ -612,42 +545,41 @@ func nodeMapToList(nodes map[uint64]*types.Node) types.Nodes { func appendPeerChanges( resp *tailcfg.MapResponse, + fullChange bool, pol *policy.ACLPolicy, node *types.Node, capVer tailcfg.CapabilityVersion, peers types.Nodes, changed types.Nodes, - baseDomain string, - dnsCfg *tailcfg.DNSConfig, - randomClientPort bool, + cfg *types.Config, ) error { - fullChange := len(peers) == len(changed) - rules, sshPolicy, err := policy.GenerateFilterAndSSHRules( - pol, - node, - peers, - ) + packetFilter, err := pol.CompileFilterRules(append(peers, node)) + if err != nil { + return err + } + + sshPolicy, err := pol.CompileSSHPolicy(node, peers) if err != nil { return err } // If there are filter rules present, see if there are any nodes that cannot // access eachother at all and remove them from the peers. - if len(rules) > 0 { - changed = policy.FilterNodesByACL(node, changed, rules) + if len(packetFilter) > 0 { + changed = policy.FilterNodesByACL(node, changed, packetFilter) } - profiles := generateUserProfiles(node, changed, baseDomain) + profiles := generateUserProfiles(node, changed, cfg.BaseDomain) dnsConfig := generateDNSConfig( - dnsCfg, - baseDomain, + cfg.DNSConfig, + cfg.BaseDomain, node, peers, ) - tailPeers, err := tailNodes(changed, capVer, pol, dnsCfg, baseDomain, randomClientPort) + tailPeers, err := tailNodes(changed, capVer, pol, cfg) if err != nil { return err } @@ -663,19 +595,9 @@ func appendPeerChanges( resp.PeersChanged = tailPeers } resp.DNSConfig = dnsConfig - resp.PacketFilter = policy.ReduceFilterRules(node, rules) + resp.PacketFilter = policy.ReduceFilterRules(node, packetFilter) resp.UserProfiles = profiles resp.SSHPolicy = sshPolicy return nil } - -func isSelfUpdate(messages ...string) bool { - for _, message := range messages { - if strings.Contains(message, types.SelfUpdateIdentifier) { - return true - } - } - - return false -} diff --git a/hscontrol/mapper/mapper_test.go b/hscontrol/mapper/mapper_test.go index bcc17dd..3f4d689 100644 --- a/hscontrol/mapper/mapper_test.go +++ b/hscontrol/mapper/mapper_test.go @@ -331,13 +331,10 @@ func Test_fullMapResponse(t *testing.T) { node *types.Node peers types.Nodes - baseDomain string - dnsConfig *tailcfg.DNSConfig - derpMap *tailcfg.DERPMap - logtail bool - randomClientPort bool - want *tailcfg.MapResponse - wantErr bool + derpMap *tailcfg.DERPMap + cfg *types.Config + want *tailcfg.MapResponse + wantErr bool }{ // { // name: "empty-node", @@ -349,15 +346,17 @@ func Test_fullMapResponse(t *testing.T) { // wantErr: true, // }, { - name: "no-pol-no-peers-map-response", - pol: &policy.ACLPolicy{}, - node: mini, - peers: types.Nodes{}, - baseDomain: "", - dnsConfig: &tailcfg.DNSConfig{}, - derpMap: &tailcfg.DERPMap{}, - logtail: false, - randomClientPort: false, + name: "no-pol-no-peers-map-response", + pol: &policy.ACLPolicy{}, + node: mini, + peers: types.Nodes{}, + derpMap: &tailcfg.DERPMap{}, + cfg: &types.Config{ + BaseDomain: "", + DNSConfig: &tailcfg.DNSConfig{}, + LogTail: types.LogTailConfig{Enabled: false}, + RandomizeClientPort: false, + }, want: &tailcfg.MapResponse{ Node: tailMini, KeepAlive: false, @@ -383,11 +382,13 @@ func Test_fullMapResponse(t *testing.T) { peers: types.Nodes{ peer1, }, - baseDomain: "", - dnsConfig: &tailcfg.DNSConfig{}, - derpMap: &tailcfg.DERPMap{}, - logtail: false, - randomClientPort: false, + derpMap: &tailcfg.DERPMap{}, + cfg: &types.Config{ + BaseDomain: "", + DNSConfig: &tailcfg.DNSConfig{}, + LogTail: types.LogTailConfig{Enabled: false}, + RandomizeClientPort: false, + }, want: &tailcfg.MapResponse{ KeepAlive: false, Node: tailMini, @@ -424,11 +425,13 @@ func Test_fullMapResponse(t *testing.T) { peer1, peer2, }, - baseDomain: "", - dnsConfig: &tailcfg.DNSConfig{}, - derpMap: &tailcfg.DERPMap{}, - logtail: false, - randomClientPort: false, + derpMap: &tailcfg.DERPMap{}, + cfg: &types.Config{ + BaseDomain: "", + DNSConfig: &tailcfg.DNSConfig{}, + LogTail: types.LogTailConfig{Enabled: false}, + RandomizeClientPort: false, + }, want: &tailcfg.MapResponse{ KeepAlive: false, Node: tailMini, @@ -463,17 +466,15 @@ func Test_fullMapResponse(t *testing.T) { for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { mappy := NewMapper( - tt.node, - tt.peers, + nil, + tt.cfg, tt.derpMap, - tt.baseDomain, - tt.dnsConfig, - tt.logtail, - tt.randomClientPort, + nil, ) got, err := mappy.fullMapResponse( tt.node, + tt.peers, tt.pol, 0, ) diff --git a/hscontrol/mapper/tail.go b/hscontrol/mapper/tail.go index c10da4d..97d12e8 100644 --- a/hscontrol/mapper/tail.go +++ b/hscontrol/mapper/tail.go @@ -3,12 +3,10 @@ package mapper import ( "fmt" "net/netip" - "strconv" "time" "github.com/juanfont/headscale/hscontrol/policy" "github.com/juanfont/headscale/hscontrol/types" - "github.com/juanfont/headscale/hscontrol/util" "github.com/samber/lo" "tailscale.com/tailcfg" ) @@ -17,9 +15,7 @@ func tailNodes( nodes types.Nodes, capVer tailcfg.CapabilityVersion, pol *policy.ACLPolicy, - dnsConfig *tailcfg.DNSConfig, - baseDomain string, - randomClientPort bool, + cfg *types.Config, ) ([]*tailcfg.Node, error) { tNodes := make([]*tailcfg.Node, len(nodes)) @@ -28,9 +24,7 @@ func tailNodes( node, capVer, pol, - dnsConfig, - baseDomain, - randomClientPort, + cfg, ) if err != nil { return nil, err @@ -48,9 +42,7 @@ func tailNode( node *types.Node, capVer tailcfg.CapabilityVersion, pol *policy.ACLPolicy, - dnsConfig *tailcfg.DNSConfig, - baseDomain string, - randomClientPort bool, + cfg *types.Config, ) (*tailcfg.Node, error) { addrs := node.IPAddresses.Prefixes() @@ -85,7 +77,7 @@ func tailNode( keyExpiry = time.Time{} } - hostname, err := node.GetFQDN(dnsConfig, baseDomain) + hostname, err := node.GetFQDN(cfg.DNSConfig, cfg.BaseDomain) if err != nil { return nil, fmt.Errorf("tailNode, failed to create FQDN: %s", err) } @@ -94,12 +86,10 @@ func tailNode( tags = lo.Uniq(append(tags, node.ForcedTags...)) tNode := tailcfg.Node{ - ID: tailcfg.NodeID(node.ID), // this is the actual ID - StableID: tailcfg.StableNodeID( - strconv.FormatUint(node.ID, util.Base10), - ), // in headscale, unlike tailcontrol server, IDs are permanent - Name: hostname, - Cap: capVer, + ID: tailcfg.NodeID(node.ID), // this is the actual ID + StableID: node.ID.StableID(), + Name: hostname, + Cap: capVer, User: tailcfg.UserID(node.UserID), @@ -133,7 +123,7 @@ func tailNode( tailcfg.CapabilitySSH: []tailcfg.RawMessage{}, } - if randomClientPort { + if cfg.RandomizeClientPort { tNode.CapMap[tailcfg.NodeAttrRandomizeClientPort] = []tailcfg.RawMessage{} } } else { @@ -143,7 +133,7 @@ func tailNode( tailcfg.CapabilitySSH, } - if randomClientPort { + if cfg.RandomizeClientPort { tNode.Capabilities = append(tNode.Capabilities, tailcfg.NodeAttrRandomizeClientPort) } } diff --git a/hscontrol/mapper/tail_test.go b/hscontrol/mapper/tail_test.go index f6e370c..e79d9dc 100644 --- a/hscontrol/mapper/tail_test.go +++ b/hscontrol/mapper/tail_test.go @@ -182,13 +182,16 @@ func TestTailNode(t *testing.T) { for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { + cfg := &types.Config{ + BaseDomain: tt.baseDomain, + DNSConfig: tt.dnsConfig, + RandomizeClientPort: false, + } got, err := tailNode( tt.node, 0, tt.pol, - tt.dnsConfig, - tt.baseDomain, - false, + cfg, ) if (err != nil) != tt.wantErr { diff --git a/hscontrol/noise.go b/hscontrol/noise.go index 0fa28d1..3debd37 100644 --- a/hscontrol/noise.go +++ b/hscontrol/noise.go @@ -3,6 +3,7 @@ package hscontrol import ( "encoding/binary" "encoding/json" + "errors" "io" "net/http" @@ -11,6 +12,7 @@ import ( "github.com/rs/zerolog/log" "golang.org/x/net/http2" "golang.org/x/net/http2/h2c" + "gorm.io/gorm" "tailscale.com/control/controlbase" "tailscale.com/control/controlhttp" "tailscale.com/tailcfg" @@ -163,3 +165,135 @@ func (ns *noiseServer) earlyNoise(protocolVersion int, writer io.Writer) error { return nil } + +const ( + MinimumCapVersion tailcfg.CapabilityVersion = 58 +) + +// NoisePollNetMapHandler takes care of /machine/:id/map using the Noise protocol +// +// This is the busiest endpoint, as it keeps the HTTP long poll that updates +// the clients when something in the network changes. +// +// The clients POST stuff like HostInfo and their Endpoints here, but +// only after their first request (marked with the ReadOnly field). +// +// At this moment the updates are sent in a quite horrendous way, but they kinda work. +func (ns *noiseServer) NoisePollNetMapHandler( + writer http.ResponseWriter, + req *http.Request, +) { + log.Trace(). + Str("handler", "NoisePollNetMap"). + Msg("PollNetMapHandler called") + + log.Trace(). + Any("headers", req.Header). + Caller(). + Msg("Headers") + + body, _ := io.ReadAll(req.Body) + + mapRequest := tailcfg.MapRequest{} + if err := json.Unmarshal(body, &mapRequest); err != nil { + log.Error(). + Caller(). + Err(err). + Msg("Cannot parse MapRequest") + http.Error(writer, "Internal error", http.StatusInternalServerError) + + return + } + + // Reject unsupported versions + if mapRequest.Version < MinimumCapVersion { + log.Info(). + Caller(). + Int("min_version", int(MinimumCapVersion)). + Int("client_version", int(mapRequest.Version)). + Msg("unsupported client connected") + http.Error(writer, "Internal error", http.StatusBadRequest) + + return + } + + ns.nodeKey = mapRequest.NodeKey + + node, err := ns.headscale.db.GetNodeByAnyKey( + ns.conn.Peer(), + mapRequest.NodeKey, + key.NodePublic{}, + ) + if err != nil { + if errors.Is(err, gorm.ErrRecordNotFound) { + log.Warn(). + Str("handler", "NoisePollNetMap"). + Uint64("node.id", node.ID.Uint64()). + Msgf("Ignoring request, cannot find node with key %s", mapRequest.NodeKey.String()) + http.Error(writer, "Internal error", http.StatusNotFound) + + return + } + log.Error(). + Str("handler", "NoisePollNetMap"). + Uint64("node.id", node.ID.Uint64()). + Msgf("Failed to fetch node from the database with node key: %s", mapRequest.NodeKey.String()) + http.Error(writer, "Internal error", http.StatusInternalServerError) + + return + } + log.Debug(). + Str("handler", "NoisePollNetMap"). + Str("node", node.Hostname). + Int("cap_ver", int(mapRequest.Version)). + Uint64("node.id", node.ID.Uint64()). + Msg("A node sending a MapRequest with Noise protocol") + + session := ns.headscale.newMapSession(req.Context(), mapRequest, writer, node) + + // If a streaming mapSession exists for this node, close it + // and start a new one. + if session.isStreaming() { + log.Debug(). + Caller(). + Uint64("node.id", node.ID.Uint64()). + Int("cap_ver", int(mapRequest.Version)). + Msg("Aquiring lock to check stream") + ns.headscale.mapSessionMu.Lock() + if oldSession, ok := ns.headscale.mapSessions[node.ID]; ok { + log.Info(). + Caller(). + Uint64("node.id", node.ID.Uint64()). + Msg("Node has an open streaming session, replacing") + oldSession.close() + } + + ns.headscale.mapSessions[node.ID] = session + ns.headscale.mapSessionMu.Unlock() + log.Debug(). + Caller(). + Uint64("node.id", node.ID.Uint64()). + Int("cap_ver", int(mapRequest.Version)). + Msg("Releasing lock to check stream") + } + + session.serve() + + if session.isStreaming() { + log.Debug(). + Caller(). + Uint64("node.id", node.ID.Uint64()). + Int("cap_ver", int(mapRequest.Version)). + Msg("Aquiring lock to remove stream") + ns.headscale.mapSessionMu.Lock() + + delete(ns.headscale.mapSessions, node.ID) + + ns.headscale.mapSessionMu.Unlock() + log.Debug(). + Caller(). + Uint64("node.id", node.ID.Uint64()). + Int("cap_ver", int(mapRequest.Version)). + Msg("Releasing lock to remove stream") + } +} diff --git a/hscontrol/notifier/notifier.go b/hscontrol/notifier/notifier.go index 2384a40..4ead615 100644 --- a/hscontrol/notifier/notifier.go +++ b/hscontrol/notifier/notifier.go @@ -3,52 +3,51 @@ package notifier import ( "context" "fmt" + "slices" "strings" "sync" "github.com/juanfont/headscale/hscontrol/types" - "github.com/juanfont/headscale/hscontrol/util" "github.com/rs/zerolog/log" - "tailscale.com/types/key" ) type Notifier struct { l sync.RWMutex - nodes map[string]chan<- types.StateUpdate - connected map[key.MachinePublic]bool + nodes map[types.NodeID]chan<- types.StateUpdate + connected types.NodeConnectedMap } func NewNotifier() *Notifier { return &Notifier{ - nodes: make(map[string]chan<- types.StateUpdate), - connected: make(map[key.MachinePublic]bool), + nodes: make(map[types.NodeID]chan<- types.StateUpdate), + connected: make(types.NodeConnectedMap), } } -func (n *Notifier) AddNode(machineKey key.MachinePublic, c chan<- types.StateUpdate) { - log.Trace().Caller().Str("key", machineKey.ShortString()).Msg("acquiring lock to add node") +func (n *Notifier) AddNode(nodeID types.NodeID, c chan<- types.StateUpdate) { + log.Trace().Caller().Uint64("node.id", nodeID.Uint64()).Msg("acquiring lock to add node") defer log.Trace(). Caller(). - Str("key", machineKey.ShortString()). + Uint64("node.id", nodeID.Uint64()). Msg("releasing lock to add node") n.l.Lock() defer n.l.Unlock() - n.nodes[machineKey.String()] = c - n.connected[machineKey] = true + n.nodes[nodeID] = c + n.connected[nodeID] = true log.Trace(). - Str("machine_key", machineKey.ShortString()). + Uint64("node.id", nodeID.Uint64()). Int("open_chans", len(n.nodes)). Msg("Added new channel") } -func (n *Notifier) RemoveNode(machineKey key.MachinePublic) { - log.Trace().Caller().Str("key", machineKey.ShortString()).Msg("acquiring lock to remove node") +func (n *Notifier) RemoveNode(nodeID types.NodeID) { + log.Trace().Caller().Uint64("node.id", nodeID.Uint64()).Msg("acquiring lock to remove node") defer log.Trace(). Caller(). - Str("key", machineKey.ShortString()). + Uint64("node.id", nodeID.Uint64()). Msg("releasing lock to remove node") n.l.Lock() @@ -58,26 +57,32 @@ func (n *Notifier) RemoveNode(machineKey key.MachinePublic) { return } - delete(n.nodes, machineKey.String()) - n.connected[machineKey] = false + delete(n.nodes, nodeID) + n.connected[nodeID] = false log.Trace(). - Str("machine_key", machineKey.ShortString()). + Uint64("node.id", nodeID.Uint64()). Int("open_chans", len(n.nodes)). Msg("Removed channel") } // IsConnected reports if a node is connected to headscale and has a // poll session open. -func (n *Notifier) IsConnected(machineKey key.MachinePublic) bool { +func (n *Notifier) IsConnected(nodeID types.NodeID) bool { n.l.RLock() defer n.l.RUnlock() - return n.connected[machineKey] + return n.connected[nodeID] +} + +// IsLikelyConnected reports if a node is connected to headscale and has a +// poll session open, but doesnt lock, so might be wrong. +func (n *Notifier) IsLikelyConnected(nodeID types.NodeID) bool { + return n.connected[nodeID] } // TODO(kradalby): This returns a pointer and can be dangerous. -func (n *Notifier) ConnectedMap() map[key.MachinePublic]bool { +func (n *Notifier) ConnectedMap() types.NodeConnectedMap { return n.connected } @@ -88,19 +93,23 @@ func (n *Notifier) NotifyAll(ctx context.Context, update types.StateUpdate) { func (n *Notifier) NotifyWithIgnore( ctx context.Context, update types.StateUpdate, - ignore ...string, + ignoreNodeIDs ...types.NodeID, ) { - log.Trace().Caller().Interface("type", update.Type).Msg("acquiring lock to notify") + log.Trace().Caller().Str("type", update.Type.String()).Msg("acquiring lock to notify") defer log.Trace(). Caller(). - Interface("type", update.Type). + Str("type", update.Type.String()). Msg("releasing lock, finished notifying") n.l.RLock() defer n.l.RUnlock() - for key, c := range n.nodes { - if util.IsStringInSlice(ignore, key) { + if update.Type == types.StatePeerChangedPatch { + log.Trace().Interface("update", update).Interface("online", n.connected).Msg("PATCH UPDATE SENT") + } + + for nodeID, c := range n.nodes { + if slices.Contains(ignoreNodeIDs, nodeID) { continue } @@ -108,17 +117,17 @@ func (n *Notifier) NotifyWithIgnore( case <-ctx.Done(): log.Error(). Err(ctx.Err()). - Str("mkey", key). + Uint64("node.id", nodeID.Uint64()). Any("origin", ctx.Value("origin")). - Any("hostname", ctx.Value("hostname")). + Any("origin-hostname", ctx.Value("hostname")). Msgf("update not sent, context cancelled") return case c <- update: log.Trace(). - Str("mkey", key). + Uint64("node.id", nodeID.Uint64()). Any("origin", ctx.Value("origin")). - Any("hostname", ctx.Value("hostname")). + Any("origin-hostname", ctx.Value("hostname")). Msgf("update successfully sent on chan") } } @@ -127,33 +136,33 @@ func (n *Notifier) NotifyWithIgnore( func (n *Notifier) NotifyByMachineKey( ctx context.Context, update types.StateUpdate, - mKey key.MachinePublic, + nodeID types.NodeID, ) { - log.Trace().Caller().Interface("type", update.Type).Msg("acquiring lock to notify") + log.Trace().Caller().Str("type", update.Type.String()).Msg("acquiring lock to notify") defer log.Trace(). Caller(). - Interface("type", update.Type). + Str("type", update.Type.String()). Msg("releasing lock, finished notifying") n.l.RLock() defer n.l.RUnlock() - if c, ok := n.nodes[mKey.String()]; ok { + if c, ok := n.nodes[nodeID]; ok { select { case <-ctx.Done(): log.Error(). Err(ctx.Err()). - Str("mkey", mKey.String()). + Uint64("node.id", nodeID.Uint64()). Any("origin", ctx.Value("origin")). - Any("hostname", ctx.Value("hostname")). + Any("origin-hostname", ctx.Value("hostname")). Msgf("update not sent, context cancelled") return case c <- update: log.Trace(). - Str("mkey", mKey.String()). + Uint64("node.id", nodeID.Uint64()). Any("origin", ctx.Value("origin")). - Any("hostname", ctx.Value("hostname")). + Any("origin-hostname", ctx.Value("hostname")). Msgf("update successfully sent on chan") } } @@ -166,7 +175,7 @@ func (n *Notifier) String() string { str := []string{"Notifier, in map:\n"} for k, v := range n.nodes { - str = append(str, fmt.Sprintf("\t%s: %v\n", k, v)) + str = append(str, fmt.Sprintf("\t%d: %v\n", k, v)) } return strings.Join(str, "") diff --git a/hscontrol/oidc.go b/hscontrol/oidc.go index 318aada..d669a92 100644 --- a/hscontrol/oidc.go +++ b/hscontrol/oidc.go @@ -537,11 +537,8 @@ func (h *Headscale) validateNodeForOIDCCallback( util.LogErr(err, "Failed to write response") } - stateUpdate := types.StateUpdateExpire(node.ID, expiry) - if stateUpdate.Valid() { - ctx := types.NotifyCtx(context.Background(), "oidc-expiry", "na") - h.nodeNotifier.NotifyWithIgnore(ctx, stateUpdate, node.MachineKey.String()) - } + ctx := types.NotifyCtx(context.Background(), "oidc-expiry", "na") + h.nodeNotifier.NotifyWithIgnore(ctx, types.StateUpdateExpire(node.ID, expiry), node.ID) return nil, true, nil } diff --git a/hscontrol/policy/acls.go b/hscontrol/policy/acls.go index 2ccc56b..b409578 100644 --- a/hscontrol/policy/acls.go +++ b/hscontrol/policy/acls.go @@ -114,7 +114,7 @@ func LoadACLPolicyFromBytes(acl []byte, format string) (*ACLPolicy, error) { return &policy, nil } -func GenerateFilterAndSSHRules( +func GenerateFilterAndSSHRulesForTests( policy *ACLPolicy, node *types.Node, peers types.Nodes, @@ -124,40 +124,31 @@ func GenerateFilterAndSSHRules( return tailcfg.FilterAllowAll, &tailcfg.SSHPolicy{}, nil } - rules, err := policy.generateFilterRules(node, peers) + rules, err := policy.CompileFilterRules(append(peers, node)) if err != nil { return []tailcfg.FilterRule{}, &tailcfg.SSHPolicy{}, err } log.Trace().Interface("ACL", rules).Str("node", node.GivenName).Msg("ACL rules") - var sshPolicy *tailcfg.SSHPolicy - sshRules, err := policy.generateSSHRules(node, peers) + sshPolicy, err := policy.CompileSSHPolicy(node, peers) if err != nil { return []tailcfg.FilterRule{}, &tailcfg.SSHPolicy{}, err } - log.Trace(). - Interface("SSH", sshRules). - Str("node", node.GivenName). - Msg("SSH rules") - - if sshPolicy == nil { - sshPolicy = &tailcfg.SSHPolicy{} - } - sshPolicy.Rules = sshRules - return rules, sshPolicy, nil } -// generateFilterRules takes a set of nodes and an ACLPolicy and generates a +// CompileFilterRules takes a set of nodes and an ACLPolicy and generates a // set of Tailscale compatible FilterRules used to allow traffic on clients. -func (pol *ACLPolicy) generateFilterRules( - node *types.Node, - peers types.Nodes, +func (pol *ACLPolicy) CompileFilterRules( + nodes types.Nodes, ) ([]tailcfg.FilterRule, error) { + if pol == nil { + return tailcfg.FilterAllowAll, nil + } + rules := []tailcfg.FilterRule{} - nodes := append(peers, node) for index, acl := range pol.ACLs { if acl.Action != "accept" { @@ -279,10 +270,14 @@ func ReduceFilterRules(node *types.Node, rules []tailcfg.FilterRule) []tailcfg.F return ret } -func (pol *ACLPolicy) generateSSHRules( +func (pol *ACLPolicy) CompileSSHPolicy( node *types.Node, peers types.Nodes, -) ([]*tailcfg.SSHRule, error) { +) (*tailcfg.SSHPolicy, error) { + if pol == nil { + return nil, nil + } + rules := []*tailcfg.SSHRule{} acceptAction := tailcfg.SSHAction{ @@ -393,7 +388,9 @@ func (pol *ACLPolicy) generateSSHRules( }) } - return rules, nil + return &tailcfg.SSHPolicy{ + Rules: rules, + }, nil } func sshCheckAction(duration string) (*tailcfg.SSHAction, error) { diff --git a/hscontrol/policy/acls_test.go b/hscontrol/policy/acls_test.go index ff18dd0..db1a0dd 100644 --- a/hscontrol/policy/acls_test.go +++ b/hscontrol/policy/acls_test.go @@ -385,11 +385,12 @@ acls: return } - rules, err := pol.generateFilterRules(&types.Node{ - IPAddresses: types.NodeAddresses{ - netip.MustParseAddr("100.100.100.100"), + rules, err := pol.CompileFilterRules(types.Nodes{ + &types.Node{ + IPAddresses: types.NodeAddresses{ + netip.MustParseAddr("100.100.100.100"), + }, }, - }, types.Nodes{ &types.Node{ IPAddresses: types.NodeAddresses{ netip.MustParseAddr("200.200.200.200"), @@ -546,7 +547,7 @@ func (s *Suite) TestRuleInvalidGeneration(c *check.C) { c.Assert(pol.ACLs, check.HasLen, 6) c.Assert(err, check.IsNil) - rules, err := pol.generateFilterRules(&types.Node{}, types.Nodes{}) + rules, err := pol.CompileFilterRules(types.Nodes{}) c.Assert(err, check.NotNil) c.Assert(rules, check.IsNil) } @@ -562,7 +563,7 @@ func (s *Suite) TestInvalidAction(c *check.C) { }, }, } - _, _, err := GenerateFilterAndSSHRules(pol, &types.Node{}, types.Nodes{}) + _, _, err := GenerateFilterAndSSHRulesForTests(pol, &types.Node{}, types.Nodes{}) c.Assert(errors.Is(err, ErrInvalidAction), check.Equals, true) } @@ -581,7 +582,7 @@ func (s *Suite) TestInvalidGroupInGroup(c *check.C) { }, }, } - _, _, err := GenerateFilterAndSSHRules(pol, &types.Node{}, types.Nodes{}) + _, _, err := GenerateFilterAndSSHRulesForTests(pol, &types.Node{}, types.Nodes{}) c.Assert(errors.Is(err, ErrInvalidGroup), check.Equals, true) } @@ -597,7 +598,7 @@ func (s *Suite) TestInvalidTagOwners(c *check.C) { }, } - _, _, err := GenerateFilterAndSSHRules(pol, &types.Node{}, types.Nodes{}) + _, _, err := GenerateFilterAndSSHRulesForTests(pol, &types.Node{}, types.Nodes{}) c.Assert(errors.Is(err, ErrInvalidTag), check.Equals, true) } @@ -1724,8 +1725,7 @@ func TestACLPolicy_generateFilterRules(t *testing.T) { pol ACLPolicy } type args struct { - node *types.Node - peers types.Nodes + nodes types.Nodes } tests := []struct { name string @@ -1755,13 +1755,14 @@ func TestACLPolicy_generateFilterRules(t *testing.T) { }, }, args: args{ - node: &types.Node{ - IPAddresses: types.NodeAddresses{ - netip.MustParseAddr("100.64.0.1"), - netip.MustParseAddr("fd7a:115c:a1e0:ab12:4843:2222:6273:2221"), + nodes: types.Nodes{ + &types.Node{ + IPAddresses: types.NodeAddresses{ + netip.MustParseAddr("100.64.0.1"), + netip.MustParseAddr("fd7a:115c:a1e0:ab12:4843:2222:6273:2221"), + }, }, }, - peers: types.Nodes{}, }, want: []tailcfg.FilterRule{ { @@ -1800,14 +1801,14 @@ func TestACLPolicy_generateFilterRules(t *testing.T) { }, }, args: args{ - node: &types.Node{ - IPAddresses: types.NodeAddresses{ - netip.MustParseAddr("100.64.0.1"), - netip.MustParseAddr("fd7a:115c:a1e0:ab12:4843:2222:6273:2221"), + nodes: types.Nodes{ + &types.Node{ + IPAddresses: types.NodeAddresses{ + netip.MustParseAddr("100.64.0.1"), + netip.MustParseAddr("fd7a:115c:a1e0:ab12:4843:2222:6273:2221"), + }, + User: types.User{Name: "mickael"}, }, - User: types.User{Name: "mickael"}, - }, - peers: types.Nodes{ &types.Node{ IPAddresses: types.NodeAddresses{ netip.MustParseAddr("100.64.0.2"), @@ -1846,9 +1847,8 @@ func TestACLPolicy_generateFilterRules(t *testing.T) { } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - got, err := tt.field.pol.generateFilterRules( - tt.args.node, - tt.args.peers, + got, err := tt.field.pol.CompileFilterRules( + tt.args.nodes, ) if (err != nil) != tt.wantErr { t.Errorf("ACLgenerateFilterRules() error = %v, wantErr %v", err, tt.wantErr) @@ -1980,9 +1980,8 @@ func TestReduceFilterRules(t *testing.T) { for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - rules, _ := tt.pol.generateFilterRules( - tt.node, - tt.peers, + rules, _ := tt.pol.CompileFilterRules( + append(tt.peers, tt.node), ) got := ReduceFilterRules(tt.node, rules) @@ -2883,7 +2882,7 @@ func TestSSHRules(t *testing.T) { node types.Node peers types.Nodes pol ACLPolicy - want []*tailcfg.SSHRule + want *tailcfg.SSHPolicy }{ { name: "peers-can-connect", @@ -2946,7 +2945,7 @@ func TestSSHRules(t *testing.T) { }, }, }, - want: []*tailcfg.SSHRule{ + want: &tailcfg.SSHPolicy{Rules: []*tailcfg.SSHRule{ { Principals: []*tailcfg.SSHPrincipal{ { @@ -2991,7 +2990,7 @@ func TestSSHRules(t *testing.T) { }, Action: &tailcfg.SSHAction{Accept: true, AllowLocalPortForwarding: true}, }, - }, + }}, }, { name: "peers-cannot-connect", @@ -3042,13 +3041,13 @@ func TestSSHRules(t *testing.T) { }, }, }, - want: []*tailcfg.SSHRule{}, + want: &tailcfg.SSHPolicy{Rules: []*tailcfg.SSHRule{}}, }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - got, err := tt.pol.generateSSHRules(&tt.node, tt.peers) + got, err := tt.pol.CompileSSHPolicy(&tt.node, tt.peers) assert.NoError(t, err) if diff := cmp.Diff(tt.want, got); diff != "" { @@ -3155,7 +3154,7 @@ func TestValidExpandTagOwnersInSources(t *testing.T) { }, } - got, _, err := GenerateFilterAndSSHRules(pol, node, types.Nodes{}) + got, _, err := GenerateFilterAndSSHRulesForTests(pol, node, types.Nodes{}) assert.NoError(t, err) want := []tailcfg.FilterRule{ @@ -3206,7 +3205,7 @@ func TestInvalidTagValidUser(t *testing.T) { }, } - got, _, err := GenerateFilterAndSSHRules(pol, node, types.Nodes{}) + got, _, err := GenerateFilterAndSSHRulesForTests(pol, node, types.Nodes{}) assert.NoError(t, err) want := []tailcfg.FilterRule{ @@ -3265,7 +3264,7 @@ func TestValidExpandTagOwnersInDestinations(t *testing.T) { // c.Assert(rules[0].DstPorts, check.HasLen, 1) // c.Assert(rules[0].DstPorts[0].IP, check.Equals, "100.64.0.1/32") - got, _, err := GenerateFilterAndSSHRules(pol, node, types.Nodes{}) + got, _, err := GenerateFilterAndSSHRulesForTests(pol, node, types.Nodes{}) assert.NoError(t, err) want := []tailcfg.FilterRule{ @@ -3335,7 +3334,7 @@ func TestValidTagInvalidUser(t *testing.T) { }, } - got, _, err := GenerateFilterAndSSHRules(pol, node, types.Nodes{nodes2}) + got, _, err := GenerateFilterAndSSHRulesForTests(pol, node, types.Nodes{nodes2}) assert.NoError(t, err) want := []tailcfg.FilterRule{ diff --git a/hscontrol/poll.go b/hscontrol/poll.go index 22dd78f..2b65f6d 100644 --- a/hscontrol/poll.go +++ b/hscontrol/poll.go @@ -1,80 +1,166 @@ package hscontrol import ( + "cmp" "context" "fmt" + "math/rand/v2" "net/http" + "net/netip" + "sort" "strings" + "sync" "time" "github.com/juanfont/headscale/hscontrol/db" "github.com/juanfont/headscale/hscontrol/mapper" "github.com/juanfont/headscale/hscontrol/types" "github.com/rs/zerolog/log" + "github.com/sasha-s/go-deadlock" xslices "golang.org/x/exp/slices" "gorm.io/gorm" - "tailscale.com/envknob" "tailscale.com/tailcfg" ) const ( - keepAliveInterval = 60 * time.Second + keepAliveInterval = 50 * time.Second ) type contextKey string const nodeNameContextKey = contextKey("nodeName") -type UpdateNode func() +type sessionManager struct { + mu sync.RWMutex + sess map[types.NodeID]*mapSession +} -func logPollFunc( - mapRequest tailcfg.MapRequest, +type mapSession struct { + h *Headscale + req tailcfg.MapRequest + ctx context.Context + capVer tailcfg.CapabilityVersion + mapper *mapper.Mapper + + serving bool + servingMu deadlock.Mutex + + ch chan types.StateUpdate + cancelCh chan struct{} + + node *types.Node + w http.ResponseWriter + + warnf func(string, ...any) + infof func(string, ...any) + tracef func(string, ...any) + errf func(error, string, ...any) +} + +func (h *Headscale) newMapSession( + ctx context.Context, + req tailcfg.MapRequest, + w http.ResponseWriter, node *types.Node, -) (func(string), func(string), func(error, string)) { - return func(msg string) { - log.Trace(). - Caller(). - Bool("readOnly", mapRequest.ReadOnly). - Bool("omitPeers", mapRequest.OmitPeers). - Bool("stream", mapRequest.Stream). - Str("node_key", node.NodeKey.ShortString()). - Str("node", node.Hostname). - Msg(msg) - }, - func(msg string) { - log.Warn(). - Caller(). - Bool("readOnly", mapRequest.ReadOnly). - Bool("omitPeers", mapRequest.OmitPeers). - Bool("stream", mapRequest.Stream). - Str("node_key", node.NodeKey.ShortString()). - Str("node", node.Hostname). - Msg(msg) - }, - func(err error, msg string) { - log.Error(). - Caller(). - Bool("readOnly", mapRequest.ReadOnly). - Bool("omitPeers", mapRequest.OmitPeers). - Bool("stream", mapRequest.Stream). - Str("node_key", node.NodeKey.ShortString()). - Str("node", node.Hostname). - Err(err). - Msg(msg) - } +) *mapSession { + warnf, tracef, infof, errf := logPollFunc(req, node) + + // Use a buffered channel in case a node is not fully ready + // to receive a message to make sure we dont block the entire + // notifier. + updateChan := make(chan types.StateUpdate, h.cfg.Tuning.NodeMapSessionBufferedChanSize) + + return &mapSession{ + h: h, + ctx: ctx, + req: req, + w: w, + node: node, + capVer: req.Version, + mapper: h.mapper, + + // serving indicates if a client is being served. + serving: false, + + ch: updateChan, + cancelCh: make(chan struct{}), + + // Loggers + warnf: warnf, + infof: infof, + tracef: tracef, + errf: errf, + } +} + +func (m *mapSession) close() { + m.servingMu.Lock() + defer m.servingMu.Unlock() + if !m.serving { + return + } + + select { + case m.cancelCh <- struct{}{}: + default: + } +} + +func (m *mapSession) isStreaming() bool { + return m.req.Stream && !m.req.ReadOnly +} + +func (m *mapSession) isEndpointUpdate() bool { + return !m.req.Stream && !m.req.ReadOnly && m.req.OmitPeers +} + +func (m *mapSession) isReadOnlyUpdate() bool { + return !m.req.Stream && m.req.OmitPeers && m.req.ReadOnly +} + +func (m *mapSession) flush200() { + m.w.WriteHeader(http.StatusOK) + if f, ok := m.w.(http.Flusher); ok { + f.Flush() + } } // handlePoll ensures the node gets the appropriate updates from either // polling or immediate responses. // //nolint:gocyclo -func (h *Headscale) handlePoll( - writer http.ResponseWriter, - ctx context.Context, - node *types.Node, - mapRequest tailcfg.MapRequest, -) { - logTrace, logWarn, logErr := logPollFunc(mapRequest, node) +func (m *mapSession) serve() { + // Register with the notifier if this is a streaming + // session + if m.isStreaming() { + // defers are called in reverse order, + // so top one is executed last. + + // Failover the node's routes if any. + defer m.infof("node has disconnected, mapSession: %p", m) + defer m.pollFailoverRoutes("node closing connection", m.node) + + defer m.h.updateNodeOnlineStatus(false, m.node) + defer m.h.nodeNotifier.RemoveNode(m.node.ID) + + defer func() { + m.servingMu.Lock() + defer m.servingMu.Unlock() + + m.serving = false + close(m.cancelCh) + }() + + m.serving = true + + m.h.nodeNotifier.AddNode(m.node.ID, m.ch) + m.h.updateNodeOnlineStatus(true, m.node) + + m.infof("node has connected, mapSession: %p", m) + } + + // TODO(kradalby): A set todos to harden: + // - func to tell the stream to die, readonly -> false, !stream && omitpeers -> false, true // This is the mechanism where the node gives us information about its // current configuration. @@ -84,473 +170,275 @@ func (h *Headscale) handlePoll( // breaking existing long-polling (Stream == true) connections. // In this case, the server can omit the entire response; the client // only checks the HTTP response status code. + // + // This is what Tailscale calls a Lite update, the client ignores + // the response and just wants a 200. + // !req.stream && !req.ReadOnly && req.OmitPeers + // // TODO(kradalby): remove ReadOnly when we only support capVer 68+ - if mapRequest.OmitPeers && !mapRequest.Stream && !mapRequest.ReadOnly { - log.Info(). - Caller(). - Bool("readOnly", mapRequest.ReadOnly). - Bool("omitPeers", mapRequest.OmitPeers). - Bool("stream", mapRequest.Stream). - Str("node_key", node.NodeKey.ShortString()). - Str("node", node.Hostname). - Int("cap_ver", int(mapRequest.Version)). - Msg("Received update") + if m.isEndpointUpdate() { + m.handleEndpointUpdate() - change := node.PeerChangeFromMapRequest(mapRequest) + return + } - online := h.nodeNotifier.IsConnected(node.MachineKey) - change.Online = &online + // ReadOnly is whether the client just wants to fetch the + // MapResponse, without updating their Endpoints. The + // Endpoints field will be ignored and LastSeen will not be + // updated and peers will not be notified of changes. + // + // The intended use is for clients to discover the DERP map at + // start-up before their first real endpoint update. + if m.isReadOnlyUpdate() { + m.handleReadOnlyRequest() - node.ApplyPeerChange(&change) - - hostInfoChange := node.Hostinfo.Equal(mapRequest.Hostinfo) - - logTracePeerChange(node.Hostname, hostInfoChange, &change) - - // Check if the Hostinfo of the node has changed. - // If it has changed, check if there has been a change tod - // the routable IPs of the host and update update them in - // the database. Then send a Changed update - // (containing the whole node object) to peers to inform about - // the route change. - // If the hostinfo has changed, but not the routes, just update - // hostinfo and let the function continue. - if !hostInfoChange { - oldRoutes := node.Hostinfo.RoutableIPs - newRoutes := mapRequest.Hostinfo.RoutableIPs - - oldServicesCount := len(node.Hostinfo.Services) - newServicesCount := len(mapRequest.Hostinfo.Services) - - node.Hostinfo = mapRequest.Hostinfo - - sendUpdate := false - - // Route changes come as part of Hostinfo, which means that - // when an update comes, the Node Route logic need to run. - // This will require a "change" in comparison to a "patch", - // which is more costly. - if !xslices.Equal(oldRoutes, newRoutes) { - var err error - sendUpdate, err = h.db.SaveNodeRoutes(node) - if err != nil { - logErr(err, "Error processing node routes") - http.Error(writer, "", http.StatusInternalServerError) - - return - } - - if h.ACLPolicy != nil { - // update routes with peer information - update, err := h.db.EnableAutoApprovedRoutes(h.ACLPolicy, node) - if err != nil { - logErr(err, "Error running auto approved routes") - } - - if update != nil { - sendUpdate = true - } - } - } - - // Services is mostly useful for discovery and not critical, - // except for peerapi, which is how nodes talk to eachother. - // If peerapi was not part of the initial mapresponse, we - // need to make sure its sent out later as it is needed for - // Taildrop. - // TODO(kradalby): Length comparison is a bit naive, replace. - if oldServicesCount != newServicesCount { - sendUpdate = true - } - - if sendUpdate { - if err := h.db.DB.Save(node).Error; err != nil { - logErr(err, "Failed to persist/update node in the database") - http.Error(writer, "", http.StatusInternalServerError) - - return - } - - // Send an update to all peers to propagate the new routes - // available. - stateUpdate := types.StateUpdate{ - Type: types.StatePeerChanged, - ChangeNodes: types.Nodes{node}, - Message: "called from handlePoll -> update -> new hostinfo", - } - if stateUpdate.Valid() { - ctx := types.NotifyCtx(context.Background(), "poll-nodeupdate-peers-hostinfochange", node.Hostname) - h.nodeNotifier.NotifyWithIgnore( - ctx, - stateUpdate, - node.MachineKey.String()) - } - - // Send an update to the node itself with to ensure it - // has an updated packetfilter allowing the new route - // if it is defined in the ACL. - selfUpdate := types.StateUpdate{ - Type: types.StateSelfUpdate, - ChangeNodes: types.Nodes{node}, - } - if selfUpdate.Valid() { - ctx := types.NotifyCtx(context.Background(), "poll-nodeupdate-self-hostinfochange", node.Hostname) - h.nodeNotifier.NotifyByMachineKey( - ctx, - selfUpdate, - node.MachineKey) - } - - return - } - } - - if err := h.db.DB.Save(node).Error; err != nil { - logErr(err, "Failed to persist/update node in the database") - http.Error(writer, "", http.StatusInternalServerError) + return + } + // From version 68, all streaming requests can be treated as read only. + if m.capVer < 68 { + // Error has been handled/written to client in the func + // return + err := m.handleSaveNode() + if err != nil { return } - - stateUpdate := types.StateUpdate{ - Type: types.StatePeerChangedPatch, - ChangePatches: []*tailcfg.PeerChange{&change}, - } - if stateUpdate.Valid() { - ctx := types.NotifyCtx(context.Background(), "poll-nodeupdate-peers-patch", node.Hostname) - h.nodeNotifier.NotifyWithIgnore( - ctx, - stateUpdate, - node.MachineKey.String()) - } - - writer.WriteHeader(http.StatusOK) - if f, ok := writer.(http.Flusher); ok { - f.Flush() - } - - return - } else if mapRequest.OmitPeers && !mapRequest.Stream && mapRequest.ReadOnly { - // ReadOnly is whether the client just wants to fetch the - // MapResponse, without updating their Endpoints. The - // Endpoints field will be ignored and LastSeen will not be - // updated and peers will not be notified of changes. - // - // The intended use is for clients to discover the DERP map at - // start-up before their first real endpoint update. - } else if mapRequest.OmitPeers && !mapRequest.Stream && mapRequest.ReadOnly { - h.handleLiteRequest(writer, node, mapRequest) - - return - } else if mapRequest.OmitPeers && mapRequest.Stream { - logErr(nil, "Ignoring request, don't know how to handle it") - - return - } - - change := node.PeerChangeFromMapRequest(mapRequest) - - // A stream is being set up, the node is Online - online := true - change.Online = &online - - node.ApplyPeerChange(&change) - - // Only save HostInfo if changed, update routes if changed - // TODO(kradalby): Remove when capver is over 68 - if !node.Hostinfo.Equal(mapRequest.Hostinfo) { - oldRoutes := node.Hostinfo.RoutableIPs - newRoutes := mapRequest.Hostinfo.RoutableIPs - - node.Hostinfo = mapRequest.Hostinfo - - if !xslices.Equal(oldRoutes, newRoutes) { - _, err := h.db.SaveNodeRoutes(node) - if err != nil { - logErr(err, "Error processing node routes") - http.Error(writer, "", http.StatusInternalServerError) - - return - } - } - } - - if err := h.db.DB.Save(node).Error; err != nil { - logErr(err, "Failed to persist/update node in the database") - http.Error(writer, "", http.StatusInternalServerError) - - return } // Set up the client stream - h.pollNetMapStreamWG.Add(1) - defer h.pollNetMapStreamWG.Done() + m.h.pollNetMapStreamWG.Add(1) + defer m.h.pollNetMapStreamWG.Done() - // Use a buffered channel in case a node is not fully ready - // to receive a message to make sure we dont block the entire - // notifier. - // 12 is arbitrarily chosen. - chanSize := 3 - if size, ok := envknob.LookupInt("HEADSCALE_TUNING_POLL_QUEUE_SIZE"); ok { - chanSize = size - } - updateChan := make(chan types.StateUpdate, chanSize) - defer closeChanWithLog(updateChan, node.Hostname, "updateChan") + m.pollFailoverRoutes("node connected", m.node) - // Register the node's update channel - h.nodeNotifier.AddNode(node.MachineKey, updateChan) - defer h.nodeNotifier.RemoveNode(node.MachineKey) + keepAliveTicker := time.NewTicker(keepAliveInterval + (time.Duration(rand.IntN(9000)) * time.Millisecond)) - // When a node connects to control, list the peers it has at - // that given point, further updates are kept in memory in - // the Mapper, which lives for the duration of the polling - // session. - peers, err := h.db.ListPeers(node) - if err != nil { - logErr(err, "Failed to list peers when opening poller") - http.Error(writer, "", http.StatusInternalServerError) - - return - } - - isConnected := h.nodeNotifier.ConnectedMap() - for _, peer := range peers { - online := isConnected[peer.MachineKey] - peer.IsOnline = &online - } - - mapp := mapper.NewMapper( - node, - peers, - h.DERPMap, - h.cfg.BaseDomain, - h.cfg.DNSConfig, - h.cfg.LogTail.Enabled, - h.cfg.RandomizeClientPort, - ) - - // update ACLRules with peer informations (to update server tags if necessary) - if h.ACLPolicy != nil { - // update routes with peer information - // This state update is ignored as it will be sent - // as part of the whole node - // TODO(kradalby): figure out if that is actually correct - _, err = h.db.EnableAutoApprovedRoutes(h.ACLPolicy, node) - if err != nil { - logErr(err, "Error running auto approved routes") - } - } - - logTrace("Sending initial map") - - mapResp, err := mapp.FullMapResponse(mapRequest, node, h.ACLPolicy) - if err != nil { - logErr(err, "Failed to create MapResponse") - http.Error(writer, "", http.StatusInternalServerError) - - return - } - - // Send the client an update to make sure we send an initial mapresponse - _, err = writer.Write(mapResp) - if err != nil { - logErr(err, "Could not write the map response") - - return - } - - if flusher, ok := writer.(http.Flusher); ok { - flusher.Flush() - } else { - return - } - - stateUpdate := types.StateUpdate{ - Type: types.StatePeerChanged, - ChangeNodes: types.Nodes{node}, - Message: "called from handlePoll -> new node added", - } - if stateUpdate.Valid() { - ctx := types.NotifyCtx(context.Background(), "poll-newnode-peers", node.Hostname) - h.nodeNotifier.NotifyWithIgnore( - ctx, - stateUpdate, - node.MachineKey.String()) - } - - if len(node.Routes) > 0 { - go h.pollFailoverRoutes(logErr, "new node", node) - } - - keepAliveTicker := time.NewTicker(keepAliveInterval) - - ctx, cancel := context.WithCancel(context.WithValue(ctx, nodeNameContextKey, node.Hostname)) + ctx, cancel := context.WithCancel(context.WithValue(m.ctx, nodeNameContextKey, m.node.Hostname)) defer cancel() + // TODO(kradalby): Make this available through a tuning envvar + wait := time.Second + + // Add a circuit breaker, if the loop is not interrupted + // inbetween listening for the channels, some updates + // might get stale and stucked in the "changed" map + // defined below. + blockBreaker := time.NewTicker(wait) + + // true means changed, false means removed + var changed map[types.NodeID]bool + var patches []*tailcfg.PeerChange + var derp bool + + // Set full to true to immediatly send a full mapresponse + full := true + prev := time.Now() + lastMessage := "" + + // Loop through updates and continuously send them to the + // client. for { - logTrace("Waiting for update on stream channel") - select { - case <-keepAliveTicker.C: - data, err := mapp.KeepAliveResponse(mapRequest, node) - if err != nil { - logErr(err, "Error generating the keep alive msg") - - return - } - _, err = writer.Write(data) - if err != nil { - logErr(err, "Cannot write keep alive message") - - return - } - if flusher, ok := writer.(http.Flusher); ok { - flusher.Flush() - } else { - log.Error().Msg("Failed to create http flusher") - - return - } - - // This goroutine is not ideal, but we have a potential issue here - // where it blocks too long and that holds up updates. - // One alternative is to split these different channels into - // goroutines, but then you might have a problem without a lock - // if a keepalive is written at the same time as an update. - go h.updateNodeOnlineStatus(true, node) - - case update := <-updateChan: - logTrace("Received update") - now := time.Now() - + // If a full update has been requested or there are patches, then send it immediately + // otherwise wait for the "batching" of changes or patches + if full || patches != nil || (changed != nil && time.Since(prev) > wait) { var data []byte var err error // Ensure the node object is updated, for example, there // might have been a hostinfo update in a sidechannel // which contains data needed to generate a map response. - node, err = h.db.GetNodeByMachineKey(node.MachineKey) + m.node, err = m.h.db.GetNodeByID(m.node.ID) if err != nil { - logErr(err, "Could not get machine from db") + m.errf(err, "Could not get machine from db") return } - startMapResp := time.Now() - switch update.Type { - case types.StateFullUpdate: - logTrace("Sending Full MapResponse") + // If there are patches _and_ fully changed nodes, filter the + // patches and remove all patches that are present for the full + // changes updates. This allows us to send them as part of the + // PeerChange update, but only for nodes that are not fully changed. + // The fully changed nodes will be updated from the database and + // have all the updates needed. + // This means that the patches left are for nodes that has no + // updates that requires a full update. + // Patches are not suppose to be mixed in, but can be. + // + // From tailcfg docs: + // These are applied after Peers* above, but in practice the + // control server should only send these on their own, without + // + // Currently, there is no effort to merge patch updates, they + // are all sent, and the client will apply them in order. + // TODO(kradalby): Merge Patches for the same IDs to send less + // data and give the client less work. + if patches != nil && changed != nil { + var filteredPatches []*tailcfg.PeerChange - data, err = mapp.FullMapResponse(mapRequest, node, h.ACLPolicy) - case types.StatePeerChanged: - logTrace(fmt.Sprintf("Sending Changed MapResponse: %s", update.Message)) - - isConnectedMap := h.nodeNotifier.ConnectedMap() - for _, node := range update.ChangeNodes { - // If a node is not reported to be online, it might be - // because the value is outdated, check with the notifier. - // However, if it is set to Online, and not in the notifier, - // this might be because it has announced itself, but not - // reached the stage to actually create the notifier channel. - if node.IsOnline != nil && !*node.IsOnline { - isOnline := isConnectedMap[node.MachineKey] - node.IsOnline = &isOnline + for _, patch := range patches { + if _, ok := changed[types.NodeID(patch.NodeID)]; !ok { + filteredPatches = append(filteredPatches, patch) } } - data, err = mapp.PeerChangedResponse(mapRequest, node, update.ChangeNodes, h.ACLPolicy, update.Message) - case types.StatePeerChangedPatch: - logTrace("Sending PeerChangedPatch MapResponse") - data, err = mapp.PeerChangedPatchResponse(mapRequest, node, update.ChangePatches, h.ACLPolicy) - case types.StatePeerRemoved: - logTrace("Sending PeerRemoved MapResponse") - data, err = mapp.PeerRemovedResponse(mapRequest, node, update.Removed) - case types.StateSelfUpdate: - if len(update.ChangeNodes) == 1 { - logTrace("Sending SelfUpdate MapResponse") - node = update.ChangeNodes[0] - data, err = mapp.LiteMapResponse(mapRequest, node, h.ACLPolicy, types.SelfUpdateIdentifier) - } else { - logWarn("SelfUpdate contained too many nodes, this is likely a bug in the code, please report.") - } - case types.StateDERPUpdated: - logTrace("Sending DERPUpdate MapResponse") - data, err = mapp.DERPMapResponse(mapRequest, node, update.DERPMap) + patches = filteredPatches + } + + // When deciding what update to send, the following is considered, + // Full is a superset of all updates, when a full update is requested, + // send only that and move on, all other updates will be present in + // a full map response. + // + // If a map of changed nodes exists, prefer sending that as it will + // contain all the updates for the node, including patches, as it + // is fetched freshly from the database when building the response. + // + // If there is full changes registered, but we have patches for individual + // nodes, send them. + // + // Finally, if a DERP map is the only request, send that alone. + if full { + m.tracef("Sending Full MapResponse") + data, err = m.mapper.FullMapResponse(m.req, m.node, m.h.ACLPolicy, fmt.Sprintf("from mapSession: %p, stream: %t", m, m.isStreaming())) + } else if changed != nil { + m.tracef(fmt.Sprintf("Sending Changed MapResponse: %v", lastMessage)) + data, err = m.mapper.PeerChangedResponse(m.req, m.node, changed, patches, m.h.ACLPolicy, lastMessage) + } else if patches != nil { + m.tracef(fmt.Sprintf("Sending Changed Patch MapResponse: %v", lastMessage)) + data, err = m.mapper.PeerChangedPatchResponse(m.req, m.node, patches, m.h.ACLPolicy) + } else if derp { + m.tracef("Sending DERPUpdate MapResponse") + data, err = m.mapper.DERPMapResponse(m.req, m.node, m.h.DERPMap) } if err != nil { - logErr(err, "Could not get the create map update") + m.errf(err, "Could not get the create map update") return } - log.Trace().Str("node", node.Hostname).TimeDiff("timeSpent", time.Now(), startMapResp).Str("mkey", node.MachineKey.String()).Int("type", int(update.Type)).Msg("finished making map response") + // log.Trace().Str("node", m.node.Hostname).TimeDiff("timeSpent", time.Now(), startMapResp).Str("mkey", m.node.MachineKey.String()).Int("type", int(update.Type)).Msg("finished making map response") // Only send update if there is change if data != nil { startWrite := time.Now() - _, err = writer.Write(data) + _, err = m.w.Write(data) if err != nil { - logErr(err, "Could not write the map response") - - updateRequestsSentToNode.WithLabelValues(node.User.Name, node.Hostname, "failed"). - Inc() + m.errf(err, "Could not write the map response, for mapSession: %p, stream: %t", m, m.isStreaming()) return } - if flusher, ok := writer.(http.Flusher); ok { + if flusher, ok := m.w.(http.Flusher); ok { flusher.Flush() } else { log.Error().Msg("Failed to create http flusher") return } - log.Trace().Str("node", node.Hostname).TimeDiff("timeSpent", time.Now(), startWrite).Str("mkey", node.MachineKey.String()).Int("type", int(update.Type)).Msg("finished writing mapresp to node") + log.Trace().Str("node", m.node.Hostname).TimeDiff("timeSpent", time.Now(), startWrite).Str("mkey", m.node.MachineKey.String()).Msg("finished writing mapresp to node") - log.Debug(). - Caller(). - Bool("readOnly", mapRequest.ReadOnly). - Bool("omitPeers", mapRequest.OmitPeers). - Bool("stream", mapRequest.Stream). - Str("node_key", node.NodeKey.ShortString()). - Str("machine_key", node.MachineKey.ShortString()). - Str("node", node.Hostname). - TimeDiff("timeSpent", time.Now(), now). - Msg("update sent") + m.infof("update sent") } + // reset + changed = nil + patches = nil + lastMessage = "" + full = false + derp = false + prev = time.Now() + } + + // consume channels with update, keep alives or "batch" blocking signals + select { + case <-m.cancelCh: + m.tracef("poll cancelled received") + return case <-ctx.Done(): - logTrace("The client has closed the connection") - - go h.updateNodeOnlineStatus(false, node) - - // Failover the node's routes if any. - go h.pollFailoverRoutes(logErr, "node closing connection", node) - - // The connection has been closed, so we can stop polling. + m.tracef("poll context done") return - case <-h.shutdownChan: - logTrace("The long-poll handler is shutting down") + // Avoid infinite block that would potentially leave + // some updates in the changed map. + case <-blockBreaker.C: + continue - return + // Consume all updates sent to node + case update := <-m.ch: + m.tracef("received stream update: %d %s", update.Type, update.Message) + + switch update.Type { + case types.StateFullUpdate: + full = true + case types.StatePeerChanged: + if changed == nil { + changed = make(map[types.NodeID]bool) + } + + for _, nodeID := range update.ChangeNodes { + changed[nodeID] = true + } + + lastMessage = update.Message + case types.StatePeerChangedPatch: + patches = append(patches, update.ChangePatches...) + case types.StatePeerRemoved: + if changed == nil { + changed = make(map[types.NodeID]bool) + } + + for _, nodeID := range update.Removed { + changed[nodeID] = false + } + case types.StateSelfUpdate: + // create the map so an empty (self) update is sent + if changed == nil { + changed = make(map[types.NodeID]bool) + } + + lastMessage = update.Message + case types.StateDERPUpdated: + derp = true + } + + case <-keepAliveTicker.C: + data, err := m.mapper.KeepAliveResponse(m.req, m.node) + if err != nil { + m.errf(err, "Error generating the keep alive msg") + + return + } + _, err = m.w.Write(data) + if err != nil { + m.errf(err, "Cannot write keep alive message") + + return + } + if flusher, ok := m.w.(http.Flusher); ok { + flusher.Flush() + } else { + log.Error().Msg("Failed to create http flusher") + + return + } } } } -func (h *Headscale) pollFailoverRoutes(logErr func(error, string), where string, node *types.Node) { - update, err := db.Write(h.db.DB, func(tx *gorm.DB) (*types.StateUpdate, error) { - return db.EnsureFailoverRouteIsAvailable(tx, h.nodeNotifier.ConnectedMap(), node) +func (m *mapSession) pollFailoverRoutes(where string, node *types.Node) { + update, err := db.Write(m.h.db.DB, func(tx *gorm.DB) (*types.StateUpdate, error) { + return db.FailoverRouteIfAvailable(tx, m.h.nodeNotifier.ConnectedMap(), node) }) if err != nil { - logErr(err, fmt.Sprintf("failed to ensure failover routes, %s", where)) + m.errf(err, fmt.Sprintf("failed to ensure failover routes, %s", where)) return } - if update != nil && !update.Empty() && update.Valid() { + if update != nil && !update.Empty() { ctx := types.NotifyCtx(context.Background(), fmt.Sprintf("poll-%s-routes-ensurefailover", strings.ReplaceAll(where, " ", "-")), node.Hostname) - h.nodeNotifier.NotifyWithIgnore(ctx, *update, node.MachineKey.String()) + m.h.nodeNotifier.NotifyWithIgnore(ctx, *update, node.ID) } } @@ -558,33 +446,35 @@ func (h *Headscale) pollFailoverRoutes(logErr func(error, string), where string, // about change in their online/offline status. // It takes a StateUpdateType of either StatePeerOnlineChanged or StatePeerOfflineChanged. func (h *Headscale) updateNodeOnlineStatus(online bool, node *types.Node) { - now := time.Now() + change := &tailcfg.PeerChange{ + NodeID: tailcfg.NodeID(node.ID), + Online: &online, + } - node.LastSeen = &now + if !online { + now := time.Now() - statusUpdate := types.StateUpdate{ + // lastSeen is only relevant if the node is disconnected. + node.LastSeen = &now + change.LastSeen = &now + + err := h.db.DB.Transaction(func(tx *gorm.DB) error { + return db.SetLastSeen(tx, node.ID, *node.LastSeen) + }) + if err != nil { + log.Error().Err(err).Msg("Cannot update node LastSeen") + + return + } + } + + ctx := types.NotifyCtx(context.Background(), "poll-nodeupdate-onlinestatus", node.Hostname) + h.nodeNotifier.NotifyWithIgnore(ctx, types.StateUpdate{ Type: types.StatePeerChangedPatch, ChangePatches: []*tailcfg.PeerChange{ - { - NodeID: tailcfg.NodeID(node.ID), - Online: &online, - LastSeen: &now, - }, + change, }, - } - if statusUpdate.Valid() { - ctx := types.NotifyCtx(context.Background(), "poll-nodeupdate-onlinestatus", node.Hostname) - h.nodeNotifier.NotifyWithIgnore(ctx, statusUpdate, node.MachineKey.String()) - } - - err := h.db.DB.Transaction(func(tx *gorm.DB) error { - return db.UpdateLastSeen(tx, node.ID, *node.LastSeen) - }) - if err != nil { - log.Error().Err(err).Msg("Cannot update node LastSeen") - - return - } + }, node.ID) } func closeChanWithLog[C chan []byte | chan struct{} | chan types.StateUpdate](channel C, node, name string) { @@ -597,43 +487,178 @@ func closeChanWithLog[C chan []byte | chan struct{} | chan types.StateUpdate](ch close(channel) } -func (h *Headscale) handleLiteRequest( - writer http.ResponseWriter, - node *types.Node, - mapRequest tailcfg.MapRequest, -) { - logTrace, _, logErr := logPollFunc(mapRequest, node) +func (m *mapSession) handleEndpointUpdate() { + m.tracef("received endpoint update") - mapp := mapper.NewMapper( - node, - types.Nodes{}, - h.DERPMap, - h.cfg.BaseDomain, - h.cfg.DNSConfig, - h.cfg.LogTail.Enabled, - h.cfg.RandomizeClientPort, - ) + change := m.node.PeerChangeFromMapRequest(m.req) - logTrace("Client asked for a lite update, responding without peers") + online := m.h.nodeNotifier.IsLikelyConnected(m.node.ID) + change.Online = &online - mapResp, err := mapp.LiteMapResponse(mapRequest, node, h.ACLPolicy) - if err != nil { - logErr(err, "Failed to create MapResponse") - http.Error(writer, "", http.StatusInternalServerError) + m.node.ApplyPeerChange(&change) + + sendUpdate, routesChanged := hostInfoChanged(m.node.Hostinfo, m.req.Hostinfo) + m.node.Hostinfo = m.req.Hostinfo + + logTracePeerChange(m.node.Hostname, sendUpdate, &change) + + // If there is no changes and nothing to save, + // return early. + if peerChangeEmpty(change) && !sendUpdate { + return + } + + // Check if the Hostinfo of the node has changed. + // If it has changed, check if there has been a change to + // the routable IPs of the host and update update them in + // the database. Then send a Changed update + // (containing the whole node object) to peers to inform about + // the route change. + // If the hostinfo has changed, but not the routes, just update + // hostinfo and let the function continue. + if routesChanged { + var err error + _, err = m.h.db.SaveNodeRoutes(m.node) + if err != nil { + m.errf(err, "Error processing node routes") + http.Error(m.w, "", http.StatusInternalServerError) + + return + } + + if m.h.ACLPolicy != nil { + // update routes with peer information + err := m.h.db.EnableAutoApprovedRoutes(m.h.ACLPolicy, m.node) + if err != nil { + m.errf(err, "Error running auto approved routes") + } + } + + // Send an update to the node itself with to ensure it + // has an updated packetfilter allowing the new route + // if it is defined in the ACL. + ctx := types.NotifyCtx(context.Background(), "poll-nodeupdate-self-hostinfochange", m.node.Hostname) + m.h.nodeNotifier.NotifyByMachineKey( + ctx, + types.StateUpdate{ + Type: types.StateSelfUpdate, + ChangeNodes: []types.NodeID{m.node.ID}, + }, + m.node.ID) + + } + + if err := m.h.db.DB.Save(m.node).Error; err != nil { + m.errf(err, "Failed to persist/update node in the database") + http.Error(m.w, "", http.StatusInternalServerError) return } - writer.Header().Set("Content-Type", "application/json; charset=utf-8") - writer.WriteHeader(http.StatusOK) - _, err = writer.Write(mapResp) - if err != nil { - logErr(err, "Failed to write response") + ctx := types.NotifyCtx(context.Background(), "poll-nodeupdate-peers-patch", m.node.Hostname) + m.h.nodeNotifier.NotifyWithIgnore( + ctx, + types.StateUpdate{ + Type: types.StatePeerChanged, + ChangeNodes: []types.NodeID{m.node.ID}, + Message: "called from handlePoll -> update", + }, + m.node.ID) + + m.flush200() + + return +} + +// handleSaveNode saves node updates in the maprequest _streaming_ +// path and is mostly the same code as in handleEndpointUpdate. +// It is not attempted to be deduplicated since it will go away +// when we stop supporting older than 68 which removes updates +// when the node is streaming. +func (m *mapSession) handleSaveNode() error { + m.tracef("saving node update from stream session") + + change := m.node.PeerChangeFromMapRequest(m.req) + + // A stream is being set up, the node is Online + online := true + change.Online = &online + + m.node.ApplyPeerChange(&change) + + sendUpdate, routesChanged := hostInfoChanged(m.node.Hostinfo, m.req.Hostinfo) + m.node.Hostinfo = m.req.Hostinfo + + // If there is no changes and nothing to save, + // return early. + if peerChangeEmpty(change) || !sendUpdate { + return nil } + + // Check if the Hostinfo of the node has changed. + // If it has changed, check if there has been a change to + // the routable IPs of the host and update update them in + // the database. Then send a Changed update + // (containing the whole node object) to peers to inform about + // the route change. + // If the hostinfo has changed, but not the routes, just update + // hostinfo and let the function continue. + if routesChanged { + var err error + _, err = m.h.db.SaveNodeRoutes(m.node) + if err != nil { + return err + } + + if m.h.ACLPolicy != nil { + // update routes with peer information + err := m.h.db.EnableAutoApprovedRoutes(m.h.ACLPolicy, m.node) + if err != nil { + return err + } + } + } + + if err := m.h.db.DB.Save(m.node).Error; err != nil { + return err + } + + ctx := types.NotifyCtx(context.Background(), "pre-68-update-while-stream", m.node.Hostname) + m.h.nodeNotifier.NotifyWithIgnore( + ctx, + types.StateUpdate{ + Type: types.StatePeerChanged, + ChangeNodes: []types.NodeID{m.node.ID}, + Message: "called from handlePoll -> pre-68-update-while-stream", + }, + m.node.ID) + + return nil +} + +func (m *mapSession) handleReadOnlyRequest() { + m.tracef("Client asked for a lite update, responding without peers") + + mapResp, err := m.mapper.ReadOnlyMapResponse(m.req, m.node, m.h.ACLPolicy) + if err != nil { + m.errf(err, "Failed to create MapResponse") + http.Error(m.w, "", http.StatusInternalServerError) + + return + } + + m.w.Header().Set("Content-Type", "application/json; charset=utf-8") + m.w.WriteHeader(http.StatusOK) + _, err = m.w.Write(mapResp) + if err != nil { + m.errf(err, "Failed to write response") + } + + m.flush200() } func logTracePeerChange(hostname string, hostinfoChange bool, change *tailcfg.PeerChange) { - trace := log.Trace().Str("node_id", change.NodeID.String()).Str("hostname", hostname) + trace := log.Trace().Uint64("node.id", uint64(change.NodeID)).Str("hostname", hostname) if change.Key != nil { trace = trace.Str("node_key", change.Key.ShortString()) @@ -666,3 +691,114 @@ func logTracePeerChange(hostname string, hostinfoChange bool, change *tailcfg.Pe trace.Time("last_seen", *change.LastSeen).Msg("PeerChange received") } + +func peerChangeEmpty(chng tailcfg.PeerChange) bool { + return chng.Key == nil && + chng.DiscoKey == nil && + chng.Online == nil && + chng.Endpoints == nil && + chng.DERPRegion == 0 && + chng.LastSeen == nil && + chng.KeyExpiry == nil +} + +func logPollFunc( + mapRequest tailcfg.MapRequest, + node *types.Node, +) (func(string, ...any), func(string, ...any), func(string, ...any), func(error, string, ...any)) { + return func(msg string, a ...any) { + log.Warn(). + Caller(). + Bool("readOnly", mapRequest.ReadOnly). + Bool("omitPeers", mapRequest.OmitPeers). + Bool("stream", mapRequest.Stream). + Uint64("node.id", node.ID.Uint64()). + Str("node", node.Hostname). + Msgf(msg, a...) + }, + func(msg string, a ...any) { + log.Info(). + Caller(). + Bool("readOnly", mapRequest.ReadOnly). + Bool("omitPeers", mapRequest.OmitPeers). + Bool("stream", mapRequest.Stream). + Uint64("node.id", node.ID.Uint64()). + Str("node", node.Hostname). + Msgf(msg, a...) + }, + func(msg string, a ...any) { + log.Trace(). + Caller(). + Bool("readOnly", mapRequest.ReadOnly). + Bool("omitPeers", mapRequest.OmitPeers). + Bool("stream", mapRequest.Stream). + Uint64("node.id", node.ID.Uint64()). + Str("node", node.Hostname). + Msgf(msg, a...) + }, + func(err error, msg string, a ...any) { + log.Error(). + Caller(). + Bool("readOnly", mapRequest.ReadOnly). + Bool("omitPeers", mapRequest.OmitPeers). + Bool("stream", mapRequest.Stream). + Uint64("node.id", node.ID.Uint64()). + Str("node", node.Hostname). + Err(err). + Msgf(msg, a...) + } +} + +// hostInfoChanged reports if hostInfo has changed in two ways, +// - first bool reports if an update needs to be sent to nodes +// - second reports if there has been changes to routes +// the caller can then use this info to save and update nodes +// and routes as needed. +func hostInfoChanged(old, new *tailcfg.Hostinfo) (bool, bool) { + if old.Equal(new) { + return false, false + } + + // Routes + oldRoutes := old.RoutableIPs + newRoutes := new.RoutableIPs + + sort.Slice(oldRoutes, func(i, j int) bool { + return comparePrefix(oldRoutes[i], oldRoutes[j]) > 0 + }) + sort.Slice(newRoutes, func(i, j int) bool { + return comparePrefix(newRoutes[i], newRoutes[j]) > 0 + }) + + if !xslices.Equal(oldRoutes, newRoutes) { + return true, true + } + + // Services is mostly useful for discovery and not critical, + // except for peerapi, which is how nodes talk to eachother. + // If peerapi was not part of the initial mapresponse, we + // need to make sure its sent out later as it is needed for + // Taildrop. + // TODO(kradalby): Length comparison is a bit naive, replace. + if len(old.Services) != len(new.Services) { + return true, false + } + + return false, false +} + +// TODO(kradalby): Remove after go 1.23, will be in stdlib. +// Compare returns an integer comparing two prefixes. +// The result will be 0 if p == p2, -1 if p < p2, and +1 if p > p2. +// Prefixes sort first by validity (invalid before valid), then +// address family (IPv4 before IPv6), then prefix length, then +// address. +func comparePrefix(p, p2 netip.Prefix) int { + if c := cmp.Compare(p.Addr().BitLen(), p2.Addr().BitLen()); c != 0 { + return c + } + if c := cmp.Compare(p.Bits(), p2.Bits()); c != 0 { + return c + } + return p.Addr().Compare(p2.Addr()) +} diff --git a/hscontrol/poll_noise.go b/hscontrol/poll_noise.go deleted file mode 100644 index 53b1d47..0000000 --- a/hscontrol/poll_noise.go +++ /dev/null @@ -1,96 +0,0 @@ -package hscontrol - -import ( - "encoding/json" - "errors" - "io" - "net/http" - - "github.com/rs/zerolog/log" - "gorm.io/gorm" - "tailscale.com/tailcfg" - "tailscale.com/types/key" -) - -const ( - MinimumCapVersion tailcfg.CapabilityVersion = 58 -) - -// NoisePollNetMapHandler takes care of /machine/:id/map using the Noise protocol -// -// This is the busiest endpoint, as it keeps the HTTP long poll that updates -// the clients when something in the network changes. -// -// The clients POST stuff like HostInfo and their Endpoints here, but -// only after their first request (marked with the ReadOnly field). -// -// At this moment the updates are sent in a quite horrendous way, but they kinda work. -func (ns *noiseServer) NoisePollNetMapHandler( - writer http.ResponseWriter, - req *http.Request, -) { - log.Trace(). - Str("handler", "NoisePollNetMap"). - Msg("PollNetMapHandler called") - - log.Trace(). - Any("headers", req.Header). - Caller(). - Msg("Headers") - - body, _ := io.ReadAll(req.Body) - - mapRequest := tailcfg.MapRequest{} - if err := json.Unmarshal(body, &mapRequest); err != nil { - log.Error(). - Caller(). - Err(err). - Msg("Cannot parse MapRequest") - http.Error(writer, "Internal error", http.StatusInternalServerError) - - return - } - - // Reject unsupported versions - if mapRequest.Version < MinimumCapVersion { - log.Info(). - Caller(). - Int("min_version", int(MinimumCapVersion)). - Int("client_version", int(mapRequest.Version)). - Msg("unsupported client connected") - http.Error(writer, "Internal error", http.StatusBadRequest) - - return - } - - ns.nodeKey = mapRequest.NodeKey - - node, err := ns.headscale.db.GetNodeByAnyKey( - ns.conn.Peer(), - mapRequest.NodeKey, - key.NodePublic{}, - ) - if err != nil { - if errors.Is(err, gorm.ErrRecordNotFound) { - log.Warn(). - Str("handler", "NoisePollNetMap"). - Msgf("Ignoring request, cannot find node with key %s", mapRequest.NodeKey.String()) - http.Error(writer, "Internal error", http.StatusNotFound) - - return - } - log.Error(). - Str("handler", "NoisePollNetMap"). - Msgf("Failed to fetch node from the database with node key: %s", mapRequest.NodeKey.String()) - http.Error(writer, "Internal error", http.StatusInternalServerError) - - return - } - log.Debug(). - Str("handler", "NoisePollNetMap"). - Str("node", node.Hostname). - Int("cap_ver", int(mapRequest.Version)). - Msg("A node sending a MapRequest with Noise protocol") - - ns.headscale.handlePoll(writer, req.Context(), node, mapRequest) -} diff --git a/hscontrol/types/common.go b/hscontrol/types/common.go index ceeceea..6d63f30 100644 --- a/hscontrol/types/common.go +++ b/hscontrol/types/common.go @@ -90,6 +90,25 @@ func (i StringList) Value() (driver.Value, error) { type StateUpdateType int +func (su StateUpdateType) String() string { + switch su { + case StateFullUpdate: + return "StateFullUpdate" + case StatePeerChanged: + return "StatePeerChanged" + case StatePeerChangedPatch: + return "StatePeerChangedPatch" + case StatePeerRemoved: + return "StatePeerRemoved" + case StateSelfUpdate: + return "StateSelfUpdate" + case StateDERPUpdated: + return "StateDERPUpdated" + } + + return "unknown state update type" +} + const ( StateFullUpdate StateUpdateType = iota // StatePeerChanged is used for updates that needs @@ -118,7 +137,7 @@ type StateUpdate struct { // ChangeNodes must be set when Type is StatePeerAdded // and StatePeerChanged and contains the full node // object for added nodes. - ChangeNodes Nodes + ChangeNodes []NodeID // ChangePatches must be set when Type is StatePeerChangedPatch // and contains a populated PeerChange object. @@ -127,7 +146,7 @@ type StateUpdate struct { // Removed must be set when Type is StatePeerRemoved and // contain a list of the nodes that has been removed from // the network. - Removed []tailcfg.NodeID + Removed []NodeID // DERPMap must be set when Type is StateDERPUpdated and // contain the new DERP Map. @@ -138,39 +157,6 @@ type StateUpdate struct { Message string } -// Valid reports if a StateUpdate is correctly filled and -// panics if the mandatory fields for a type is not -// filled. -// Reports true if valid. -func (su *StateUpdate) Valid() bool { - switch su.Type { - case StatePeerChanged: - if su.ChangeNodes == nil { - panic("Mandatory field ChangeNodes is not set on StatePeerChanged update") - } - case StatePeerChangedPatch: - if su.ChangePatches == nil { - panic("Mandatory field ChangePatches is not set on StatePeerChangedPatch update") - } - case StatePeerRemoved: - if su.Removed == nil { - panic("Mandatory field Removed is not set on StatePeerRemove update") - } - case StateSelfUpdate: - if su.ChangeNodes == nil || len(su.ChangeNodes) != 1 { - panic( - "Mandatory field ChangeNodes is not set for StateSelfUpdate or has more than one node", - ) - } - case StateDERPUpdated: - if su.DERPMap == nil { - panic("Mandatory field DERPMap is not set on StateDERPUpdated update") - } - } - - return true -} - // Empty reports if there are any updates in the StateUpdate. func (su *StateUpdate) Empty() bool { switch su.Type { @@ -185,12 +171,12 @@ func (su *StateUpdate) Empty() bool { return false } -func StateUpdateExpire(nodeID uint64, expiry time.Time) StateUpdate { +func StateUpdateExpire(nodeID NodeID, expiry time.Time) StateUpdate { return StateUpdate{ Type: StatePeerChangedPatch, ChangePatches: []*tailcfg.PeerChange{ { - NodeID: tailcfg.NodeID(nodeID), + NodeID: nodeID.NodeID(), KeyExpiry: &expiry, }, }, diff --git a/hscontrol/types/config.go b/hscontrol/types/config.go index 022d127..4e4b9a6 100644 --- a/hscontrol/types/config.go +++ b/hscontrol/types/config.go @@ -69,6 +69,8 @@ type Config struct { CLI CLIConfig ACL ACLConfig + + Tuning Tuning } type SqliteConfig struct { @@ -161,6 +163,11 @@ type LogConfig struct { Level zerolog.Level } +type Tuning struct { + BatchChangeDelay time.Duration + NodeMapSessionBufferedChanSize int +} + func LoadConfig(path string, isFile bool) error { if isFile { viper.SetConfigFile(path) @@ -220,6 +227,9 @@ func LoadConfig(path string, isFile bool) error { viper.SetDefault("node_update_check_interval", "10s") + viper.SetDefault("tuning.batch_change_delay", "800ms") + viper.SetDefault("tuning.node_mapsession_buffered_chan_size", 30) + if IsCLIConfigured() { return nil } @@ -719,6 +729,12 @@ func GetHeadscaleConfig() (*Config, error) { }, Log: GetLogConfig(), + + // TODO(kradalby): Document these settings when more stable + Tuning: Tuning{ + BatchChangeDelay: viper.GetDuration("tuning.batch_change_delay"), + NodeMapSessionBufferedChanSize: viper.GetInt("tuning.node_mapsession_buffered_chan_size"), + }, }, nil } diff --git a/hscontrol/types/node.go b/hscontrol/types/node.go index 69004bf..2d6c631 100644 --- a/hscontrol/types/node.go +++ b/hscontrol/types/node.go @@ -7,11 +7,13 @@ import ( "fmt" "net/netip" "sort" + "strconv" "strings" "time" v1 "github.com/juanfont/headscale/gen/go/headscale/v1" "github.com/juanfont/headscale/hscontrol/policy/matcher" + "github.com/juanfont/headscale/hscontrol/util" "github.com/rs/zerolog/log" "go4.org/netipx" "google.golang.org/protobuf/types/known/timestamppb" @@ -27,9 +29,24 @@ var ( ErrNodeUserHasNoName = errors.New("node user has no name") ) +type NodeID uint64 +type NodeConnectedMap map[NodeID]bool + +func (id NodeID) StableID() tailcfg.StableNodeID { + return tailcfg.StableNodeID(strconv.FormatUint(uint64(id), util.Base10)) +} + +func (id NodeID) NodeID() tailcfg.NodeID { + return tailcfg.NodeID(id) +} + +func (id NodeID) Uint64() uint64 { + return uint64(id) +} + // Node is a Headscale client. type Node struct { - ID uint64 `gorm:"primary_key"` + ID NodeID `gorm:"primary_key"` // MachineKeyDatabaseField is the string representation of MachineKey // it is _only_ used for reading and writing the key to the @@ -198,7 +215,7 @@ func (node Node) IsExpired() bool { return false } - return time.Now().UTC().After(*node.Expiry) + return time.Since(*node.Expiry) > 0 } // IsEphemeral returns if the node is registered as an Ephemeral node. @@ -319,7 +336,7 @@ func (node *Node) AfterFind(tx *gorm.DB) error { func (node *Node) Proto() *v1.Node { nodeProto := &v1.Node{ - Id: node.ID, + Id: uint64(node.ID), MachineKey: node.MachineKey.String(), NodeKey: node.NodeKey.String(), @@ -486,8 +503,8 @@ func (nodes Nodes) String() string { return fmt.Sprintf("[ %s ](%d)", strings.Join(temp, ", "), len(temp)) } -func (nodes Nodes) IDMap() map[uint64]*Node { - ret := map[uint64]*Node{} +func (nodes Nodes) IDMap() map[NodeID]*Node { + ret := map[NodeID]*Node{} for _, node := range nodes { ret[node.ID] = node diff --git a/integration/auth_oidc_test.go b/integration/auth_oidc_test.go index 36e74a8..347dbcc 100644 --- a/integration/auth_oidc_test.go +++ b/integration/auth_oidc_test.go @@ -83,7 +83,7 @@ func TestOIDCAuthenticationPingAll(t *testing.T) { err = scenario.WaitForTailscaleSync() assertNoErrSync(t, err) - assertClientsState(t, allClients) + // assertClientsState(t, allClients) allAddrs := lo.Map(allIps, func(x netip.Addr, index int) string { return x.String() @@ -142,7 +142,7 @@ func TestOIDCExpireNodesBasedOnTokenExpiry(t *testing.T) { err = scenario.WaitForTailscaleSync() assertNoErrSync(t, err) - assertClientsState(t, allClients) + // assertClientsState(t, allClients) allAddrs := lo.Map(allIps, func(x netip.Addr, index int) string { return x.String() diff --git a/integration/auth_web_flow_test.go b/integration/auth_web_flow_test.go index aa589fa..6d981bc 100644 --- a/integration/auth_web_flow_test.go +++ b/integration/auth_web_flow_test.go @@ -53,7 +53,7 @@ func TestAuthWebFlowAuthenticationPingAll(t *testing.T) { err = scenario.WaitForTailscaleSync() assertNoErrSync(t, err) - assertClientsState(t, allClients) + // assertClientsState(t, allClients) allAddrs := lo.Map(allIps, func(x netip.Addr, index int) string { return x.String() @@ -92,7 +92,7 @@ func TestAuthWebFlowLogoutAndRelogin(t *testing.T) { err = scenario.WaitForTailscaleSync() assertNoErrSync(t, err) - assertClientsState(t, allClients) + // assertClientsState(t, allClients) allAddrs := lo.Map(allIps, func(x netip.Addr, index int) string { return x.String() diff --git a/integration/general_test.go b/integration/general_test.go index 9aae26f..975b4c2 100644 --- a/integration/general_test.go +++ b/integration/general_test.go @@ -65,7 +65,7 @@ func TestPingAllByIP(t *testing.T) { err = scenario.WaitForTailscaleSync() assertNoErrSync(t, err) - assertClientsState(t, allClients) + // assertClientsState(t, allClients) allAddrs := lo.Map(allIps, func(x netip.Addr, index int) string { return x.String() @@ -103,7 +103,7 @@ func TestPingAllByIPPublicDERP(t *testing.T) { err = scenario.WaitForTailscaleSync() assertNoErrSync(t, err) - assertClientsState(t, allClients) + // assertClientsState(t, allClients) allAddrs := lo.Map(allIps, func(x netip.Addr, index int) string { return x.String() @@ -135,7 +135,7 @@ func TestAuthKeyLogoutAndRelogin(t *testing.T) { err = scenario.WaitForTailscaleSync() assertNoErrSync(t, err) - assertClientsState(t, allClients) + // assertClientsState(t, allClients) clientIPs := make(map[TailscaleClient][]netip.Addr) for _, client := range allClients { @@ -176,7 +176,7 @@ func TestAuthKeyLogoutAndRelogin(t *testing.T) { err = scenario.WaitForTailscaleSync() assertNoErrSync(t, err) - assertClientsState(t, allClients) + // assertClientsState(t, allClients) allClients, err = scenario.ListTailscaleClients() assertNoErrListClients(t, err) @@ -329,7 +329,7 @@ func TestPingAllByHostname(t *testing.T) { err = scenario.WaitForTailscaleSync() assertNoErrSync(t, err) - assertClientsState(t, allClients) + // assertClientsState(t, allClients) allHostnames, err := scenario.ListTailscaleClientsFQDNs() assertNoErrListFQDN(t, err) @@ -539,7 +539,7 @@ func TestResolveMagicDNS(t *testing.T) { err = scenario.WaitForTailscaleSync() assertNoErrSync(t, err) - assertClientsState(t, allClients) + // assertClientsState(t, allClients) // Poor mans cache _, err = scenario.ListTailscaleClientsFQDNs() @@ -609,7 +609,7 @@ func TestExpireNode(t *testing.T) { err = scenario.WaitForTailscaleSync() assertNoErrSync(t, err) - assertClientsState(t, allClients) + // assertClientsState(t, allClients) allAddrs := lo.Map(allIps, func(x netip.Addr, index int) string { return x.String() @@ -711,7 +711,7 @@ func TestExpireNode(t *testing.T) { } } -func TestNodeOnlineLastSeenStatus(t *testing.T) { +func TestNodeOnlineStatus(t *testing.T) { IntegrationSkip(t) t.Parallel() @@ -723,7 +723,7 @@ func TestNodeOnlineLastSeenStatus(t *testing.T) { "user1": len(MustTestVersions), } - err = scenario.CreateHeadscaleEnv(spec, []tsic.Option{}, hsic.WithTestName("onlinelastseen")) + err = scenario.CreateHeadscaleEnv(spec, []tsic.Option{}, hsic.WithTestName("online")) assertNoErrHeadscaleEnv(t, err) allClients, err := scenario.ListTailscaleClients() @@ -735,7 +735,7 @@ func TestNodeOnlineLastSeenStatus(t *testing.T) { err = scenario.WaitForTailscaleSync() assertNoErrSync(t, err) - assertClientsState(t, allClients) + // assertClientsState(t, allClients) allAddrs := lo.Map(allIps, func(x netip.Addr, index int) string { return x.String() @@ -755,8 +755,6 @@ func TestNodeOnlineLastSeenStatus(t *testing.T) { headscale, err := scenario.Headscale() assertNoErr(t, err) - keepAliveInterval := 60 * time.Second - // Duration is chosen arbitrarily, 10m is reported in #1561 testDuration := 12 * time.Minute start := time.Now() @@ -780,11 +778,6 @@ func TestNodeOnlineLastSeenStatus(t *testing.T) { err = json.Unmarshal([]byte(result), &nodes) assertNoErr(t, err) - now := time.Now() - - // Threshold with some leeway - lastSeenThreshold := now.Add(-keepAliveInterval - (10 * time.Second)) - // Verify that headscale reports the nodes as online for _, node := range nodes { // All nodes should be online @@ -795,18 +788,6 @@ func TestNodeOnlineLastSeenStatus(t *testing.T) { node.GetName(), time.Since(start), ) - - lastSeen := node.GetLastSeen().AsTime() - // All nodes should have been last seen between now and the keepAliveInterval - assert.Truef( - t, - lastSeen.After(lastSeenThreshold), - "node (%s) lastSeen (%v) was not %s after the threshold (%v)", - node.GetName(), - lastSeen, - keepAliveInterval, - lastSeenThreshold, - ) } // Verify that all nodes report all nodes to be online @@ -834,15 +815,6 @@ func TestNodeOnlineLastSeenStatus(t *testing.T) { client.Hostname(), time.Since(start), ) - - // from docs: last seen to tailcontrol; only present if offline - // assert.Nilf( - // t, - // peerStatus.LastSeen, - // "expected node %s to not have LastSeen set, got %s", - // peerStatus.HostName, - // peerStatus.LastSeen, - // ) } } @@ -850,3 +822,87 @@ func TestNodeOnlineLastSeenStatus(t *testing.T) { time.Sleep(time.Second) } } + +// TestPingAllByIPManyUpDown is a variant of the PingAll +// test which will take the tailscale node up and down +// five times ensuring they are able to restablish connectivity. +func TestPingAllByIPManyUpDown(t *testing.T) { + IntegrationSkip(t) + t.Parallel() + + scenario, err := NewScenario() + assertNoErr(t, err) + defer scenario.Shutdown() + + // TODO(kradalby): it does not look like the user thing works, only second + // get created? maybe only when many? + spec := map[string]int{ + "user1": len(MustTestVersions), + "user2": len(MustTestVersions), + } + + headscaleConfig := map[string]string{ + "HEADSCALE_DERP_URLS": "", + "HEADSCALE_DERP_SERVER_ENABLED": "true", + "HEADSCALE_DERP_SERVER_REGION_ID": "999", + "HEADSCALE_DERP_SERVER_REGION_CODE": "headscale", + "HEADSCALE_DERP_SERVER_REGION_NAME": "Headscale Embedded DERP", + "HEADSCALE_DERP_SERVER_STUN_LISTEN_ADDR": "0.0.0.0:3478", + "HEADSCALE_DERP_SERVER_PRIVATE_KEY_PATH": "/tmp/derp.key", + + // Envknob for enabling DERP debug logs + "DERP_DEBUG_LOGS": "true", + "DERP_PROBER_DEBUG_LOGS": "true", + } + + err = scenario.CreateHeadscaleEnv(spec, + []tsic.Option{}, + hsic.WithTestName("pingallbyip"), + hsic.WithConfigEnv(headscaleConfig), + hsic.WithTLS(), + hsic.WithHostnameAsServerURL(), + ) + assertNoErrHeadscaleEnv(t, err) + + allClients, err := scenario.ListTailscaleClients() + assertNoErrListClients(t, err) + + allIps, err := scenario.ListTailscaleClientsIPs() + assertNoErrListClientIPs(t, err) + + err = scenario.WaitForTailscaleSync() + assertNoErrSync(t, err) + + // assertClientsState(t, allClients) + + allAddrs := lo.Map(allIps, func(x netip.Addr, index int) string { + return x.String() + }) + + success := pingAllHelper(t, allClients, allAddrs) + t.Logf("%d successful pings out of %d", success, len(allClients)*len(allIps)) + + for run := range 3 { + t.Logf("Starting DownUpPing run %d", run+1) + + for _, client := range allClients { + t.Logf("taking down %q", client.Hostname()) + client.Down() + } + + time.Sleep(5 * time.Second) + + for _, client := range allClients { + t.Logf("bringing up %q", client.Hostname()) + client.Up() + } + + time.Sleep(5 * time.Second) + + err = scenario.WaitForTailscaleSync() + assertNoErrSync(t, err) + + success := pingAllHelper(t, allClients, allAddrs) + t.Logf("%d successful pings out of %d", success, len(allClients)*len(allIps)) + } +} diff --git a/integration/route_test.go b/integration/route_test.go index 75296fd..d185acf 100644 --- a/integration/route_test.go +++ b/integration/route_test.go @@ -212,7 +212,11 @@ func TestEnablingRoutes(t *testing.T) { if route.GetId() == routeToBeDisabled.GetId() { assert.Equal(t, false, route.GetEnabled()) - assert.Equal(t, false, route.GetIsPrimary()) + + // since this is the only route of this cidr, + // it will not failover, and remain Primary + // until something can replace it. + assert.Equal(t, true, route.GetIsPrimary()) } else { assert.Equal(t, true, route.GetEnabled()) assert.Equal(t, true, route.GetIsPrimary()) @@ -291,6 +295,7 @@ func TestHASubnetRouterFailover(t *testing.T) { client := allClients[2] + t.Logf("Advertise route from r1 (%s) and r2 (%s), making it HA, n1 is primary", subRouter1.Hostname(), subRouter2.Hostname()) // advertise HA route on node 1 and 2 // ID 1 will be primary // ID 2 will be secondary @@ -384,12 +389,12 @@ func TestHASubnetRouterFailover(t *testing.T) { // Node 1 is primary assert.Equal(t, true, enablingRoutes[0].GetAdvertised()) assert.Equal(t, true, enablingRoutes[0].GetEnabled()) - assert.Equal(t, true, enablingRoutes[0].GetIsPrimary()) + assert.Equal(t, true, enablingRoutes[0].GetIsPrimary(), "both subnet routers are up, expected r1 to be primary") // Node 2 is not primary assert.Equal(t, true, enablingRoutes[1].GetAdvertised()) assert.Equal(t, true, enablingRoutes[1].GetEnabled()) - assert.Equal(t, false, enablingRoutes[1].GetIsPrimary()) + assert.Equal(t, false, enablingRoutes[1].GetIsPrimary(), "both subnet routers are up, expected r2 to be non-primary") // Verify that the client has routes from the primary machine srs1, err := subRouter1.Status() @@ -401,6 +406,9 @@ func TestHASubnetRouterFailover(t *testing.T) { srs1PeerStatus := clientStatus.Peer[srs1.Self.PublicKey] srs2PeerStatus := clientStatus.Peer[srs2.Self.PublicKey] + assert.True(t, srs1PeerStatus.Online, "r1 up, r2 up") + assert.True(t, srs2PeerStatus.Online, "r1 up, r2 up") + assertNotNil(t, srs1PeerStatus.PrimaryRoutes) assert.Nil(t, srs2PeerStatus.PrimaryRoutes) @@ -411,7 +419,8 @@ func TestHASubnetRouterFailover(t *testing.T) { ) // Take down the current primary - t.Logf("taking down subnet router 1 (%s)", subRouter1.Hostname()) + t.Logf("taking down subnet router r1 (%s)", subRouter1.Hostname()) + t.Logf("expecting r2 (%s) to take over as primary", subRouter2.Hostname()) err = subRouter1.Down() assertNoErr(t, err) @@ -435,15 +444,12 @@ func TestHASubnetRouterFailover(t *testing.T) { // Node 1 is not primary assert.Equal(t, true, routesAfterMove[0].GetAdvertised()) assert.Equal(t, true, routesAfterMove[0].GetEnabled()) - assert.Equal(t, false, routesAfterMove[0].GetIsPrimary()) + assert.Equal(t, false, routesAfterMove[0].GetIsPrimary(), "r1 is down, expected r2 to be primary") // Node 2 is primary assert.Equal(t, true, routesAfterMove[1].GetAdvertised()) assert.Equal(t, true, routesAfterMove[1].GetEnabled()) - assert.Equal(t, true, routesAfterMove[1].GetIsPrimary()) - - // TODO(kradalby): Check client status - // Route is expected to be on SR2 + assert.Equal(t, true, routesAfterMove[1].GetIsPrimary(), "r1 is down, expected r2 to be primary") srs2, err = subRouter2.Status() @@ -453,6 +459,9 @@ func TestHASubnetRouterFailover(t *testing.T) { srs1PeerStatus = clientStatus.Peer[srs1.Self.PublicKey] srs2PeerStatus = clientStatus.Peer[srs2.Self.PublicKey] + assert.False(t, srs1PeerStatus.Online, "r1 down, r2 down") + assert.True(t, srs2PeerStatus.Online, "r1 down, r2 up") + assert.Nil(t, srs1PeerStatus.PrimaryRoutes) assertNotNil(t, srs2PeerStatus.PrimaryRoutes) @@ -465,7 +474,8 @@ func TestHASubnetRouterFailover(t *testing.T) { } // Take down subnet router 2, leaving none available - t.Logf("taking down subnet router 2 (%s)", subRouter2.Hostname()) + t.Logf("taking down subnet router r2 (%s)", subRouter2.Hostname()) + t.Logf("expecting r2 (%s) to remain primary, no other available", subRouter2.Hostname()) err = subRouter2.Down() assertNoErr(t, err) @@ -489,14 +499,14 @@ func TestHASubnetRouterFailover(t *testing.T) { // Node 1 is not primary assert.Equal(t, true, routesAfterBothDown[0].GetAdvertised()) assert.Equal(t, true, routesAfterBothDown[0].GetEnabled()) - assert.Equal(t, false, routesAfterBothDown[0].GetIsPrimary()) + assert.Equal(t, false, routesAfterBothDown[0].GetIsPrimary(), "r1 and r2 is down, expected r2 to _still_ be primary") // Node 2 is primary // if the node goes down, but no other suitable route is // available, keep the last known good route. assert.Equal(t, true, routesAfterBothDown[1].GetAdvertised()) assert.Equal(t, true, routesAfterBothDown[1].GetEnabled()) - assert.Equal(t, true, routesAfterBothDown[1].GetIsPrimary()) + assert.Equal(t, true, routesAfterBothDown[1].GetIsPrimary(), "r1 and r2 is down, expected r2 to _still_ be primary") // TODO(kradalby): Check client status // Both are expected to be down @@ -508,6 +518,9 @@ func TestHASubnetRouterFailover(t *testing.T) { srs1PeerStatus = clientStatus.Peer[srs1.Self.PublicKey] srs2PeerStatus = clientStatus.Peer[srs2.Self.PublicKey] + assert.False(t, srs1PeerStatus.Online, "r1 down, r2 down") + assert.False(t, srs2PeerStatus.Online, "r1 down, r2 down") + assert.Nil(t, srs1PeerStatus.PrimaryRoutes) assertNotNil(t, srs2PeerStatus.PrimaryRoutes) @@ -520,7 +533,8 @@ func TestHASubnetRouterFailover(t *testing.T) { } // Bring up subnet router 1, making the route available from there. - t.Logf("bringing up subnet router 1 (%s)", subRouter1.Hostname()) + t.Logf("bringing up subnet router r1 (%s)", subRouter1.Hostname()) + t.Logf("expecting r1 (%s) to take over as primary (only one online)", subRouter1.Hostname()) err = subRouter1.Up() assertNoErr(t, err) @@ -544,12 +558,12 @@ func TestHASubnetRouterFailover(t *testing.T) { // Node 1 is primary assert.Equal(t, true, routesAfter1Up[0].GetAdvertised()) assert.Equal(t, true, routesAfter1Up[0].GetEnabled()) - assert.Equal(t, true, routesAfter1Up[0].GetIsPrimary()) + assert.Equal(t, true, routesAfter1Up[0].GetIsPrimary(), "r1 is back up, expected r1 to become be primary") // Node 2 is not primary assert.Equal(t, true, routesAfter1Up[1].GetAdvertised()) assert.Equal(t, true, routesAfter1Up[1].GetEnabled()) - assert.Equal(t, false, routesAfter1Up[1].GetIsPrimary()) + assert.Equal(t, false, routesAfter1Up[1].GetIsPrimary(), "r1 is back up, expected r1 to become be primary") // Verify that the route is announced from subnet router 1 clientStatus, err = client.Status() @@ -558,6 +572,9 @@ func TestHASubnetRouterFailover(t *testing.T) { srs1PeerStatus = clientStatus.Peer[srs1.Self.PublicKey] srs2PeerStatus = clientStatus.Peer[srs2.Self.PublicKey] + assert.True(t, srs1PeerStatus.Online, "r1 is back up, r2 down") + assert.False(t, srs2PeerStatus.Online, "r1 is back up, r2 down") + assert.NotNil(t, srs1PeerStatus.PrimaryRoutes) assert.Nil(t, srs2PeerStatus.PrimaryRoutes) @@ -570,7 +587,8 @@ func TestHASubnetRouterFailover(t *testing.T) { } // Bring up subnet router 2, should result in no change. - t.Logf("bringing up subnet router 2 (%s)", subRouter2.Hostname()) + t.Logf("bringing up subnet router r2 (%s)", subRouter2.Hostname()) + t.Logf("both online, expecting r1 (%s) to still be primary (no flapping)", subRouter1.Hostname()) err = subRouter2.Up() assertNoErr(t, err) @@ -594,12 +612,12 @@ func TestHASubnetRouterFailover(t *testing.T) { // Node 1 is not primary assert.Equal(t, true, routesAfter2Up[0].GetAdvertised()) assert.Equal(t, true, routesAfter2Up[0].GetEnabled()) - assert.Equal(t, true, routesAfter2Up[0].GetIsPrimary()) + assert.Equal(t, true, routesAfter2Up[0].GetIsPrimary(), "r1 and r2 is back up, expected r1 to _still_ be primary") // Node 2 is primary assert.Equal(t, true, routesAfter2Up[1].GetAdvertised()) assert.Equal(t, true, routesAfter2Up[1].GetEnabled()) - assert.Equal(t, false, routesAfter2Up[1].GetIsPrimary()) + assert.Equal(t, false, routesAfter2Up[1].GetIsPrimary(), "r1 and r2 is back up, expected r1 to _still_ be primary") // Verify that the route is announced from subnet router 1 clientStatus, err = client.Status() @@ -608,6 +626,9 @@ func TestHASubnetRouterFailover(t *testing.T) { srs1PeerStatus = clientStatus.Peer[srs1.Self.PublicKey] srs2PeerStatus = clientStatus.Peer[srs2.Self.PublicKey] + assert.True(t, srs1PeerStatus.Online, "r1 up, r2 up") + assert.True(t, srs2PeerStatus.Online, "r1 up, r2 up") + assert.NotNil(t, srs1PeerStatus.PrimaryRoutes) assert.Nil(t, srs2PeerStatus.PrimaryRoutes) @@ -620,7 +641,8 @@ func TestHASubnetRouterFailover(t *testing.T) { } // Disable the route of subnet router 1, making it failover to 2 - t.Logf("disabling route in subnet router 1 (%s)", subRouter1.Hostname()) + t.Logf("disabling route in subnet router r1 (%s)", subRouter1.Hostname()) + t.Logf("expecting route to failover to r2 (%s), which is still available", subRouter2.Hostname()) _, err = headscale.Execute( []string{ "headscale", @@ -648,7 +670,7 @@ func TestHASubnetRouterFailover(t *testing.T) { assertNoErr(t, err) assert.Len(t, routesAfterDisabling1, 2) - t.Logf("routes after disabling1 %#v", routesAfterDisabling1) + t.Logf("routes after disabling r1 %#v", routesAfterDisabling1) // Node 1 is not primary assert.Equal(t, true, routesAfterDisabling1[0].GetAdvertised()) @@ -680,6 +702,7 @@ func TestHASubnetRouterFailover(t *testing.T) { // enable the route of subnet router 1, no change expected t.Logf("enabling route in subnet router 1 (%s)", subRouter1.Hostname()) + t.Logf("both online, expecting r2 (%s) to still be primary (no flapping)", subRouter2.Hostname()) _, err = headscale.Execute( []string{ "headscale", @@ -736,7 +759,8 @@ func TestHASubnetRouterFailover(t *testing.T) { } // delete the route of subnet router 2, failover to one expected - t.Logf("deleting route in subnet router 2 (%s)", subRouter2.Hostname()) + t.Logf("deleting route in subnet router r2 (%s)", subRouter2.Hostname()) + t.Logf("expecting route to failover to r1 (%s)", subRouter1.Hostname()) _, err = headscale.Execute( []string{ "headscale", @@ -764,7 +788,7 @@ func TestHASubnetRouterFailover(t *testing.T) { assertNoErr(t, err) assert.Len(t, routesAfterDeleting2, 1) - t.Logf("routes after deleting2 %#v", routesAfterDeleting2) + t.Logf("routes after deleting r2 %#v", routesAfterDeleting2) // Node 1 is primary assert.Equal(t, true, routesAfterDeleting2[0].GetAdvertised()) diff --git a/integration/scenario.go b/integration/scenario.go index a2c63e6..ebd12bc 100644 --- a/integration/scenario.go +++ b/integration/scenario.go @@ -50,6 +50,8 @@ var ( tailscaleVersions2021 = map[string]bool{ "head": true, "unstable": true, + "1.60": true, // CapVer: 82 + "1.58": true, // CapVer: 82 "1.56": true, // CapVer: 82 "1.54": true, // CapVer: 79 "1.52": true, // CapVer: 79 diff --git a/integration/tailscale.go b/integration/tailscale.go index 9d6796b..6bcf607 100644 --- a/integration/tailscale.go +++ b/integration/tailscale.go @@ -27,7 +27,7 @@ type TailscaleClient interface { Down() error IPs() ([]netip.Addr, error) FQDN() (string, error) - Status() (*ipnstate.Status, error) + Status(...bool) (*ipnstate.Status, error) Netmap() (*netmap.NetworkMap, error) Netcheck() (*netcheck.Report, error) WaitForNeedsLogin() error diff --git a/integration/tsic/tsic.go b/integration/tsic/tsic.go index 320ae0d..6ae0226 100644 --- a/integration/tsic/tsic.go +++ b/integration/tsic/tsic.go @@ -9,6 +9,7 @@ import ( "log" "net/netip" "net/url" + "os" "strconv" "strings" "time" @@ -503,7 +504,7 @@ func (t *TailscaleInContainer) IPs() ([]netip.Addr, error) { } // Status returns the ipnstate.Status of the Tailscale instance. -func (t *TailscaleInContainer) Status() (*ipnstate.Status, error) { +func (t *TailscaleInContainer) Status(save ...bool) (*ipnstate.Status, error) { command := []string{ "tailscale", "status", @@ -521,60 +522,70 @@ func (t *TailscaleInContainer) Status() (*ipnstate.Status, error) { return nil, fmt.Errorf("failed to unmarshal tailscale status: %w", err) } + err = os.WriteFile(fmt.Sprintf("/tmp/control/%s_status.json", t.hostname), []byte(result), 0o755) + if err != nil { + return nil, fmt.Errorf("status netmap to /tmp/control: %w", err) + } + return &status, err } // Netmap returns the current Netmap (netmap.NetworkMap) of the Tailscale instance. // Only works with Tailscale 1.56 and newer. // Panics if version is lower then minimum. -// func (t *TailscaleInContainer) Netmap() (*netmap.NetworkMap, error) { -// if !util.TailscaleVersionNewerOrEqual("1.56", t.version) { -// panic(fmt.Sprintf("tsic.Netmap() called with unsupported version: %s", t.version)) -// } +func (t *TailscaleInContainer) Netmap() (*netmap.NetworkMap, error) { + if !util.TailscaleVersionNewerOrEqual("1.56", t.version) { + panic(fmt.Sprintf("tsic.Netmap() called with unsupported version: %s", t.version)) + } -// command := []string{ -// "tailscale", -// "debug", -// "netmap", -// } + command := []string{ + "tailscale", + "debug", + "netmap", + } -// result, stderr, err := t.Execute(command) -// if err != nil { -// fmt.Printf("stderr: %s\n", stderr) -// return nil, fmt.Errorf("failed to execute tailscale debug netmap command: %w", err) -// } + result, stderr, err := t.Execute(command) + if err != nil { + fmt.Printf("stderr: %s\n", stderr) + return nil, fmt.Errorf("failed to execute tailscale debug netmap command: %w", err) + } -// var nm netmap.NetworkMap -// err = json.Unmarshal([]byte(result), &nm) -// if err != nil { -// return nil, fmt.Errorf("failed to unmarshal tailscale netmap: %w", err) -// } + var nm netmap.NetworkMap + err = json.Unmarshal([]byte(result), &nm) + if err != nil { + return nil, fmt.Errorf("failed to unmarshal tailscale netmap: %w", err) + } -// return &nm, err -// } + err = os.WriteFile(fmt.Sprintf("/tmp/control/%s_netmap.json", t.hostname), []byte(result), 0o755) + if err != nil { + return nil, fmt.Errorf("saving netmap to /tmp/control: %w", err) + } + + return &nm, err +} // Netmap returns the current Netmap (netmap.NetworkMap) of the Tailscale instance. // This implementation is based on getting the netmap from `tailscale debug watch-ipn` // as there seem to be some weirdness omitting endpoint and DERP info if we use // Patch updates. // This implementation works on all supported versions. -func (t *TailscaleInContainer) Netmap() (*netmap.NetworkMap, error) { - // watch-ipn will only give an update if something is happening, - // since we send keep alives, the worst case for this should be - // 1 minute, but set a slightly more conservative time. - ctx, _ := context.WithTimeout(context.Background(), 3*time.Minute) +// func (t *TailscaleInContainer) Netmap() (*netmap.NetworkMap, error) { +// // watch-ipn will only give an update if something is happening, +// // since we send keep alives, the worst case for this should be +// // 1 minute, but set a slightly more conservative time. +// ctx, _ := context.WithTimeout(context.Background(), 3*time.Minute) - notify, err := t.watchIPN(ctx) - if err != nil { - return nil, err - } +// notify, err := t.watchIPN(ctx) +// if err != nil { +// return nil, err +// } - if notify.NetMap == nil { - return nil, fmt.Errorf("no netmap present in ipn.Notify") - } +// if notify.NetMap == nil { +// return nil, fmt.Errorf("no netmap present in ipn.Notify") +// } - return notify.NetMap, nil -} +// return notify.NetMap, nil +// } // watchIPN watches `tailscale debug watch-ipn` for a ipn.Notify object until // it gets one that has a netmap.NetworkMap. diff --git a/integration/utils.go b/integration/utils.go index b9e25be..1e2cfd2 100644 --- a/integration/utils.go +++ b/integration/utils.go @@ -7,6 +7,7 @@ import ( "testing" "time" + "github.com/juanfont/headscale/hscontrol/util" "github.com/juanfont/headscale/integration/tsic" "github.com/stretchr/testify/assert" ) @@ -154,11 +155,11 @@ func assertClientsState(t *testing.T, clients []TailscaleClient) { func assertValidNetmap(t *testing.T, client TailscaleClient) { t.Helper() - // if !util.TailscaleVersionNewerOrEqual("1.56", client.Version()) { - // t.Logf("%q has version %q, skipping netmap check...", client.Hostname(), client.Version()) + if !util.TailscaleVersionNewerOrEqual("1.56", client.Version()) { + t.Logf("%q has version %q, skipping netmap check...", client.Hostname(), client.Version()) - // return - // } + return + } t.Logf("Checking netmap of %q", client.Hostname()) @@ -175,7 +176,11 @@ func assertValidNetmap(t *testing.T, client TailscaleClient) { assert.NotEmptyf(t, netmap.SelfNode.AllowedIPs(), "%q does not have any allowed IPs", client.Hostname()) assert.NotEmptyf(t, netmap.SelfNode.Addresses(), "%q does not have any addresses", client.Hostname()) - assert.Truef(t, *netmap.SelfNode.Online(), "%q is not online", client.Hostname()) + if netmap.SelfNode.Online() != nil { + assert.Truef(t, *netmap.SelfNode.Online(), "%q is not online", client.Hostname()) + } else { + t.Errorf("Online should not be nil for %s", client.Hostname()) + } assert.Falsef(t, netmap.SelfNode.Key().IsZero(), "%q does not have a valid NodeKey", client.Hostname()) assert.Falsef(t, netmap.SelfNode.Machine().IsZero(), "%q does not have a valid MachineKey", client.Hostname()) @@ -213,7 +218,7 @@ func assertValidNetmap(t *testing.T, client TailscaleClient) { // This test is not suitable for ACL/partial connection tests. func assertValidStatus(t *testing.T, client TailscaleClient) { t.Helper() - status, err := client.Status() + status, err := client.Status(true) if err != nil { t.Fatalf("getting status for %q: %s", client.Hostname(), err) }