fix runaway cpu bug, simplify setupAgent
Jes Olson j3s@c3f.net
Fri, 17 Feb 2023 21:48:33 -0800
5 files changed, 35 insertions(+), 29 deletions(-)
M agent.go → agent.go
@@ -8,17 +8,10 @@
"github.com/hashicorp/serf/serf" ) -// Agent starts and manages a Serf & adds -// service discovery +// Agent starts and manages a Serf & adds service discovery (TODO) type Agent struct { -// conf *Config serfConf *serf.Config eventCh chan serf.Event - - // i doubt we care about handling events, - // but i'm leaving this here just in case - // eventHandlerList []EventHandler - // eventHandlersLock sync.Mutex // This is the underlying Serf we are wrapping serf *serf.Serf@@ -53,7 +46,7 @@ case e := <-a.eventCh:
log.Printf("[INFO] cascade: Received event: %s", e.String()) case <-serfShutdownCh: - log.Printf("[WARN] cascade: Serf shutdown detected, quitting") + log.Printf("[INFO] cascade: serf shutdown detected, quitting eventloop") a.Shutdown() return
M main.go → main.go
@@ -66,8 +66,7 @@ }
// Bail fast if not doing a graceful leave if !graceful { - log.Printf("[WARN] cascade: non-graceful leave detected") - return nil + log.Fatal("[WARN] cascade: non-graceful leave detected") } // Attempt a graceful leave@@ -129,31 +128,31 @@ bindIP, bindPort, err := config.AddrParts(config.BindAddr)
if err != nil { log.Panic(err) } + serfConfig := serf.DefaultConfig() serfConfig.NodeName = config.NodeName serfConfig.ProtocolVersion = uint8(serf.ProtocolVersionMax) - serfConfig.CoalescePeriod = 3 * time.Second - serfConfig.QuiescentPeriod = time.Second - serfConfig.QueryResponseSizeLimit = 1024 - serfConfig.QuerySizeLimit = 1024 - serfConfig.UserEventSizeLimit = 512 - serfConfig.UserCoalescePeriod = 3 * time.Second - serfConfig.UserQuiescentPeriod = time.Second - // TODO: look at reconnect/tombstone settings w more scrutiny - serfConfig.ReconnectInterval = 0 - serfConfig.ReconnectTimeout = 0 - serfConfig.TombstoneTimeout = 0 - serfConfig.BroadcastTimeout = 0 - // TODO: what are the implications of true here o_O - serfConfig.EnableNameConflictResolution = true - // hardcode DefaultWANConfig because cascade is designed to be - // used as a single global system. + // TODO: how should cascade handle name conflicts? + // defaulting to just blowing up for now, but + // we _could_ take the tailscale route & append + // -1 or whatever to the node. that would be more user friendly. + // TODO: some of these serf settings were pulled + // from consul[1]. re-examine them eventually. + serfConfig.EnableNameConflictResolution = false + serfConfig.ReconnectTimeout = 3 * 24 * time.Hour + serfConfig.QueueDepthWarning = 1000000 + serfConfig.MinQueueDepth = 4096 + serfConfig.LeavePropagateDelay = 3 * time.Second + serfConfig.MemberlistConfig = memberlist.DefaultWANConfig() + serfConfig.MemberlistConfig.DeadNodeReclaimTime = 30 * time.Second serfConfig.MemberlistConfig.BindAddr = bindIP serfConfig.MemberlistConfig.BindPort = bindPort - serfConfig.MemberlistConfig.AdvertiseAddr = "" - serfConfig.MemberlistConfig.AdvertisePort = 0 agent := Create(serfConfig) return agent } + +// [1]: sources for consul serf tweaks +// https://github.com/hashicorp/consul/blob/v1.14.4/agent/consul/config.go +// https://github.com/hashicorp/consul/blob/v1.14.4/lib/serf/serf.go