Compare commits

...

1 Commits

Author SHA1 Message Date
Irbe Krumina
7c13973bc9 cmd/containerboot: fix healthcheck
The current container healthcheck is not able to catch cases where a previously healthy
node gets disconnected from control (because we rely on updates received over the notify channel
to update health status and there are no notifications on 'Online' status changes).
This change makes the healthcheck endpoint call the LocalAPI /status endpoint instead.

Updates tailscale/tailscale#13620

Signed-off-by: Irbe Krumina <irbe@tailscale.com>
2024-11-26 14:19:34 +00:00
2 changed files with 40 additions and 22 deletions

View File

@@ -6,34 +6,45 @@
package main
import (
"context"
"fmt"
"log"
"net"
"net/http"
"sync"
"time"
"tailscale.com/client/tailscale"
"tailscale.com/ipn/ipnstate"
)
// healthz is a simple health check server. If enabled, it returns 200 OK if
// this tailscale device can be considered healthy (running, connected to the
// control plane, has at least one tailnet IP address); otherwise it returns 503.
//
// NOTE(review): this listing comes from a diff view that shows removed and
// added lines together without +/- markers. The embedded mutex and hasAddrs
// look like the pre-change fields and lc like the post-change one; confirm
// against the real file which of them remain.
type healthz struct {
// Mutex is taken around reads and writes of hasAddrs.
sync.Mutex
// hasAddrs records whether the node had any addresses the last time the
// run loop updated it.
hasAddrs bool
// lc is the local client that ServeHTTP uses to fetch the node's status.
lc *tailscale.LocalClient
}
// ServeHTTP answers a health probe. It asks the local client for the node's
// status (without peers) and writes "ok" only when the backend state is
// "Running", the node is online and it has at least one address. A failed
// status lookup or a not-ready node is answered with an error response.
//
// NOTE(review): this listing comes from a diff view that shows removed and
// added lines together without +/- markers. The Lock/Unlock calls, the
// hasAddrs check and the first http.Error in the else branch look like the
// pre-change code; confirm against the real file before relying on them.
func (h *healthz) ServeHTTP(w http.ResponseWriter, r *http.Request) {
h.Lock()
defer h.Unlock()
if h.hasAddrs {
// Most health checks will have their own timeout, but a local client call should not take more than 5s.
// The timeout is derived from the request context, so a cancelled probe also cancels the status call.
ctx, cancel := context.WithTimeout(r.Context(), time.Second*5)
defer cancel()
st, err := h.lc.StatusWithoutPeers(ctx)
if err != nil {
// Being unable to obtain the status is reported as 503 with the error text in the body.
http.Error(w, fmt.Sprintf("unable to check status of the tailscale device: %v", err), http.StatusServiceUnavailable)
return
}
online := isOnline(st)
addrs := getAddrs(st)
// NOTE(review): st.BackendState is read without the nil check that isOnline
// and getAddrs apply to st; confirm StatusWithoutPeers never returns a nil
// status together with a nil error.
if st.BackendState == "Running" && online && len(addrs) != 0 {
w.Write([]byte("ok"))
} else {
http.Error(w, "node currently has no tailscale IPs", http.StatusInternalServerError)
// Log the individual readiness inputs so the reason for the failed probe is visible.
log.Printf("healthz: tailscale device is not ready, state: %q, online: %t, addrs: %v", st.BackendState, online, addrs)
http.Error(w, "tailscale device is not ready", http.StatusServiceUnavailable)
}
}
// runHealthz runs a simple HTTP health endpoint on /healthz, listening on the
// provided address. A containerized tailscale instance is considered healthy if
// it has at least one tailnet IP address.
func runHealthz(addr string, h *healthz) {
// provided address.
func (h *healthz) run(addr string) {
lis, err := net.Listen("tcp", addr)
if err != nil {
log.Fatalf("error listening on the provided health endpoint address %q: %v", addr, err)
@@ -49,3 +60,14 @@ func runHealthz(addr string, h *healthz) {
}
}()
}
// isOnline reports whether st describes a node whose Self entry is marked
// Online. A nil status, or a status without a Self entry, counts as offline.
func isOnline(st *ipnstate.Status) bool {
	if st == nil || st.Self == nil {
		return false
	}
	return st.Self.Online
}
// getAddrs returns the Addrs recorded on the Self entry of st, or nil when
// either st or st.Self is nil.
//
// NOTE(review): the health check treats a non-empty result as "has tailnet
// IPs"; confirm that Self.Addrs is the field that carries those addresses.
func getAddrs(st *ipnstate.Status) (addrs []string) {
	if st != nil && st.Self != nil {
		addrs = st.Self.Addrs
	}
	return addrs
}

View File

@@ -328,10 +328,12 @@ authLoop:
certDomain = new(atomic.Pointer[string])
certDomainChanged = make(chan bool, 1)
h = &healthz{} // http server for the healthz endpoint
healthzRunner = sync.OnceFunc(func() { runHealthz(cfg.HealthCheckAddrPort, h) })
)
if cfg.HealthCheckAddrPort != "" {
h := &healthz{lc: client}
h.run(cfg.HealthCheckAddrPort)
}
if cfg.ServeConfigPath != "" {
go watchServeConfigChanges(ctx, cfg.ServeConfigPath, certDomainChanged, certDomain, client)
}
@@ -556,12 +558,6 @@ runLoop:
}
}
if cfg.HealthCheckAddrPort != "" {
h.Lock()
h.hasAddrs = len(addrs) != 0
h.Unlock()
healthzRunner()
}
if egressSvcsNotify != nil {
egressSvcsNotify <- n
}