/*
Copyright 2012 Google Inc.
Copyright 2019-2025 Vimeo Inc.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

// Package galaxycache provides a data loading mechanism with caching
// and de-duplication that works across a set of peer processes.
//
// Each data Get first consults its local cache, otherwise delegates
// to the requested key's canonical owner, which then checks its cache
// or finally gets the data.  In the common case, many concurrent
// cache misses across a set of peers for the same key result in just
// one cache fill.
//
// In most cases, one will construct a [Universe] with [NewUniverse], and then
// construct a [Galaxy] with [Universe.NewGalaxy].
//
// # Expiration/TTL
//
// [Galaxy] implementations support the concept of a value's expiration time.
// This may be set by providing a [BackendGetterWithInfo]
// implementation to [Universe.NewGalaxyWithBackendInfo] which returns a
// non-zero [BackendGetInfo].Expiration.
//
// Additionally, [Universe.NewGalaxy] and [Universe.NewGalaxyWithBackendInfo] may take
// [WithGetTTL] and [WithPeekTTL] as arguments to provide default expiration-times.
// [WithPeekTTL] only applies to values that are pulled from peers via
// [RemoteFetcher].Peek and [RemoteFetcherWithInfo].PeekWithInfo
// requests.
package galaxycache // import "github.com/vimeo/galaxycache"

import (
	"context"
	"errors"
	"fmt"
	"math/rand/v2"
	"strconv"
	"sync"
	"time"
	"unsafe"

	"github.com/vimeo/galaxycache/promoter"
	"github.com/vimeo/galaxycache/singleflight"

	"github.com/vimeo/go-clocks"

	"go.opencensus.io/stats"
	"go.opencensus.io/tag"
	"go.opencensus.io/trace"
)

// BackendGetInfo contains additional information from a
// [BackendGetterWithInfo] implementation. Currently, this is just an
// expiration, but it may expand in the future.
type BackendGetInfo struct {
	// Expiration is the timestamp at which this value should be considered
	// expired. The zero value means "no expiration".
	// Values should always be in the future according to the clock for this
	// universe/galaxy.
	Expiration time.Time
}

// A BackendGetter loads data for a key.
type BackendGetter interface {
	// Get populates dest with the value identified by key.
	//
	// The returned data must be unversioned. That is, key must
	// uniquely describe the loaded data, without an implicit
	// current time, and without relying on cache expiration
	// mechanisms.
	Get(ctx context.Context, key string, dest Codec) error
}

// legacyBackendGetterAdapter wraps a plain [BackendGetter] so that it can be
// stored in a field of type [BackendGetterWithInfo].
type legacyBackendGetterAdapter struct {
	be BackendGetter
}

// GetWithInfo populates dest by delegating to the wrapped getter's Get
// method. The legacy interface has no way to report extra information, so the
// returned [BackendGetInfo] is always the zero value (i.e. no expiration).
func (l legacyBackendGetterAdapter) GetWithInfo(ctx context.Context, key string, dest Codec) (BackendGetInfo, error) {
	var noInfo BackendGetInfo
	err := l.be.Get(ctx, key, dest)
	return noInfo, err
}

// BackendGetterWithInfo provides the GetWithInfo method for an enhanced
// BackendGetter that's capable of returning expiration information (and may
// provide other enhancements later).
type BackendGetterWithInfo interface {
	// GetWithInfo populates dest with the value identified by key.
	//
	// The returned data must be unversioned. That is, the key must
	// uniquely describe the loaded data. One may set Expiration on the
	// [BackendGetInfo] return value.
	GetWithInfo(ctx context.Context, key string, dest Codec) (BackendGetInfo, error)
}

// GetterFunc is a function type that satisfies [BackendGetter], letting a
// bare function be used wherever a BackendGetter is expected.
type GetterFunc func(ctx context.Context, key string, dest Codec) error

// Get implements [BackendGetter] by invoking f with the same arguments and
// returning its error unchanged.
func (f GetterFunc) Get(ctx context.Context, key string, dest Codec) error {
	err := f(ctx, key, dest)
	return err
}

// GetterFuncWithInfo is a function type that satisfies
// [BackendGetterWithInfo] (and, via its Get method, [BackendGetter]).
type GetterFuncWithInfo func(ctx context.Context, key string, dest Codec) (BackendGetInfo, error)

// Get implements [BackendGetter] by calling f and discarding the returned
// [BackendGetInfo]; only the error is reported.
func (f GetterFuncWithInfo) Get(ctx context.Context, key string, dest Codec) error {
	_, err := f.GetWithInfo(ctx, key, dest)
	return err
}

// GetWithInfo implements [BackendGetterWithInfo] by invoking f with the same
// arguments and returning its results unchanged.
func (f GetterFuncWithInfo) GetWithInfo(ctx context.Context, key string, dest Codec) (BackendGetInfo, error) {
	info, err := f(ctx, key, dest)
	return info, err
}

// universeOpts collects the optional settings that [UniverseOpt] values may
// set before a Universe is constructed.
type universeOpts struct {
	// hashOpts is forwarded to the peer picker's constructor; nil unless
	// set via WithHashOpts.
	hashOpts *HashOptions
	// recorder is the stats recorder stored on the Universe; nil unless set
	// via WithRecorder.
	recorder stats.Recorder
	// clock is the universe-level clock (inherited by galaxies by default).
	clock clocks.Clock
}

// UniverseOpt is a functional Universe option; it mutates the option set
// that [NewUniverse] builds before constructing the Universe.
type UniverseOpt func(*universeOpts)

// WithHashOpts returns a [UniverseOpt] that sets the HashOptions used when
// constructing the universe's peer picker.
func WithHashOpts(hashOpts *HashOptions) UniverseOpt {
	setHashOpts := func(o *universeOpts) {
		o.hashOpts = hashOpts
	}
	return setHashOpts
}

// WithRecorder returns a [UniverseOpt] that overrides the default
// stats.Recorder used for recording stats.
func WithRecorder(recorder stats.Recorder) UniverseOpt {
	setRecorder := func(o *universeOpts) {
		o.recorder = recorder
	}
	return setRecorder
}

// WithUniversalClock returns a [UniverseOpt] that specifies the clock to use
// at the universe level (and for galaxies to inherit by default).
func WithUniversalClock(clock clocks.Clock) UniverseOpt {
	setClock := func(o *universeOpts) {
		o.clock = clock
	}
	return setClock
}

// Universe defines the primary container for all galaxycache operations.
// It contains the galaxies and PeerPicker.
type Universe struct {
	// mu guards the galaxies map.
	mu         sync.RWMutex
	galaxies   map[string]*Galaxy // galaxies are indexed by their name
	peerPicker *PeerPicker        // shared with every galaxy created by this universe
	clock      clocks.Clock       // default clock inherited by new galaxies
	recorder   stats.Recorder     // optional stats recorder supplied via WithRecorder
}

// NewUniverse is the main constructor for the Universe object. It takes the
// FetchProtocol to use for talking to peers (e.g. GRPC or HTTP), the ID by
// which this instance identifies itself in the hash-ring, and any number of
// options.
//
// The clock defaults to clocks.DefaultClock() unless overridden with
// [WithUniversalClock]. The self ID is registered in the hash-ring before the
// Universe is returned.
func NewUniverse(protocol FetchProtocol, selfID string, opts ...UniverseOpt) *Universe {
	// Start from the defaults, then let the caller's options override them.
	cfg := universeOpts{clock: clocks.DefaultClock()}
	for i := range opts {
		opts[i](&cfg)
	}

	picker := newPeerPicker(protocol, cfg.clock, selfID, cfg.hashOpts)
	// Insert the Self-ID into the hash-ring
	picker.set(Peer{ID: selfID, URI: ""})

	return &Universe{
		galaxies:   make(map[string]*Galaxy),
		peerPicker: picker,
		recorder:   cfg.recorder,
		clock:      cfg.clock,
	}
}

// NewUniverseWithOpts constructs a Universe with a non-default hash function
// and number of replicas, as described by options.
//
// Deprecated: Please use `NewUniverse` with the `WithHashOpts` option instead.
func NewUniverseWithOpts(protocol FetchProtocol, selfID string, options *HashOptions) *Universe {
	hashOpt := WithHashOpts(options)
	return NewUniverse(protocol, selfID, hashOpt)
}

// NewGalaxy creates a coordinated galaxy-aware BackendGetter from a
// BackendGetter.
//
// The returned [Galaxy] tries (but does not guarantee) to run only one
// [BackendGetter.Get] call for a given key across an entire set of peer
// processes. Concurrent callers both in the local process and in
// other processes receive copies of the answer once the original Get
// completes.
//
// The galaxy name must be unique for each BackendGetter.
//
// NewGalaxy panics if getter is nil, and under the same conditions as
// [Universe.NewGalaxyWithBackendInfo] (invalid or duplicate name).
func (universe *Universe) NewGalaxy(name string, cacheBytes int64, getter BackendGetter, opts ...GalaxyOption) *Galaxy {
	// Wrapping a nil getter in the adapter yields a non-nil
	// BackendGetterWithInfo, which would slip past the nil check in
	// NewGalaxyWithBackendInfo and only fail (with a nil dereference) on the
	// first backend load. Reject it up front instead.
	if getter == nil {
		panic("nil Getter")
	}
	return universe.NewGalaxyWithBackendInfo(name, cacheBytes, legacyBackendGetterAdapter{be: getter}, opts...)
}

// NewGalaxyWithBackendInfo creates a coordinated galaxy-aware [BackendGetter] from a
// [BackendGetterWithInfo].
//
// The returned [Galaxy] tries (but does not guarantee) to run only one
// [BackendGetterWithInfo.GetWithInfo] call for a given key across an entire set of peer
// processes. Concurrent callers both in the local process and in
// other processes receive copies of the answer once the original Get
// completes.
//
// The galaxy name must be unique for each BackendGetter.
//
// NewGalaxyWithBackendInfo panics if getter is nil, if name is not a valid
// opencensus tag value, if a galaxy with the same name is already registered
// in this universe, or if the options leave the hot cache ratio at zero.
func (universe *Universe) NewGalaxyWithBackendInfo(name string, cacheBytes int64, getter BackendGetterWithInfo, opts ...GalaxyOption) *Galaxy {
	if getter == nil {
		panic("nil Getter")
	}
	if nameErr := isNameValid(name); nameErr != nil {
		panic(fmt.Errorf("invalid galaxy name: %s", nameErr))
	}

	universe.mu.Lock()
	defer universe.mu.Unlock()

	if _, dup := universe.galaxies[name]; dup {
		panic("duplicate registration of galaxy " + name)
	}

	// Defaults; any of these may be overridden by the supplied options.
	gOpts := galaxyOpts{
		promoter:          &promoter.DefaultPromoter{},
		hcRatio:           8, // default hotcache size is 1/8th of cacheBytes
		maxCandidates:     1024,
		clock:             universe.clock,
		resetIdleStatsAge: time.Minute,
	}
	for _, opt := range opts {
		opt.apply(&gOpts)
	}
	if gOpts.hcRatio == 0 {
		// Fail with a descriptive message rather than the runtime's bare
		// "integer divide by zero" when computing HCCapacity below.
		panic("hot cache ratio must be non-zero for galaxy " + name)
	}
	if gOpts.peekTTL.entryMaxTTL == 0 {
		// Default the peek TTL to the get TTL if the peek TTL is unset.
		// Most of the time you don't want the peek TTL to be set if
		// the get TTL isn't, but, if the content of a galaxy
		// changes/gets more info, it can be useful to bound how long
		// data can bounce around without being re-hydrated anew.
		gOpts.peekTTL = gOpts.getTTL
	}
	g := &Galaxy{
		name:              name,
		parent:            universe,
		getter:            getter,
		peerPicker:        universe.peerPicker,
		cacheBytes:        cacheBytes,
		mainCache:         newCache(MainCache),
		hotCache:          newCache(HotCache),
		candidateCache:    newCandidateCache(gOpts.maxCandidates),
		baseTime:          gOpts.clock.Now(),
		resetIdleStatsAge: gOpts.resetIdleStatsAge,
		clock:             gOpts.clock,
		hcStatsWithTime: HCStatsWithTime{
			hcs: &promoter.HCStats{
				HCCapacity: cacheBytes / gOpts.hcRatio,
			}},
		loadGroup: &singleflight.Group{},
		opts:      gOpts,
	}
	// Entries evicted from the hot cache are remembered as promotion
	// candidates; main-cache evictions are simply dropped.
	g.mainCache.setLRUOnEvicted(nil)
	g.hotCache.setLRUOnEvicted(g.candidateCache.addToCandidateCache)
	// Both LRUs share the galaxy's clock.
	g.mainCache.lru.Clock = g.clock
	g.hotCache.lru.Clock = g.clock

	universe.galaxies[name] = g
	return g
}

// isNameValid reports (as a non-nil error) whether name cannot be used as a
// galaxy name. A galaxy name must be a valid opencensus tag value, so this
// attempts to build a tag map containing it and returns any resulting error.
func isNameValid(name string) error {
	mutator := tag.Insert(GalaxyKey, name)
	if _, err := tag.New(context.Background(), mutator); err != nil {
		return err
	}
	return nil
}

// GetGalaxy looks up a galaxy by the name it was registered under when it
// was created by this universe. It returns nil if there's no such galaxy.
func (universe *Universe) GetGalaxy(name string) *Galaxy {
	universe.mu.RLock()
	found := universe.galaxies[name]
	universe.mu.RUnlock()
	return found
}

// Set updates the Universe's list of peers (contained in the PeerPicker).
// Each peer URL should be a valid base URL, for example "example.net:8000".
// This is a compatibility wrapper around SetPeers which sets the ID and URI
// equal.
func (universe *Universe) Set(peerURLs ...string) error {
	picker := universe.peerPicker
	return picker.setURLs(peerURLs...)
}

// SetPeers updates the Universe's list of peers (contained in the PeerPicker).
// Each Peer's URI value should be a valid base URL (for example
// "example.net:8000"), while the ID may be anything that's unique.
// If AddPeer, Set and SetPeers are mixed, the ID and URI fields must match.
func (universe *Universe) SetPeers(peers ...Peer) error {
	picker := universe.peerPicker
	return picker.set(peers...)
}

// AddPeer updates the Universe's list of peers to include the passed peer
// (contained in the PeerPicker).
// The Peer's URI value should be a valid base URL as understood by the
// RemoteFetcher implementation, while the ID may be anything that's unique,
// for example "example.net:8000". (However, in k8s, it's recommended to use a
// pod name, possibly with some qualification.)
// If Set, SetPeers and AddPeer calls are mixed, the ID and URI fields must match.
func (universe *Universe) AddPeer(peer Peer) error {
	picker := universe.peerPicker
	return picker.add(peer)
}

// SetIncludeSelf sets whether the universe's "self ID" is included in the
// PeerPicker's hash-ring.
func (universe *Universe) SetIncludeSelf(incSelf bool) {
	picker := universe.peerPicker
	picker.setIncludeSelf(incSelf)
}

// IncludeSelf reports whether the "self ID" for the universe is currently
// included in the PeerPicker's hash-ring.
// This is generally not useful outside of tests that need to verify whether
// events are being handled correctly.
func (universe *Universe) IncludeSelf() bool {
	included := universe.peerPicker.includeSelfVal()
	return included
}

// RemovePeers updates the Universe's list of peers (contained in the
// PeerPicker) to remove the peers with the passed IDs.
// The arguments should match the ID field used in SetPeers and AddPeer calls,
// and the URLs passed to Set.
// Unrecognized IDs are ignored.
func (universe *Universe) RemovePeers(ids ...string) error {
	picker := universe.peerPicker
	return picker.remove(ids...)
}

// ListPeers returns the peer picker's remote fetchers keyed by Peer ID.
// It is useful for testing incremental changes to galaxycache peers.
func (universe *Universe) ListPeers() map[string]RemoteFetcher {
	fetchers := universe.peerPicker.listPeers()
	return fetchers
}

// Shutdown closes all open fetcher connections by shutting down the
// universe's peer picker, returning any error it reports.
func (universe *Universe) Shutdown() error {
	err := universe.peerPicker.shutdown()
	return err
}

// SelfID returns the ID this universe was constructed with, which is the
// identity it uses for itself in the hash-ring.
func (universe *Universe) SelfID() string {
	picker := universe.peerPicker
	return picker.selfID
}

// HCStatsWithTime includes a time stamp along with the hotcache stats
// to ensure updates happen no more than once per second.
type HCStatsWithTime struct {
	// hcs holds the hotcache stats; its HCCapacity is initialized at galaxy
	// construction.
	hcs *promoter.HCStats
	// t is the time stamp associated with hcs.
	// NOTE(review): presumably the time of the last stats refresh — confirm
	// against the code that updates it.
	t time.Time
}

// A Galaxy is a cache namespace and associated data spread over
// a group of 1 or more machines.
type Galaxy struct {
	// name is the unique (per-universe) name this galaxy was registered under.
	name string
	// getter loads values from the backend on a cache miss.
	getter BackendGetterWithInfo
	// peerPicker is the parent universe's peer picker.
	peerPicker *PeerPicker
	// NOTE(review): what mu guards is not visible in this part of the file —
	// confirm before relying on it.
	mu         sync.Mutex
	cacheBytes int64 // limit for sum of mainCache and hotCache size

	// mainCache is a cache of the keys for which this process
	// (amongst its peers) is authoritative. That is, this cache
	// contains keys which consistent hash on to this process's
	// peer number.
	mainCache cache

	// hotCache contains keys/values for which this peer is not
	// authoritative (otherwise they would be in mainCache), but
	// are popular enough to warrant mirroring in this process to
	// avoid going over the network to fetch from a peer.  Having
	// a hotCache avoids network hotspotting, where a peer's
	// network card could become the bottleneck on a popular key.
	// This cache is used sparingly to maximize the total number
	// of key/value pairs that can be stored globally.
	hotCache cache

	// candidateCache receives entries evicted from hotCache (it is wired up
	// as the hot cache's eviction callback at construction).
	candidateCache candidateCache

	// hcStatsWithTime holds the hotcache stats (including its capacity) and
	// an associated time stamp.
	hcStatsWithTime HCStatsWithTime

	// loadGroup ensures that each key is only fetched once
	// (either locally or remotely), regardless of the number of
	// concurrent callers.
	loadGroup flightGroup

	// opts is the fully-resolved set of galaxy options (defaults plus
	// caller-supplied overrides).
	opts galaxyOpts

	// baseTime is the galaxy's creation time according to its clock; now()
	// reports time as an offset from it.
	baseTime time.Time

	// Time that must elapse without any touches to a key before we clear
	// its stats with the next touch.
	// This protects intermittently hot keys from having very low qps
	// calculations during a traffic burst.
	resetIdleStatsAge time.Duration

	// clock is the time source for this galaxy (the universe's clock unless
	// overridden with WithClock).
	clock clocks.Clock

	_ int32 // force Stats to be 8-byte aligned on 32-bit platforms

	// Stats are statistics on the galaxy.
	Stats GalaxyStats

	// pointer to the parent universe that created this galaxy
	parent *Universe
}

// now returns how much time has elapsed on the galaxy's clock since baseTime
// (the moment the galaxy was created).
func (g *Galaxy) now() time.Duration {
	current := g.clock.Now()
	return current.Sub(g.baseTime)
}

// PeekPeerCfg provides tuning parameters for Peeking behavior, when configured.
type PeekPeerCfg struct {
	// PeekTimeout is the timeout to use when making Peek requests for this Galaxy.
	// This may be in the 2-10ms range for local networks, as the remote
	// end should always service this request from memory.
	PeekTimeout time.Duration `dialsdesc:"timeout to set on Peek requests to the peer that would own a specific key if that process wasn't in the hash-ring (should be short: 2-10ms because it's a memory-only operation, and if it fails the Galaxy will fall back to calling the BackendGetter)"`
	// WarmTime indicates how long after this galaxy initializes to stop
	// making Peek requests for a range after it took over that range.
	// This should be on par with the cache-warming time for initial startup.
	// Ranges transferred to this instance from peers that scale-down will
	// send Peek requests to those dying peers until it starts erroring.
	WarmTime time.Duration `dialsdesc:"time after Universe creation at which one should consider the cache warm and to stop making Peek requests to peers"`

	// PeekedValueMaxTTL and PeekedValueTTLJitter allow one to specify a maximum Time To Live (TTL) for
	// the values in this Galaxy pulled from peers via Peek requests.
	//
	// Jitter may be 0 to always set the expiration to be exactly maxTTL time in
	// the future.
	//
	// If a value has an Expiration time closer than maxTTL in the
	// future it will be left alone no matter the source. Conversely, if there is
	// no Expiration set, or it's farther in the future than maxTTL, a new one will
	// be set based on the value of maxTTL and jitter.
	//
	// Setting a non-zero Jitter will randomly pick an expiry between maxTTL-Jitter
	// and maxTTL in the future. When set appropriately, this can be leveraged to
	// prevent values populated at about the same time from expiring
	// simultaneously and causing a burst in activity while rehydrating values.
	// When used, Jitter values should be large enough that, over a reasonable
	// number of maxTTL intervals, keys that are continually accessed will
	// spread their expiration across the entire interval.
	//
	// Negative TTLs and jitter values are silently ignored, and jitter values that
	// are greater than maxTTL will be capped at maxTTL.
	PeekedValueMaxTTL    time.Duration `dialsdesc:"max time in the future to allow a value pulled in via a Peek request have their expiration"`
	PeekedValueTTLJitter time.Duration `dialsdesc:"max interval by to reduce the TTL by from PeekedValueMaxTTL"`
}

// Verify implements the [github.com/vimeo/dials.VerifiedConfig] interface.
//
// It returns an error if PeekTimeout is shorter than 100μs, or if any of
// WarmTime, PeekedValueMaxTTL or PeekedValueTTLJitter is negative; otherwise
// it returns nil.
func (p *PeekPeerCfg) Verify() error {
	// The bound is inclusive: exactly 100μs is accepted, so the message
	// must say ">=" rather than ">".
	if p.PeekTimeout < time.Microsecond*100 {
		return fmt.Errorf("PeekTimeout must be >= 100μs; got %s", p.PeekTimeout)
	}
	if p.WarmTime < 0 {
		return fmt.Errorf("WarmTime must be non-negative; got %s", p.WarmTime)
	}

	if p.PeekedValueMaxTTL < 0 {
		return fmt.Errorf("PeekedValueMaxTTL should either be zero to disable, or positive; got %s", p.PeekedValueMaxTTL)
	}
	if p.PeekedValueTTLJitter < 0 {
		return fmt.Errorf("PeekedValueTTLJitter should either be zero to disable jittering, or positive; got %s", p.PeekedValueTTLJitter)
	}

	return nil
}

// GalaxyOption is an interface for implementing functional galaxy options.
// Options are applied, in order, on top of the defaults when a Galaxy is
// constructed.
type GalaxyOption interface {
	// apply mutates the passed option set.
	apply(*galaxyOpts)
}

// galaxyOpts contains optional fields for the galaxy (each with a default
// value if not set)
type galaxyOpts struct {
	// promoter defaults to &promoter.DefaultPromoter{}.
	promoter promoter.Interface
	// hcRatio is the divisor applied to cacheBytes to get the hot cache
	// capacity; defaults to 8.
	hcRatio int64
	// maxCandidates sizes the candidate cache; defaults to 1024.
	maxCandidates int
	// clock defaults to the parent universe's clock.
	clock clocks.Clock
	// resetIdleStatsAge defaults to one minute.
	resetIdleStatsAge time.Duration
	// parameters for capping the TTL on gets and peeks
	getTTL ttlJitter
	// peeks may be set separately to trigger delayed reloading around version
	// upgrades (to prevent old versions of cache-values from persisting
	// indefinitely -- may be O(days) with substantial jitter)
	//
	// Defaults to matching getTTL
	peekTTL ttlJitter

	// peekPeer is nil unless peer-peeking was enabled with
	// WithPreviousPeerPeeking.
	peekPeer *PeekPeerCfg
}

// funcGalaxyOption adapts a plain function to the GalaxyOption interface.
type funcGalaxyOption func(*galaxyOpts)

// apply implements GalaxyOption by invoking the wrapped function on g.
func (f funcGalaxyOption) apply(g *galaxyOpts) { f(g) }

// newFuncGalaxyOption wraps f as a funcGalaxyOption.
func newFuncGalaxyOption(f func(*galaxyOpts)) funcGalaxyOption {
	var opt funcGalaxyOption = f
	return opt
}

// WithPromoter lets the client choose the promoter used by the galaxy.
// When this option is not supplied, the galaxy uses
// promoter.DefaultPromoter (a simple QPS comparison).
func WithPromoter(p promoter.Interface) GalaxyOption {
	return funcGalaxyOption(func(o *galaxyOpts) {
		o.promoter = p
	})
}

// WithHotCacheRatio lets the client size the hot cache relative to the
// galaxy's total cache budget: the hot cache capacity is cacheBytes divided
// by r. The default is 8 (i.e. the hot cache gets 1/8th of cacheBytes).
// r must be non-zero; a zero ratio makes galaxy construction panic.
func WithHotCacheRatio(r int64) GalaxyOption {
	return funcGalaxyOption(func(o *galaxyOpts) {
		o.hcRatio = r
	})
}

// WithMaxCandidates lets the client set the size of the candidate cache,
// expressed as the maximum number of candidates held at one time.
// Defaults to 1024.
func WithMaxCandidates(n int) GalaxyOption {
	return funcGalaxyOption(func(o *galaxyOpts) {
		o.maxCandidates = n
	})
}

// WithClock overrides the clock the galaxy uses internally for key-stats
// accounting (among other things). Without it, the galaxy inherits its
// universe's clock.
func WithClock(clk clocks.Clock) GalaxyOption {
	return funcGalaxyOption(func(o *galaxyOpts) {
		o.clock = clk
	})
}

// WithIdleStatsAgeResetWindow overrides the default interval (one minute)
// after which a key that has been idle gets its stats reset, such that the
// next hit is recorded as if it were the first.
func WithIdleStatsAgeResetWindow(age time.Duration) GalaxyOption {
	return funcGalaxyOption(func(o *galaxyOpts) {
		o.resetIdleStatsAge = age
	})
}

// WithPreviousPeerPeeking enables peer-peeking with the supplied config.
// It also sets the galaxy's peek TTL from cfg.PeekedValueMaxTTL and
// cfg.PeekedValueTTLJitter (replacing any peek TTL set by an earlier option).
func WithPreviousPeerPeeking(cfg PeekPeerCfg) GalaxyOption {
	return funcGalaxyOption(func(o *galaxyOpts) {
		o.peekTTL = newTTLJitter(cfg.PeekedValueMaxTTL, cfg.PeekedValueTTLJitter)
		o.peekPeer = &cfg
	})
}

// newTTLJitter builds a ttlJitter from a max TTL and a jitter.
//
// A non-positive maxTTL yields the zero ttlJitter. Otherwise the jitter is
// clamped into the range [0, maxTTL] before being stored.
func newTTLJitter(maxTTL, jitter time.Duration) ttlJitter {
	var tj ttlJitter
	if maxTTL > 0 {
		clamped := jitter
		switch {
		case jitter < 0:
			clamped = 0
		case jitter > maxTTL:
			clamped = maxTTL
		}
		tj.entryMaxTTL = maxTTL
		tj.entryTTLJitter = clamped
	}
	return tj
}

// WithGetTTL sets a maximum Time To Live (TTL) for the values in this
// galaxy, with an optional jitter.
//
// With a jitter of 0 the expiration is always set to exactly maxTTL in the
// future.
//
// A value whose Expiration is already sooner than maxTTL in the future is
// left alone no matter the source. If a value has no Expiration, or one that
// is farther in the future than maxTTL, a new Expiration is set based on
// maxTTL and jitter.
//
// A non-zero jitter randomly picks an expiry between maxTTL-jitter and
// maxTTL in the future. Set appropriately, this prevents values populated at
// about the same time from expiring simultaneously and causing a burst of
// activity while rehydrating values. When used, jitter should be large
// enough that, over a reasonable number of maxTTL intervals, keys that are
// continually accessed spread their expirations across the entire interval.
//
// Negative TTLs and jitter values are silently ignored, and jitter values
// greater than maxTTL are capped at maxTTL.
func WithGetTTL(maxTTL, jitter time.Duration) GalaxyOption {
	return funcGalaxyOption(func(o *galaxyOpts) {
		o.getTTL = newTTLJitter(maxTTL, jitter)
	})
}

// flightGroup is defined as an interface which singleflight.Group
// satisfies.  We define this so that we may test with an alternate
// implementation.
type flightGroup interface {
	// Do runs fn for the given key and returns its results.
	// NOTE(review): per the Galaxy.loadGroup field comment, the
	// implementation is expected to coalesce concurrent calls that share a
	// key — confirm against the singleflight package.
	Do(key string, fn func() (interface{}, error)) (interface{}, error)
}

// GalaxyStats are per-galaxy statistics. Every field is an AtomicInt
// counter.
type GalaxyStats struct {
	Gets              AtomicInt // any Get request, including from peers
	Loads             AtomicInt // (gets - cacheHits)
	CoalescedLoads    AtomicInt // inside singleflight
	MaincacheHits     AtomicInt // number of maincache hits
	HotcacheHits      AtomicInt // number of hotcache hits
	PeerLoads         AtomicInt // either remote load or remote cache hit (not an error)
	PeerLoadErrors    AtomicInt // errors on getFromPeer
	BackendLoads      AtomicInt // load from backend locally
	BackendLoadErrors AtomicInt // total bad local loads
	PeerPeekHits      AtomicInt // peer Peek hits
	PeerPeeks         AtomicInt // peer Peek requests

	// The Coalesced* counters below are incremented from inside the
	// singleflight callback, i.e. once per de-duplicated load.
	CoalescedMaincacheHits  AtomicInt // maincache hit in singleflight
	CoalescedHotcacheHits   AtomicInt // hotcache hit in singleflight
	CoalescedPeerLoads      AtomicInt // peer load in singleflight
	CoalescedBackendLoads   AtomicInt // backend load in singleflight
	CoalescedPeerPeekHits   AtomicInt // peek hit in singleflight
	CoalescedPeerPeeks      AtomicInt // peek request in singleflight
	CoalescedPeerPeekErrors AtomicInt // peek failure (not not-found) in singleflight

	ServerRequests AtomicInt // gets that came over the network from peers
}

// Name reports the name under which this galaxy was registered.
func (g *Galaxy) Name() string {
	name := g.name
	return name
}

// hitLevel identifies where the data for a Get was found.
type hitLevel int

// The hit levels start at 1, so the zero value of hitLevel is none of them.
const (
	hitHotcache hitLevel = iota + 1
	hitMaincache
	hitPeek
	hitPeer
	hitBackend
	miss // for checking cache hit/miss in lookupCache
)

// String returns a short lower-case label for the hit level. miss and any
// value outside the named hit levels yield the empty string.
func (h hitLevel) String() string {
	labels := [...]string{
		hitHotcache:  "hotcache",
		hitMaincache: "maincache",
		hitPeek:      "peek",
		hitPeer:      "peer",
		hitBackend:   "backend",
	}
	if h < hitHotcache || h > hitBackend {
		return ""
	}
	return labels[h]
}

// isHit reports whether h is anything other than miss.
func (h hitLevel) isHit() bool {
	switch h {
	case miss:
		return false
	default:
		return true
	}
}

// recordRequest annotates the current trace span with the level at which the
// data was found on Get/load, bumps the matching GalaxyStats counter and
// records the corresponding opencensus measurement.
//
// localAuthoritative is only consulted for backend hits: when it is false an
// extra span annotation notes that the peer fetch failed and this instance is
// not authoritative for the key.
func (g *Galaxy) recordRequest(ctx context.Context, h hitLevel, localAuthoritative bool) {
	levelName := h.String()
	span := trace.FromContext(ctx)
	span.Annotatef([]trace.Attribute{trace.StringAttribute("hit_level", levelName)}, "fetched from %s", levelName)

	switch h {
	case hitHotcache, hitMaincache:
		// Both local cache levels share one measure, distinguished by tag.
		if h == hitHotcache {
			g.Stats.HotcacheHits.Add(1)
		} else {
			g.Stats.MaincacheHits.Add(1)
		}
		g.recordStats(ctx, []tag.Mutator{tag.Upsert(CacheLevelKey, levelName)}, MCacheHits.M(1))
	case hitPeek:
		g.Stats.PeerPeekHits.Add(1)
		g.recordStats(ctx, nil, MPeeks.M(1))
	case hitPeer:
		g.Stats.PeerLoads.Add(1)
		g.recordStats(ctx, nil, MPeerLoads.M(1))
	case hitBackend:
		g.Stats.BackendLoads.Add(1)
		g.recordStats(ctx, nil, MBackendLoads.M(1))
		if localAuthoritative {
			return
		}
		span.Annotate(nil, "failed to fetch from peer, not authoritative for key")
	}
}

// NotFoundErr is an error indicating that the key was not found. If an
// error unwraps to it (with [errors.As]), Galaxycache may skip a local lookup
// and return an error implementing this interface (possibly forwarding an
// error message from the remote peer, fetch protocol permitting).
// [TrivialNotFoundErr] is available for wrapping to satisfy this requirement.
type NotFoundErr interface {
	error
	// IsNotFound is a marker method; it identifies the error as a not-found.
	IsNotFound()
}

// TrivialNotFoundErr is a minimal [NotFoundErr] implementation. Wrap it in
// another error to mark that error as a not-found (so it unwraps to
// NotFoundErr).
type TrivialNotFoundErr struct{}

// IsNotFound marks TrivialNotFoundErr as a NotFoundErr.
func (TrivialNotFoundErr) IsNotFound() {}

// Error implements the error interface with a fixed message.
func (TrivialNotFoundErr) Error() string {
	const msg = "not found"
	return msg
}

// keyPeekNotFound is the NotFoundErr returned for peek requests
// (GetWithOptions with FetchModePeek) that miss the local cache.
type keyPeekNotFound struct {
	galaxy string
	key    string
}

// Error implements the error interface, naming both the key and the galaxy
// (each quoted).
func (n *keyPeekNotFound) Error() string {
	key, galaxy := n.key, n.galaxy
	return fmt.Sprintf("key %q not found in galaxy %q", key, galaxy)
}

// IsNotFound marks keyPeekNotFound as a NotFoundErr.
func (n *keyPeekNotFound) IsNotFound() {}

// Get is the primary "get" called on a galaxy to find the value for the
// given key. It uses the following logic:
//   - First, try the local cache; if it's a cache hit, we're done
//   - On a cache miss, search for which peer is the owner of the
//     key based on the consistent hash
//   - If a different peer is the owner, use the corresponding fetcher
//     to Fetch from it; otherwise, if the calling instance is the key's
//     canonical owner, call the BackendGetter to retrieve the value
//     (which will now be cached locally)
//
// This is a wrapper around GetWithOptions using the zero GetOptions and
// discarding the returned GetInfo.
func (g *Galaxy) Get(ctx context.Context, key string, dest Codec) error {
	var defaultOpts GetOptions
	if _, err := g.GetWithOptions(ctx, defaultOpts, key, dest); err != nil {
		return err
	}
	return nil
}

// FetchMode is a three-valued enum selecting how a Get handles fetching.
type FetchMode uint8

const (
	// FetchModeRegular is the normal fetch behavior: check local cache,
	// then peer, then do a local backend fetch.
	FetchModeRegular FetchMode = iota
	// FetchModeNoPeerBackend is like Regular, but only issues peek calls to
	// peers, so we don't incur backend gets there.
	// (the peek call may have a very short deadline, if issued (controlled
	// at the galaxy level))
	FetchModeNoPeerBackend
	// FetchModePeek only checks whether the value is in-cache.
	FetchModePeek
)

// allowPeerFetch reports whether this mode permits a full fetch from a peer;
// that is only the case for FetchModeRegular. It panics on a value that is
// not one of the declared modes.
func (f FetchMode) allowPeerFetch() bool {
	// FetchMode is unsigned, so anything past the last declared constant is
	// the only way to be out of range.
	if f > FetchModePeek {
		panic("unknown fetch mode: " + strconv.Itoa(int(f)))
	}
	return f == FetchModeRegular
}

// GetOptions carries per-call options for [Galaxy.GetWithOptions]. The zero
// value requests the regular fetch behavior.
type GetOptions struct {
	// FetchMode selects how far the Get may go to find a value (see the
	// FetchMode constants). The zero value is FetchModeRegular.
	FetchMode FetchMode
}

// GetInfo provides information about the Get call and any auxiliary
// information for the value. Notably whether there's an expiration associated
// with the returned value.
type GetInfo struct {
	// If non-zero, Expiry provides an expiration time after which
	// Galaxycache should not return this value (and should ideally evict
	// it from the cache to prevent unexpired, but more recently-touched
	// items from being evicted)
	Expiry time.Time
	// TODO: include information about hit-level, and/or backend fetches.
}

// GetWithOptions as defined here is the primary "get" called on a galaxy to
// find the value for the given key, using the following logic:
//
//   - First, try the local cache; if it's a cache hit, we're done
//
//   - if the FetchMode is FetchModePeek; we're done, return a Not Found
//
//   - otherwise, search for which peer is the owner of the key based on the
//     consistent hash
//
//   - If a different peer is the owner and the FetchMode is not
//     FetchModeNoPeerBackend: use the corresponding fetcher to Fetch from it
//
//   - if the peer request fails with a Not Found; return a Not Found
//
//   - if the peer request fails with any other error, fallthrough to the local BackendGetter
//
//   - if peer-peeking is enabled, and this instance is the key's owner, and
//     we're within the configured warmup period, send a Peek request with a
//     short-deadline to the "fallthrough owner" (who would own this key if the
//     calling instance wasn't in the hash-ring)
//
//   - if that fails or returns Not Found fallthrough
//
//   - call the BackendGetter to retrieve the value (which will now be cached
//     locally).
//
//   - return whatever the BackendGetter provides
//
// On success dest is populated with the value and the returned GetInfo
// carries the value's expiry (zero if none). A nil dest is rejected with an
// error. On any error the returned GetInfo is the zero value.
func (g *Galaxy) GetWithOptions(ctx context.Context, opts GetOptions, key string, dest Codec) (GetInfo, error) {
	ctx, tagErr := tag.New(ctx, tag.Upsert(GalaxyKey, g.name))
	if tagErr != nil {
		panic(fmt.Errorf("error tagging context: %s", tagErr))
	}

	ctx, span := trace.StartSpan(ctx, "galaxycache.(*Galaxy).Get on "+g.name)
	startTime := time.Now()
	defer func() {
		g.recordStats(ctx, nil, MRoundtripLatencyMilliseconds.M(sinceInMilliseconds(startTime)))
		span.End()
	}()

	g.Stats.Gets.Add(1)
	g.recordStats(ctx, nil, MGets.M(1))
	if dest == nil {
		span.SetStatus(trace.Status{Code: trace.StatusCodeInvalidArgument, Message: "no Codec was provided"})
		return GetInfo{}, errors.New("galaxycache: no Codec was provided")
	}
	value, exp, hlvl := g.lookupCache(key)
	g.recordStats(ctx, nil, MKeyLength.M(int64(len(key))))

	if hlvl.isHit() {
		span.Annotatef([]trace.Attribute{trace.BoolAttribute("cache_hit", true)}, "Cache hit in %s", hlvl)
		value.stats.touch(g.resetIdleStatsAge, g.now())
		g.recordRequest(ctx, hlvl, false)
		g.recordStats(ctx, nil, MValueLength.M(int64(len(value.data))))
		return GetInfo{Expiry: exp}, dest.UnmarshalBinary(value.data)
	}

	span.Annotatef([]trace.Attribute{trace.BoolAttribute("cache_hit", false)}, "Cache miss")

	if opts.FetchMode == FetchModePeek {
		// This is a peek terminate here, before we do anything expensive.
		return GetInfo{}, &keyPeekNotFound{
			galaxy: g.name,
			key:    key,
		}
	}

	lo := loadOpts{
		fetchMode: opts.FetchMode,
	}
	// Optimization to avoid double unmarshalling or copying: load reports
	// (via destPopulated) whether the dest was already populated. One
	// caller (if local) will set this; the losers will not. The common
	// case will likely be one caller.
	value, loadExpiry, destPopulated, err := g.load(ctx, lo, key, dest)
	if err != nil {
		span.SetStatus(trace.Status{Code: trace.StatusCodeUnknown, Message: "Failed to load key: " + err.Error()})
		g.recordStats(ctx, nil, MLoadErrors.M(1))
		return GetInfo{}, err
	}
	value.stats.touch(g.resetIdleStatsAge, g.now())
	g.recordStats(ctx, nil, MValueLength.M(int64(len(value.data))))
	if destPopulated {
		return GetInfo{Expiry: loadExpiry}, nil
	}
	return GetInfo{Expiry: loadExpiry}, dest.UnmarshalBinary(value.data)
}

// valWithLevel is the result produced by the singleflight callback in
// (*Galaxy).load and shared with every caller coalesced onto that call.
//
//   - val is the loaded value.
//   - expiry is the expiration associated with val (the zero time when none
//     was provided).
//   - level records where the value came from (main/hot cache, peer, peek or
//     backend).
//   - localAuthoritative is forwarded to recordRequest by load.
//   - peerErr holds the error from a failed peer fetch/peek that preceded a
//     successful backend load, if any.
//   - localErr holds the backend-load error slot; load only constructs a
//     valWithLevel on success, so it is nil there.
//
// NOTE: load builds this struct with positional composite literals, so the
// field order must not change.
type valWithLevel struct {
	val                valWithStat
	expiry             time.Time
	level              hitLevel
	localAuthoritative bool
	peerErr            error
	localErr           error
}

// loadOpts carries the per-request options that (*Galaxy).load honors.
type loadOpts struct {
	// fetchMode is consulted (via allowPeerFetch) to decide whether load may
	// fetch the key from the peer that owns it.
	fetchMode FetchMode
}

// load loads key either by invoking the getter locally or by sending it to
// another machine. Concurrent loads of the same key are coalesced through
// g.loadGroup, so only one caller runs the callback below; the rest share its
// result.
//
// Inside the callback the sources are tried in this order:
//  1. the local caches (another load may have filled them in the meantime),
//  2. the owning peer, when pickPeer selects one and opts.fetchMode allows
//     peer fetches,
//  3. a peek at another peer, when no remote peer was picked and peeking is
//     configured,
//  4. the local backend getter.
//
// A NotFoundErr from the owning peer is returned as-is without falling back
// to the backend; any other peer or peek failure falls through to the backend.
//
// destPopulated is true only for the caller whose callback invocation filled
// dest through the local getter; all other callers must unmarshal value.data
// themselves.
func (g *Galaxy) load(ctx context.Context, opts loadOpts, key string, dest Codec) (value valWithStat, expiry time.Time, destPopulated bool, err error) {
	g.Stats.Loads.Add(1)
	g.recordStats(ctx, nil, MLoads.M(1))

	viewi, err := g.loadGroup.Do(key, func() (interface{}, error) {
		// Check the cache again because singleflight can only dedup calls
		// that overlap concurrently.  It's possible for 2 concurrent
		// requests to miss the cache, resulting in 2 load() calls.  An
		// unfortunate goroutine scheduling would result in this callback
		// being run twice, serially.  If we don't check the cache again,
		// cache.nbytes would be incremented below even though there will
		// be only one entry for this key.
		//
		// Consider the following serialized event ordering for two
		// goroutines in which this callback gets called twice for the
		// same key:
		// 1: Get("key")
		// 2: Get("key")
		// 1: lookupCache("key")
		// 2: lookupCache("key")
		// 1: load("key")
		// 2: load("key")
		// 1: loadGroup.Do("key", fn)
		// 1: fn()
		// 2: loadGroup.Do("key", fn)
		// 2: fn()
		if value, exp, hlvl := g.lookupCache(key); hlvl.isHit() {
			if hlvl == hitHotcache {
				g.Stats.CoalescedHotcacheHits.Add(1)
			} else {
				g.Stats.CoalescedMaincacheHits.Add(1)
			}
			g.recordStats(ctx, []tag.Mutator{tag.Insert(CacheLevelKey, hlvl.String())}, MCoalescedCacheHits.M(1))
			return &valWithLevel{val: value, expiry: exp, level: hlvl}, nil
		}
		g.Stats.CoalescedLoads.Add(1)
		g.recordStats(ctx, nil, MCoalescedLoads.M(1))

		authoritative := true
		var peerErr error
		var bgInfo BackendGetInfo
		if peer, ok := g.peerPicker.pickPeer(key); opts.fetchMode.allowPeerFetch() && ok {
			value, bgInfo, peerErr = g.getFromPeer(ctx, peer, key)
			// A remote peer owns this key, so anything we load from here on
			// is not authoritative.
			authoritative = false
			if peerErr == nil {
				g.Stats.CoalescedPeerLoads.Add(1)
				g.recordStats(ctx, nil, MCoalescedPeerLoads.M(1))
				return &valWithLevel{val: value, level: hitPeer, expiry: bgInfo.Expiration, localAuthoritative: false, peerErr: nil, localErr: nil}, nil
			}

			g.Stats.PeerLoadErrors.Add(1)
			g.recordStats(ctx, nil, MPeerLoadErrors.M(1))
			// TODO(bradfitz): log the peer's error? keep
			// log of the past few for /galaxycache?  It's
			// probably boring (normal task movement), so not
			// worth logging I imagine.

			// If this was a NotFoundErr return it without trying to fetch locally
			if nfErr := NotFoundErr(nil); errors.As(peerErr, &nfErr) {
				return nil, peerErr
			}
		} else if !ok && g.opts.peekPeer != nil {
			// No remote peer was picked for this key, so this instance stays
			// authoritative whether or not the peek succeeds: a failed peek
			// must not mark the subsequent backend load as non-authoritative.
			value, bgInfo, peerErr = g.peekPeer(ctx, key)
			if peerErr == nil {
				return &valWithLevel{val: value, level: hitPeek, expiry: bgInfo.Expiration, localAuthoritative: true, peerErr: nil, localErr: nil}, nil
			}
			if nfErr := NotFoundErr(nil); !errors.As(peerErr, &nfErr) {
				// Not a not-found, mark it as a peek-error.
				g.recordStats(ctx, nil, MPeerLoadErrors.M(1))
			}
		}

		data, bgInfo, err := g.getLocally(ctx, key, dest)
		if err != nil {
			g.Stats.BackendLoadErrors.Add(1)
			g.recordStats(ctx, nil, MBackendLoadErrors.M(1))
			return nil, err
		}

		g.Stats.CoalescedBackendLoads.Add(1)
		g.recordStats(ctx, nil, MCoalescedBackendLoads.M(1))
		destPopulated = true // only one caller of load gets this return value
		value = g.newValWithStat(data, nil)
		g.populateCache(ctx, key, value, &g.mainCache, bgInfo)
		return &valWithLevel{
			val:                value,
			expiry:             bgInfo.Expiration,
			level:              hitBackend,
			localAuthoritative: authoritative,
			peerErr:            peerErr,
			localErr:           nil,
		}, nil
	})
	if err != nil {
		return value, expiry, destPopulated, err
	}

	res := viewi.(*valWithLevel)
	value = res.val
	expiry = res.expiry
	// Record the hits for all load calls, including those that tagged onto
	// the singleflight.
	g.recordRequest(ctx, res.level, res.localAuthoritative)
	return value, expiry, destPopulated, nil
}

// ttlJitter configures the expiration cap applied by capExpiry.
//
// A non-positive entryMaxTTL disables capping entirely. When entryTTLJitter is
// positive, a random duration in [0, entryTTLJitter) is subtracted from
// entryMaxTTL each time a new cap is computed.
type ttlJitter struct {
	// if entryMaxTTL > 0, we'll cap the expiry at now + entryMaxTTL - rand.Int64N(entryTTLJitter)
	entryMaxTTL, entryTTLJitter time.Duration
}

// capExpiry clamps bgInfo.Expiration in place so that it is no later than
// clk.Now() + entryMaxTTL.
//
//   - If entryMaxTTL <= 0, no cap is configured and bgInfo is left untouched.
//   - If bgInfo already carries a non-zero expiration that is at most
//     entryMaxTTL away, it is left untouched (the cap never extends an
//     expiration).
//   - Otherwise (no expiration, or one further away than entryMaxTTL) the
//     expiration is set to now + entryMaxTTL - rand.Int64N(entryTTLJitter),
//     with the jitter term omitted when entryTTLJitter <= 0.
func (g ttlJitter) capExpiry(clk clocks.Clock, bgInfo *BackendGetInfo) {
	// initial common-case: no TTL set
	if g.entryMaxTTL <= 0 {
		return
	}
	now := clk.Now()
	// We're done if the expiration is already within the maximum TTL.
	// Comparing against entryMaxTTL (rather than entryMaxTTL plus the jitter)
	// guarantees that nothing beyond the configured maximum slips through.
	if !bgInfo.Expiration.IsZero() && bgInfo.Expiration.Sub(now) <= g.entryMaxTTL {
		return
	}
	// Either there's no expiration, or it's too far in the future so we need to adjust it downward.
	newCapInterval := g.entryMaxTTL
	if g.entryTTLJitter > 0 {
		newCapInterval -= time.Duration(rand.Int64N(int64(g.entryTTLJitter)))
	}
	bgInfo.Expiration = now.Add(newCapInterval)
}

// getLocally fetches key from the galaxy's backend getter, which populates
// dest, and returns dest's marshaled bytes together with the backend's info
// after the get-TTL cap has been applied to it.
//
// The getter latency is recorded on every call, including failures. If the
// getter fails, its error is returned with nil data and a zero
// BackendGetInfo; a marshaling failure is returned alongside whatever
// MarshalBinary produced.
func (g *Galaxy) getLocally(ctx context.Context, key string, dest Codec) ([]byte, BackendGetInfo, error) {
	began := time.Now()
	defer func() {
		elapsedMillis := sinceInMilliseconds(began)
		g.recordStats(ctx, nil, MGetterFuncLatencyMilliseconds.M(elapsedMillis))
	}()

	info, getErr := g.getter.GetWithInfo(ctx, key, dest)
	if getErr != nil {
		return nil, BackendGetInfo{}, getErr
	}

	encoded, encodeErr := dest.MarshalBinary()
	g.opts.getTTL.capExpiry(g.clock, &info)
	return encoded, info, encodeErr
}

// peekPeer asks the peer selected by pickPeekPeer whether it already holds
// key, and on a hit stores the value in the main cache (with the peek-TTL cap
// applied) before returning it.
//
// When no peek peer is available a TrivialNotFoundErr is returned. Any error
// from the peer is wrapped with "peek failed: "; errors that are not a
// NotFoundErr are additionally counted as peek errors.
func (g *Galaxy) peekPeer(ctx context.Context, key string) (valWithStat, BackendGetInfo, error) {
	target, havePeer := g.parent.peerPicker.pickPeekPeer(g.opts.peekPeer.WarmTime, key)
	if !havePeer {
		// just pretend it's a cache-miss (it's easier that way)
		return valWithStat{}, BackendGetInfo{}, TrivialNotFoundErr{}
	}

	// The peek is a quick in-memory lookup on the peer, so use an aggressive
	// deadline: we don't want to waste work, nor stall for long if the peer
	// is misbehaving.
	ctx, cancel := g.clock.ContextWithTimeout(ctx, g.opts.peekPeer.PeekTimeout)
	defer cancel()

	span := trace.FromContext(ctx)
	span.Annotate(nil, "sending peek")

	peeked, info, peekErr := target.PeekWithInfo(ctx, g.name, key)
	g.Stats.CoalescedPeerPeeks.Add(1)
	g.recordStats(ctx, nil, MCoalescedPeeks.M(1))

	if peekErr != nil {
		var notFound NotFoundErr
		if !errors.As(peekErr, &notFound) {
			// A real failure rather than a plain miss: count it.
			span.Annotatef(nil, "peek failed: %s", peekErr)
			g.recordStats(ctx, nil, MCoalescedPeekErrors.M(1))
			g.Stats.CoalescedPeerPeekErrors.Add(1)
		} else {
			span.Annotatef(nil, "peek miss: %s", peekErr)
		}
		return valWithStat{}, BackendGetInfo{}, fmt.Errorf("peek failed: %w", peekErr)
	}

	g.Stats.CoalescedPeerPeekHits.Add(1)
	g.recordStats(ctx, nil, MCoalescedPeekHits.M(1))
	span.Annotate(nil, "peek hit")

	// We only peek for keys that this instance owns, so a hit always goes
	// into the main cache.
	hit := g.newValWithStat(peeked, nil)
	g.opts.peekTTL.capExpiry(g.clock, &info)
	g.populateCache(ctx, key, hit, &g.mainCache, info)

	return hit, info, nil
}

// getFromPeer fetches key from peer and returns the value together with the
// peer-provided BackendGetInfo after the get-TTL cap has been applied.
//
// The key's candidate-cache stats are looked up (or created), and the
// configured promoter decides whether the value is also stored in the hot
// cache. The expiration is capped before the hot cache is populated so that
// the cached entry and the returned info agree and the hot-cache entry cannot
// outlive the configured maximum TTL.
//
// On a fetch error, the error is returned with zero values.
func (g *Galaxy) getFromPeer(ctx context.Context, peer RemoteFetcherWithInfo, key string) (valWithStat, BackendGetInfo, error) {
	data, bgInfo, err := peer.FetchWithInfo(ctx, g.name, key)
	if err != nil {
		return valWithStat{}, BackendGetInfo{}, err
	}
	// Apply the TTL cap up front: everything below (including the hot-cache
	// insert) must see the capped expiration.
	g.opts.getTTL.capExpiry(g.clock, &bgInfo)

	kStats, ok := g.candidateCache.get(key)
	if !ok {
		kStats = g.addNewToCandidateCache(key)
		// NB: we do not touch() kStats here because that's reserved
		//     for code outside the singleflight block.
		// This has the advantageous effect of guaranteeing that
		// hitCount is 0 if it's a new key, thus making it easy for a
		// promoter to distinguish a new key.
	}

	g.maybeUpdateHotCacheStats() // will update if at least a second has passed since the last update

	hitCount, keyQPS := kStats.val(g.now())
	stats := promoter.Stats{
		KeyQPS:  keyQPS,
		Hits:    hitCount,
		HCStats: g.hcStatsWithTime.hcs,
	}
	value := g.newValWithStat(data, kStats)
	if g.opts.promoter.ShouldPromote(key, value.data, stats) {
		g.populateCache(ctx, key, value, &g.hotCache, bgInfo)
	}
	return value, bgInfo, nil
}

// lookupCache checks the main cache and then the hot cache for key, returning
// the cached value, its expiration, and which cache (if any) produced the hit.
// It reports a miss without consulting either cache when caching is disabled
// (cacheBytes <= 0). Hot-cache hits are counted in g.Stats.HotcacheHits here.
func (g *Galaxy) lookupCache(key string) (valWithStat, time.Time, hitLevel) {
	if g.cacheBytes <= 0 {
		return valWithStat{}, time.Time{}, miss
	}
	if val, expiry, found := g.mainCache.get(key); found {
		return val, expiry, hitMaincache
	}
	if val, expiry, found := g.hotCache.get(key); found {
		g.Stats.HotcacheHits.Add(1)
		return val, expiry, hitHotcache
	}
	return valWithStat{}, time.Time{}, miss
}

// populateCache inserts value under key into the given cache with the
// expiration from bgInfo, then evicts the oldest entries from the main and/or
// hot cache until their combined size fits within g.cacheBytes. It does
// nothing when caching is disabled (cacheBytes <= 0).
//
// The size and entry count of the cache that was written to are recorded once
// eviction has finished.
func (g *Galaxy) populateCache(ctx context.Context, key string, value valWithStat, cache *cache, bgInfo BackendGetInfo) {
	if g.cacheBytes <= 0 {
		return
	}
	cache.add(key, value, bgInfo.Expiration)

	// Deferred so the reported numbers reflect the state after eviction.
	defer func() {
		cacheTypeTag := []tag.Mutator{tag.Upsert(CacheTypeKey, cache.ctype.String())}
		g.recordStats(ctx, cacheTypeTag, MCacheSize.M(cache.bytes()), MCacheEntries.M(cache.items()))
	}()

	for {
		mainSize := g.mainCache.bytes()
		hotSize := g.hotCache.bytes()
		if mainSize+hotSize <= g.cacheBytes {
			return
		}

		// TODO(bradfitz): this is good-enough-for-now logic.
		// It should be something based on measurements and/or
		// respecting the costs of different resources.
		//
		// Evict from the hot cache once it exceeds its share of the main
		// cache's size (1/hcRatio); otherwise evict from the main cache.
		if hotSize > mainSize/g.opts.hcRatio {
			g.hotCache.removeOldest()
			continue
		}
		g.mainCache.removeOldest()
	}
}

// recordStats records the given measurements, tagged with mutators (which may
// be nil), via stats.RecordWithOptions using the recorder configured on the
// galaxy's parent.
func (g *Galaxy) recordStats(ctx context.Context, mutators []tag.Mutator, measurements ...stats.Measurement) {
	stats.RecordWithOptions(
		ctx,
		stats.WithMeasurements(measurements...),
		stats.WithTags(mutators...),
		stats.WithRecorder(g.parent.recorder),
	)
}

// CacheType represents a type of cache.
//
// The constants start at 1, so the zero value does not name any cache.
type CacheType uint8

const (
	// MainCache is the cache for items that this peer is the
	// owner of.
	MainCache CacheType = iota + 1

	// HotCache is the cache for items that seem popular
	// enough to replicate to this node, even though it's not the
	// owner.
	HotCache

	// CandidateCache is the cache for peer-owned keys that
	// may become popular enough to put in the HotCache.
	CandidateCache
)

// CacheStats returns stats about the provided cache within the galaxy.
//
// Only MainCache and HotCache are tracked; CandidateCache (not worth
// tracking) and any unrecognized value yield a zero CacheStats.
func (g *Galaxy) CacheStats(which CacheType) CacheStats {
	if which == MainCache {
		return g.mainCache.stats()
	}
	if which == HotCache {
		return g.hotCache.stats()
	}
	return CacheStats{}
}

// stats returns a snapshot of the cache's byte count, item count, and its
// get/hit/eviction counters, taken while holding the cache mutex.
func (c *cache) stats() CacheStats {
	c.mu.Lock()
	defer c.mu.Unlock()

	var snapshot CacheStats
	snapshot.Bytes = c.nbytes.Get()
	snapshot.Items = c.itemsLocked()
	snapshot.Gets = c.nget
	snapshot.Hits = c.nhit
	snapshot.Evictions = c.nevict
	return snapshot
}

// valWithStat pairs a cached value's serialized bytes (data, the form passed
// to Codec.UnmarshalBinary) with a pointer to the key's usage statistics
// (stats).
type valWithStat struct {
	data  []byte
	stats *keyStats
}

// size returns the total size attributed to the value in the hot/main cache:
// the length of the data plus a fixed overhead covering the key stats, a
// pointer to the val, and the valWithStat struct itself.
func (v *valWithStat) size() int64 {
	// All three terms are compile-time constants (unsafe.Sizeof never
	// evaluates its operand, so a nil v.stats is fine here).
	const fixedOverhead = int64(unsafe.Sizeof(*v.stats)) + // key stats
		int64(unsafe.Sizeof(v)) + // pointer to the val
		int64(unsafe.Sizeof(*v)) // the valWithStat struct
	// len() rather than cap(): cap() is not stable across an
	// unmarshal/marshal round trip of the data.
	return fixedOverhead + int64(len(v.data))
}

// add stores value under key in the LRU with expiration exp and grows the
// cache's byte counter by the key length plus the value's size. The cache
// mutex is held for the duration.
func (c *cache) add(key string, value valWithStat, exp time.Time) {
	c.mu.Lock()
	defer c.mu.Unlock()

	c.lru.AddExpiring(key, value, exp)
	entryBytes := int64(len(key)) + value.size()
	c.nbytes.Add(entryBytes)
}

// removeOldest evicts the oldest entry from the LRU while holding the cache
// mutex. It is a no-op when the LRU has not been initialized.
func (c *cache) removeOldest() {
	c.mu.Lock()
	defer c.mu.Unlock()

	if c.lru == nil {
		return
	}
	c.lru.RemoveOldest()
}

// bytes returns the current value of the cache's byte counter. Unlike items,
// it does not take the cache mutex.
func (c *cache) bytes() int64 {
	total := c.nbytes.Get()
	return total
}

// items returns the number of entries in the cache, acquiring the cache mutex
// for the duration of the count.
func (c *cache) items() int64 {
	c.mu.Lock()
	defer c.mu.Unlock()

	count := c.itemsLocked()
	return count
}

// itemsLocked returns the number of entries in the LRU, or 0 when the LRU has
// not been initialized. Callers in this file invoke it with c.mu held.
func (c *cache) itemsLocked() int64 {
	if c.lru != nil {
		return int64(c.lru.Len())
	}
	return 0
}

// CacheStats are returned by stats accessors on Galaxy.
//
// Bytes is the cache's byte counter, Items is the number of entries in its
// LRU, and Gets, Hits and Evictions are the cache's get, hit and eviction
// counters. All fields are zero for caches that are not tracked.
type CacheStats struct {
	Bytes     int64
	Items     int64
	Gets      int64
	Hits      int64
	Evictions int64
}

//go:generate stringer -type=CacheType
