Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 25 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,31 @@ Git history (tags `v1.0.0` through `v1.15.0`).

## [Unreleased]

## [1.19.5] — 2026-06-04

### Added

- **Per-query SQL tracing (`PDBPLUS_OTEL_SQL`, on by default).** The shared
`*sql.DB` is opened through XSAM/otelsql so every statement — ent's and the
raw `sync_status` queries — emits a DB span nested under the active
request/sync span, surfacing the SQL behind a request that flat traces
couldn't show. On by default (the data is useful and bounded); set
`PDBPLUS_OTEL_SQL=false` to disable. DB-span volume is bounded by the
existing sampler: API reads inherit `PDBPLUS_OTEL_SAMPLE_RATE`, and
**scheduled sync cycles are no longer traced at all** (so they emit no DB
spans — the historical high-volume concern), while a **manually triggered
`POST /sync` is traced by default** — pass `?trace=0` to opt out.
Implemented via two new sampler gates (`pdbplus.origin=sync` drops scheduled
cycles; `pdbplus.force_sample` force-samples a manual run) and a
`WithForceTrace` context flag threaded from the `/sync` handler.

### Changed

- **Scheduled sync cycles are no longer sampled into traces** (previously the
per-route default ~1%). They are dropped by the new `pdbplus.origin=sync`
sampler gate so the sync path stays trace-free unless a sync is triggered
manually via `POST /sync`. Independent of `PDBPLUS_OTEL_SQL`.

## [1.19.4] — 2026-06-04

### Changed
Expand Down
13 changes: 11 additions & 2 deletions cmd/peeringdb-plus/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,7 @@ func main() {
}

// Open database.
entClient, db, err := database.Open(cfg.DBPath)
entClient, db, err := database.Open(cfg.DBPath, cfg.OTelSQL)
if err != nil {
logger.Error("failed to open database", slog.Any("error", err))
os.Exit(1)
Expand Down Expand Up @@ -1202,7 +1202,16 @@ func newSyncHandler(appCtx context.Context, in SyncHandlerInput) http.HandlerFun
}
// Use application root ctx, NOT r.Context() -- request context
// is cancelled when the response is sent, which would kill the sync.
go in.SyncFn(appCtx, mode)
//
// A manually-triggered sync is traced by default (you asked for it,
// you want to see it); ?trace=0 opts out. Scheduled timer syncs are
// never traced (see internal/otel sampler). The force-trace flag rides
// the app root ctx so it reaches the worker's root span.
syncCtx := appCtx
if r.URL.Query().Get("trace") != "0" {
syncCtx = pdbsync.WithForceTrace(appCtx)
}
go in.SyncFn(syncCtx, mode)
w.WriteHeader(http.StatusAccepted)
fmt.Fprint(w, `{"status":"accepted"}`)
}
Expand Down
2 changes: 1 addition & 1 deletion cmd/peeringdb-plus/main_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,7 @@ func TestFreshnessFromDB(t *testing.T) {
t.Parallel()
ctx := context.Background()

_, db, err := database.Open(filepath.Join(t.TempDir(), "freshness.db"))
_, db, err := database.Open(filepath.Join(t.TempDir(), "freshness.db"), false)
if err != nil {
t.Fatalf("open db: %v", err)
}
Expand Down
1 change: 1 addition & 0 deletions docs/CONFIGURATION.md
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,7 @@ line (`sync interval configured`) — the API key itself is never logged.
| Variable | Required | Default | Type | Description |
|----------|----------|---------|------|-------------|
| `PDBPLUS_OTEL_SAMPLE_RATE` | No | `1.0` | float | Trace sampling ratio for the known app surfaces (`/api/`, `/rest/v1/`, `/peeringdb.v1.`, `/graphql`). Unknown-path / scanner-bait / health-probe ratios are hardcoded — see `docs/ARCHITECTURE.md` § Sampling Matrix. Must be in the inclusive range `[0.0, 1.0]`. Values outside this range are rejected at startup. |
| `PDBPLUS_OTEL_SQL` | No | `true` | bool | Emit a per-query OpenTelemetry DB span (via XSAM/otelsql) for every SQL statement on the shared `*sql.DB` — ent's queries and the raw `sync_status` statements alike. On by default; set `PDBPLUS_OTEL_SQL=false` to disable. DB spans are children of the active request/sync span, so volume is bounded by the existing sampler: API-read spans inherit `PDBPLUS_OTEL_SAMPLE_RATE`; scheduled sync cycles are not traced at all (so emit no DB spans — this was the historical high-volume concern), while a manually-triggered `POST /sync` is traced by default (pass `?trace=0` to opt out). |
| `PDBPLUS_LOG_LEVEL` | No | `INFO` | enum | Minimum severity for log records shipped via the OTel logging pipeline (and from there to Loki). Accepted values (case-insensitive, parsed via `slog.Level.UnmarshalText`): `DEBUG`, `INFO`, `WARN`, `ERROR`. The stdout (Fly log) handler is independently gated at INFO and is not affected by this variable. Default `INFO` was chosen so DEBUG records remain local for opt-in debugging without polluting production Loki ingestion volume to keep production Loki ingestion volumes bounded. Invalid values fall back to `INFO` with no error (logging-level config is operator-friendly; fail-fast is normally preferred, but a malformed log level should not take production down). Consumed by `internal/otel/logger.go` `otelLevelFromEnv()` — not parsed by `internal/config`. |
| `PDBPLUS_HEAP_WARN_MIB` | No | `400` | integer (MiB) | Peak Go heap (MiB) threshold checked at end of every sync cycle. When `runtime.MemStats.HeapInuse` exceeds this value, the worker emits `slog.Warn("heap threshold crossed", …)` with typed attrs (`peak_heap_bytes`, `heap_warn_bytes`, `heap_over`, etc.). The OTel span attribute `pdbplus.sync.peak_heap_bytes` (Prometheus gauge `pdbplus_sync_peak_heap_bytes`) emits on every cycle regardless. `0` disables the warn. Sustained breach across multiple cycles is the operational signal to revisit the incremental-sync defaults and rollout planning. Default sits comfortably under the 512 MB Fly VM cap so the failure order is `log → app crash → Fly OOM-kill`. Observed baseline (2026-04-17): primary peak ~84 MiB, replicas 58–59 MiB — ~4.5× headroom. **Bare integer only** — no unit suffix (the variable name encodes the unit); `400MB` is rejected. Negative values are rejected at startup. |
| `PDBPLUS_RSS_WARN_MIB` | No | `384` | integer (MiB) | Peak OS RSS (MiB) threshold derived from `/proc/self/status` `VmHWM` (Linux only). Same warn semantics as `PDBPLUS_HEAP_WARN_MIB`. The OTel span attr `pdbplus.sync.peak_rss_bytes` (Prometheus gauge `pdbplus_sync_peak_rss_bytes`) is omitted on non-Linux platforms (RSS not available). `0` disables the warn. Bare integer only — no unit suffix. |
Expand Down
1 change: 1 addition & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ require (
entgo.io/ent v0.14.6
github.com/99designs/gqlgen v0.17.90
github.com/KimMachineGun/automemlimit v0.7.5
github.com/XSAM/otelsql v0.42.0
github.com/a-h/templ v0.3.1020
github.com/charmbracelet/colorprofile v0.4.3
github.com/go-playground/form/v4 v4.3.0
Expand Down
2 changes: 2 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,8 @@ github.com/KimMachineGun/automemlimit v0.7.5 h1:RkbaC0MwhjL1ZuBKunGDjE/ggwAX43Dw
github.com/KimMachineGun/automemlimit v0.7.5/go.mod h1:QZxpHaGOQoYvFhv/r4u3U0JTC2ZcOwbSr11UZF46UBM=
github.com/Microsoft/go-winio v0.6.2 h1:F2VQgta7ecxGYO8k3ZZz3RS8fVIXVxONVUPlNERoyfY=
github.com/Microsoft/go-winio v0.6.2/go.mod h1:yd8OoFMLzJbo9gZq8j5qaps8bJ9aShtEA8Ipt1oGCvU=
github.com/XSAM/otelsql v0.42.0 h1:Li0xF4eJUxG2e0x3D4rvRlys1f27yJKvjTh7ljkUP5o=
github.com/XSAM/otelsql v0.42.0/go.mod h1:4mOrEv+cS1KmKzrvTktvJnstr5GtKSAK+QHvFR9OcpI=
github.com/a-h/parse v0.0.0-20250122154542-74294addb73e h1:HjVbSQHy+dnlS6C3XajZ69NYAb5jbGNfHanvm1+iYlo=
github.com/a-h/parse v0.0.0-20250122154542-74294addb73e/go.mod h1:3mnrkvGpurZ4ZrTDbYU84xhwXW2TjTKShSwjRi2ihfQ=
github.com/a-h/templ v0.3.1020 h1:ypAT/L5ySWEnZ6Zft/5yfoWXYYkhFNvEFOeeqecg4tw=
Expand Down
14 changes: 14 additions & 0 deletions internal/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,14 @@ type Config struct {
// Configured via PDBPLUS_OTEL_SAMPLE_RATE. Default is 1.0 (always sample).
OTelSampleRate float64

// OTelSQL enables per-query OpenTelemetry DB spans (XSAM/otelsql) on the
// shared *sql.DB. Configured via PDBPLUS_OTEL_SQL. Default true — the data
// is useful and its volume is bounded by the trace sampler; set
// PDBPLUS_OTEL_SQL=false to disable. Scheduled sync cycles are never traced
// regardless (the prior high-volume concern); only a manually-triggered
// POST /sync is — see internal/otel sampler.
OTelSQL bool

// SyncStaleThreshold is the maximum age of sync data before health reports degraded.
// Configured via PDBPLUS_SYNC_STALE_THRESHOLD. Default is 24h.
SyncStaleThreshold time.Duration
Expand Down Expand Up @@ -280,6 +288,12 @@ func Load() (*Config, error) {
}
cfg.OTelSampleRate = sampleRate

otelSQL, err := parseBool("PDBPLUS_OTEL_SQL", true)
if err != nil {
return nil, fmt.Errorf("parsing PDBPLUS_OTEL_SQL: %w", err)
}
cfg.OTelSQL = otelSQL

syncStaleThreshold, err := parseDuration("PDBPLUS_SYNC_STALE_THRESHOLD", 24*time.Hour)
if err != nil {
return nil, fmt.Errorf("parsing PDBPLUS_SYNC_STALE_THRESHOLD: %w", err)
Expand Down
22 changes: 20 additions & 2 deletions internal/database/database.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ import (

"entgo.io/ent/dialect"
entsql "entgo.io/ent/dialect/sql"
"github.com/XSAM/otelsql"
"go.opentelemetry.io/otel/attribute"
"modernc.org/sqlite"

"github.com/dotwaffle/peeringdb-plus/ent"
Expand Down Expand Up @@ -38,13 +40,29 @@ func init() {
// - temp_store(MEMORY): keeps sorter and temp tables in RAM. modernc.org/
// sqlite's default is FILE which on Fly hits the rootfs overlay (NOT
// tmpfs — verified via /proc/mounts).
func Open(dbPath string) (*ent.Client, *sql.DB, error) {
//
// When traceSQL is true the underlying *sql.DB is opened through XSAM/otelsql,
// so every query — ent's and the raw sync_status statements that share this
// handle — emits an OpenTelemetry span beneath the active request/sync span.
// Controlled by PDBPLUS_OTEL_SQL (default on; set false to disable). Span
// volume is bounded by the trace sampler: scheduled sync cycles are not traced,
// so the high-volume sync path produces no DB spans — see internal/otel sampler.
func Open(dbPath string, traceSQL bool) (*ent.Client, *sql.DB, error) {
dsn := fmt.Sprintf(
"file:%s?_pragma=foreign_keys(1)&_pragma=journal_mode(WAL)&_pragma=busy_timeout(5000)"+
"&_pragma=synchronous(NORMAL)&_pragma=cache_size(-32000)&_pragma=temp_store(MEMORY)",
dbPath,
)
db, err := sql.Open("sqlite3", dsn)
var (
db *sql.DB
err error
)
if traceSQL {
db, err = otelsql.Open("sqlite3", dsn,
otelsql.WithAttributes(attribute.String("db.system", "sqlite")))
} else {
db, err = sql.Open("sqlite3", dsn)
}
if err != nil {
return nil, nil, fmt.Errorf("opening database %s: %w", dbPath, err)
}
Expand Down
29 changes: 26 additions & 3 deletions internal/database/database_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ import (
func TestOpen_Success(t *testing.T) {
t.Parallel()
dbPath := filepath.Join(t.TempDir(), "test.db")
client, db, err := Open(dbPath)
client, db, err := Open(dbPath, false)
if err != nil {
t.Fatalf("Open(%q) error: %v", dbPath, err)
}
Expand All @@ -26,7 +26,7 @@ func TestOpen_Success(t *testing.T) {
func TestOpen_Pragmas(t *testing.T) {
t.Parallel()
dbPath := filepath.Join(t.TempDir(), "test.db")
client, db, err := Open(dbPath)
client, db, err := Open(dbPath, false)
if err != nil {
t.Fatalf("Open(%q) error: %v", dbPath, err)
}
Expand Down Expand Up @@ -55,10 +55,33 @@ func TestOpen_Pragmas(t *testing.T) {
}
}

// TestOpen_TracedSQL verifies the otelsql-wrapped path (traceSQL=true) opens a
// working handle — the instrumentation is transparent to query execution.
// (Span emission is exercised live with PDBPLUS_OTEL_SQL=1, not here, to avoid
// mutating the global TracerProvider from a parallel test.)
func TestOpen_TracedSQL(t *testing.T) {
t.Parallel()
dbPath := filepath.Join(t.TempDir(), "traced.db")
client, db, err := Open(dbPath, true)
if err != nil {
t.Fatalf("Open(%q, true) error: %v", dbPath, err)
}
defer client.Close()
defer db.Close()

var n int
if err := db.QueryRow("SELECT 1").Scan(&n); err != nil {
t.Fatalf("query on otelsql-wrapped DB: %v", err)
}
if n != 1 {
t.Errorf("SELECT 1 = %d, want 1", n)
}
}

func TestOpen_PoolConfig(t *testing.T) {
t.Parallel()
dbPath := filepath.Join(t.TempDir(), "test.db")
client, db, err := Open(dbPath)
client, db, err := Open(dbPath, false)
if err != nil {
t.Fatalf("Open(%q) error: %v", dbPath, err)
}
Expand Down
57 changes: 57 additions & 0 deletions internal/otel/sampler.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,19 @@ import (
sdktrace "go.opentelemetry.io/otel/sdk/trace"
)

// Span-start attributes the sync worker stamps on its root span so this
// sampler can gate sync traces independently of HTTP route. AttrSyncOrigin set
// to SyncOriginValue marks a scheduled sync cycle (dropped by default so it
// emits no trace — and, with PDBPLUS_OTEL_SQL on, no per-query DB spans);
// AttrForceSample set true marks a manually-triggered sync (POST /sync) and
// forces the trace to be sampled. Untyped string constants so they compare
// directly against attribute.Key in the ShouldSample scan.
const (
AttrSyncOrigin = "pdbplus.origin"
SyncOriginValue = "sync"
AttrForceSample = "pdbplus.force_sample"
)

// PerRouteSamplerInput configures NewPerRouteSampler.
//
// Routes maps URL-path prefix (e.g. "/healthz", "/api/") to a 0–1
Expand Down Expand Up @@ -102,6 +115,18 @@ type perRouteSampler struct {
// Hot-path allocation is bounded — the entries slice is pre-sorted at
// construction time and the SamplingResult is the only allocation.
func (s *perRouteSampler) ShouldSample(params sdktrace.SamplingParameters) sdktrace.SamplingResult {
// Sync-trace gating, independent of route: a manual force-sample wins;
// otherwise a sync-origin span (scheduled cycle) is dropped so it emits no
// trace. Non-sync spans fall through to the per-route logic below.
switch syncSampleDecision(params.Attributes) {
case decideForceSample:
return sdktrace.AlwaysSample().ShouldSample(params)
case decideDrop:
return sdktrace.NeverSample().ShouldSample(params)
case decideUnset:
// Not a sync span — fall through to per-route sampling below.
}

path := pathFromAttributes(params.Attributes)
if path == "" {
return s.defaultSampler.ShouldSample(params)
Expand Down Expand Up @@ -163,6 +188,38 @@ func pathFromAttributes(attrs []attribute.KeyValue) string {
return legacy
}

type syncDecision int

const (
decideUnset syncDecision = iota
decideForceSample
decideDrop
)

// syncSampleDecision inspects a span's start attributes for the sync-trace
// gating markers. Returns decideForceSample when AttrForceSample is true
// (manual POST /sync), decideDrop when the span is sync-origin without a force
// flag (scheduled cycle), or decideUnset for everything else (HTTP/internal).
func syncSampleDecision(attrs []attribute.KeyValue) syncDecision {
origin := false
for _, kv := range attrs {
switch kv.Key {
case AttrForceSample:
if kv.Value.AsBool() {
return decideForceSample
}
case AttrSyncOrigin:
if kv.Value.AsString() == SyncOriginValue {
origin = true
}
}
}
if origin {
return decideDrop
}
return decideUnset
}

// Description returns a stable human-readable identifier for OTel debug
// output. Includes the marker "PerRouteSampler" and the configured
// route count for at-a-glance diagnostics.
Expand Down
40 changes: 40 additions & 0 deletions internal/otel/sampler_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,46 @@ func TestPerRouteSampler_APIDispatchedToFullRatio(t *testing.T) {
}
}

// TestPerRouteSampler_SyncTraceGating locks the sync-trace gates: a sync-origin
// span is dropped by default (scheduled cycles emit no trace), force_sample
// always samples (manual POST /sync), and force wins over origin. allOnes
// TraceID makes the assertions deterministic against the route/default ratios.
func TestPerRouteSampler_SyncTraceGating(t *testing.T) {
t.Parallel()

// DefaultRatio 1.0 would otherwise admit allOnes — proves the gate
// overrides route/default for sync-origin spans.
s := NewPerRouteSampler(PerRouteSamplerInput{
DefaultRatio: 1.0,
Routes: map[string]float64{"/api/": 1.0},
})

if res := s.ShouldSample(sampleParams(allOnesTraceID,
attribute.String(AttrSyncOrigin, SyncOriginValue))); res.Decision != sdktrace.Drop {
t.Errorf("origin=sync (no force): got %v, want Drop", res.Decision)
}

// force_sample always samples — here against a 0-ratio default that would
// otherwise drop the all-ones TraceID.
s0 := NewPerRouteSampler(PerRouteSamplerInput{DefaultRatio: 0.0})
if res := s0.ShouldSample(sampleParams(allOnesTraceID,
attribute.Bool(AttrForceSample, true))); res.Decision != sdktrace.RecordAndSample {
t.Errorf("force_sample: got %v, want RecordAndSample", res.Decision)
}

if res := s.ShouldSample(sampleParams(allOnesTraceID,
attribute.String(AttrSyncOrigin, SyncOriginValue),
attribute.Bool(AttrForceSample, true))); res.Decision != sdktrace.RecordAndSample {
t.Errorf("force_sample + origin=sync: got %v, want RecordAndSample (force wins)", res.Decision)
}

// No sync markers: ordinary per-route logic still applies.
if res := s.ShouldSample(sampleParams(allOnesTraceID,
attribute.String("url.path", "/api/net"))); res.Decision != sdktrace.RecordAndSample {
t.Errorf("no markers, /api/ at 1.0: got %v, want RecordAndSample", res.Decision)
}
}

func TestPerRouteSampler_RestV1PrefixMatch(t *testing.T) {
t.Parallel()

Expand Down
28 changes: 27 additions & 1 deletion internal/sync/worker.go
Original file line number Diff line number Diff line change
Expand Up @@ -580,6 +580,24 @@ func (w *Worker) resolveEffectiveMode(ctx context.Context, configured config.Syn
return configured
}

// forceTraceKey is the context key carrying the manual-sync force-trace flag.
type forceTraceKey struct{}

// WithForceTrace marks ctx so the sync cycle run under it force-samples its
// trace, overriding the sampler's default of dropping scheduled-sync traces.
// It is set by the manual POST /sync handler and never by the timer scheduler,
// so an on-demand sync is observable end to end — including its per-query DB
// spans when PDBPLUS_OTEL_SQL is enabled.
func WithForceTrace(ctx context.Context) context.Context {
return context.WithValue(ctx, forceTraceKey{}, true)
}

// forceTraceFromContext reports whether ctx was marked by WithForceTrace.
func forceTraceFromContext(ctx context.Context) bool {
v, _ := ctx.Value(forceTraceKey{}).(bool)
return v
}

func (w *Worker) Sync(ctx context.Context, mode config.SyncMode) (err error) {
ctx = privacy.DecisionContext(ctx, privacy.Allow) // privacy bypass — sole production call site
if !w.running.CompareAndSwap(false, true) {
Expand All @@ -588,7 +606,15 @@ func (w *Worker) Sync(ctx context.Context, mode config.SyncMode) (err error) {
}
defer w.running.Store(false)

ctx, span := otel.Tracer("sync").Start(ctx, "sync-"+string(mode))
// Tag the root sync span so the sampler can gate sync traces: origin=sync
// makes scheduled cycles drop by default; a manual POST /sync sets
// force_sample (via WithForceTrace) so that one cycle — and its per-query
// DB spans when PDBPLUS_OTEL_SQL is on — is sampled.
spanAttrs := []attribute.KeyValue{attribute.String(pdbotel.AttrSyncOrigin, pdbotel.SyncOriginValue)}
if forceTraceFromContext(ctx) {
spanAttrs = append(spanAttrs, attribute.Bool(pdbotel.AttrForceSample, true))
}
ctx, span := otel.Tracer("sync").Start(ctx, "sync-"+string(mode), trace.WithAttributes(spanAttrs...))
defer span.End()

start := time.Now()
Expand Down