TVL Platform - Observability & Monitoring
Summary
This document describes the complete observability stack for the TVL platform, including metrics, traces, logs, dashboards, and alerting. Covers OpenTelemetry instrumentation, custom metrics, structured logging, Grafana dashboards, and Sentry error tracking.
Table of Contents
- Observability Architecture
- OpenTelemetry Configuration
- Metrics
- Distributed Tracing
- Structured Logging
- Grafana Dashboards
- Sentry Error Tracking
- Alerting
- Debugging Workflows
Observability Architecture
Three Pillars of Observability
Technology Stack
| Component | Technology | Purpose | 
|---|---|---|
| Instrumentation | OpenTelemetry (Node.js SDK) | Traces, metrics, logs | 
| Metrics Storage | Grafana Mimir (Prometheus-compatible) | Time-series metrics | 
| Trace Storage | Grafana Tempo | Distributed traces | 
| Log Storage | Grafana Loki | Structured logs | 
| Dashboards | Grafana Cloud | Visualization | 
| Error Tracking | Sentry | Exception monitoring | 
| Uptime Monitoring | UptimeRobot | Synthetic checks | 
| APM | OpenTelemetry + Grafana | Application performance | 
OpenTelemetry Configuration
Installation
# Install OpenTelemetry packages
npm install @opentelemetry/sdk-node \
            @opentelemetry/auto-instrumentations-node \
            @opentelemetry/exporter-trace-otlp-http \
            @opentelemetry/exporter-metrics-otlp-http \
            @opentelemetry/resources \
            @opentelemetry/semantic-conventions
Initialization
File: src/observability/otel.ts
import { NodeSDK } from '@opentelemetry/sdk-node';
import { OTLPTraceExporter } from '@opentelemetry/exporter-trace-otlp-http';
import { OTLPMetricExporter } from '@opentelemetry/exporter-metrics-otlp-http';
import { PeriodicExportingMetricReader } from '@opentelemetry/sdk-metrics';
import { getNodeAutoInstrumentations } from '@opentelemetry/auto-instrumentations-node';
import { Resource } from '@opentelemetry/resources';
import { SemanticResourceAttributes } from '@opentelemetry/semantic-conventions';
// Environment configuration
// OTEL_EXPORTER_OTLP_HEADERS follows the OTLP convention
// "Key1=value1,Key2=value2" and is parsed by parseHeaders() below.
const OTEL_ENABLED = process.env.OTEL_ENABLED === 'true';
const OTEL_ENDPOINT = process.env.OTEL_EXPORTER_OTLP_ENDPOINT || 'https://otlp.grafana.net';
const OTEL_AUTH = process.env.OTEL_EXPORTER_OTLP_HEADERS || '';
const SERVICE_NAME = process.env.OTEL_SERVICE_NAME || 'tvl-api';
const ENVIRONMENT = process.env.NODE_ENV || 'development';
// NOTE(review): exporters and the SDK below are constructed even when
// disabled; only sdk.start() is gated on OTEL_ENABLED (see bottom of file).
if (!OTEL_ENABLED) {
  console.log('OpenTelemetry disabled (OTEL_ENABLED=false)');
}
// Resource attributes
// Identifies this process in Grafana: service name/version, environment,
// and a per-deployment instance id (Railway deployment id, falling back
// to hostname, then 'local' for development).
const resource = new Resource({
  [SemanticResourceAttributes.SERVICE_NAME]: SERVICE_NAME,
  [SemanticResourceAttributes.SERVICE_VERSION]: process.env.APP_VERSION || '1.0.0',
  [SemanticResourceAttributes.DEPLOYMENT_ENVIRONMENT]: ENVIRONMENT,
  'service.instance.id': process.env.RAILWAY_DEPLOYMENT_ID || process.env.HOSTNAME || 'local',
});
// Trace exporter
// Spans are shipped over OTLP/HTTP to <endpoint>/v1/traces.
const traceExporter = new OTLPTraceExporter({
  url: `${OTEL_ENDPOINT}/v1/traces`,
  headers: parseHeaders(OTEL_AUTH),
});
// Metric exporter
// Metrics go to the sibling /v1/metrics path with the same auth headers.
const metricExporter = new OTLPMetricExporter({
  url: `${OTEL_ENDPOINT}/v1/metrics`,
  headers: parseHeaders(OTEL_AUTH),
});
// Metric reader (export every 60 seconds)
const metricReader = new PeriodicExportingMetricReader({
  exporter: metricExporter,
  exportIntervalMillis: 60000,
});
// Initialize SDK
// Auto-instrumentation covers HTTP, Express, Postgres (pg) and ioredis;
// fs instrumentation is disabled because it floods traces with spans.
export const sdk = new NodeSDK({
  resource,
  traceExporter,
  metricReader,
  instrumentations: [
    getNodeAutoInstrumentations({
      // Disable instrumentations that are too verbose
      '@opentelemetry/instrumentation-fs': {
        enabled: false,
      },
      // Configure HTTP instrumentation
      // /health and /metrics are scraped constantly and would dominate
      // trace volume, so they are excluded.
      // NOTE(review): newer @opentelemetry/instrumentation-http versions
      // replaced ignoreIncomingPaths with ignoreIncomingRequestHook —
      // confirm against the pinned package version.
      '@opentelemetry/instrumentation-http': {
        ignoreIncomingPaths: ['/health', '/metrics'],
        requestHook: (span, request) => {
          // Add custom attributes
          // Tags every HTTP span with the caller-supplied request id
          // (falls back to 'unknown' when the header is absent).
          span.setAttribute('http.request.id', request.headers['x-request-id'] || 'unknown');
        },
      },
      // Configure Express instrumentation
      '@opentelemetry/instrumentation-express': {
        enabled: true,
      },
      // Configure Postgres instrumentation
      // NOTE(review): enhancedDatabaseReporting attaches query values to
      // spans — this can surface PII in Tempo; confirm it is acceptable.
      '@opentelemetry/instrumentation-pg': {
        enabled: true,
        enhancedDatabaseReporting: true,
      },
      // Configure Redis instrumentation
      '@opentelemetry/instrumentation-ioredis': {
        enabled: true,
      },
    }),
  ],
});
// Start SDK
if (OTEL_ENABLED) {
  sdk.start();
  console.log('OpenTelemetry SDK started');
  // Graceful shutdown
  // Flushes buffered spans/metrics on SIGTERM before exiting.
  // NOTE(review): process.exit(0) here preempts any other SIGTERM
  // handlers' async cleanup — confirm this is the only SIGTERM hook.
  process.on('SIGTERM', () => {
    sdk
      .shutdown()
      .then(() => console.log('OpenTelemetry SDK shut down'))
      .catch((error) => console.error('Error shutting down OpenTelemetry SDK', error))
      .finally(() => process.exit(0));
  });
}
/**
 * Parses an OTLP header string in the "Key1=value1,Key2=value2"
 * convention (OTEL_EXPORTER_OTLP_HEADERS) into a header map.
 *
 * Splits each entry on the FIRST '=' only, so values that themselves
 * contain '=' — notably Base64 Basic-auth credentials, whose padding
 * ends in '=' — are preserved intact. (A naive split('=') would drop
 * everything after the first '=' in the value.)
 *
 * @param headerString raw comma-separated header string; may be empty
 * @returns header name → value map; malformed entries are skipped
 */
function parseHeaders(headerString: string): Record<string, string> {
  const headers: Record<string, string> = {};
  if (!headerString) return headers;
  for (const entry of headerString.split(',')) {
    const sep = entry.indexOf('=');
    if (sep <= 0) continue; // no '=' or empty key — skip malformed entry
    const key = entry.slice(0, sep).trim();
    const value = entry.slice(sep + 1).trim();
    if (key && value) {
      headers[key] = value;
    }
  }
  return headers;
}
Environment Variables
# Production configuration
OTEL_ENABLED=true
OTEL_EXPORTER_OTLP_ENDPOINT=https://otlp-gateway-prod-us-east-0.grafana.net/otlp
OTEL_EXPORTER_OTLP_HEADERS="Authorization=Basic <base64-encoded-credentials>"
OTEL_SERVICE_NAME=tvl-api
APP_VERSION=1.2.3   # read by otel.ts (service.version) and sentry.ts (release); OTEL_SERVICE_VERSION is not read by the code
NODE_ENV=production
# Staging configuration
OTEL_ENABLED=true
OTEL_EXPORTER_OTLP_ENDPOINT=https://otlp-gateway-prod-us-east-0.grafana.net/otlp
OTEL_EXPORTER_OTLP_HEADERS="Authorization=Basic <base64-encoded-credentials>"
OTEL_SERVICE_NAME=tvl-api-staging
NODE_ENV=staging
# Local development (disabled)
OTEL_ENABLED=false
Application Entrypoint
File: src/server.ts
// Import OpenTelemetry FIRST (before any other imports)
// Auto-instrumentation monkey-patches http/express/pg at require time,
// so this side-effect import must run before those modules are loaded.
import './observability/otel';
// Now import application code
import express from 'express';
import { setupRoutes } from './routes';
import { setupMiddleware } from './middleware';
const app = express();
setupMiddleware(app);
setupRoutes(app);
// NOTE(review): port is hard-coded; most deployment platforms inject
// PORT — confirm whether process.env.PORT should be honoured here.
app.listen(4000, () => {
  console.log('TVL API server listening on port 4000');
});
Metrics
Custom Metrics Namespace
All custom metrics use the tvl_* prefix.
Metric Types
| Metric Type | Use Case | Example | 
|---|---|---|
| Counter | Cumulative count (requests, errors) | tvl_http_requests_total | 
| Gauge | Current value (active connections) | tvl_active_bookings | 
| Histogram | Distribution (latency, sizes) | tvl_request_duration_seconds | 
| Summary | Statistical summary | tvl_payment_amount_usd | 
Custom Metrics Definition
File: src/observability/metrics.ts
import { metrics } from '@opentelemetry/api';
// Single meter for all custom tvl_* metrics; the instrumentation-scope
// name ('tvl-api') appears in the exported metrics' scope attributes.
const meter = metrics.getMeter('tvl-api');
// HTTP Metrics
export const httpRequestsTotal = meter.createCounter('tvl_http_requests_total', {
  description: 'Total number of HTTP requests',
  unit: '1',
});
export const httpRequestDuration = meter.createHistogram('tvl_http_request_duration_seconds', {
  description: 'HTTP request duration in seconds',
  unit: 's',
});
// UpDownCounter (not Counter): callers decrement it when a request finishes.
export const httpRequestsActive = meter.createUpDownCounter('tvl_http_requests_active', {
  description: 'Number of active HTTP requests',
  unit: '1',
});
// Business Metrics - Bookings
export const bookingsTotal = meter.createCounter('tvl_bookings_total', {
  description: 'Total number of bookings created',
  unit: '1',
});
// Observable gauges report nothing until a callback is registered via
// addCallback() — see the queue gauge callbacks for the pattern.
export const bookingsActive = meter.createObservableGauge('tvl_bookings_active', {
  description: 'Number of active bookings',
  unit: '1',
});
export const bookingValue = meter.createHistogram('tvl_booking_value_usd', {
  description: 'Booking value in USD',
  unit: 'USD',
});
// Business Metrics - Payments
export const paymentsTotal = meter.createCounter('tvl_payments_total', {
  description: 'Total number of payments processed',
  unit: '1',
});
export const paymentAmount = meter.createHistogram('tvl_payment_amount_usd', {
  description: 'Payment amount in USD',
  unit: 'USD',
});
export const paymentFailures = meter.createCounter('tvl_payment_failures_total', {
  description: 'Total number of failed payments',
  unit: '1',
});
// Business Metrics - Sync Jobs
export const syncJobsTotal = meter.createCounter('tvl_sync_jobs_total', {
  description: 'Total number of sync jobs processed',
  unit: '1',
});
export const syncJobDuration = meter.createHistogram('tvl_sync_job_duration_seconds', {
  description: 'Sync job duration in seconds',
  unit: 's',
});
export const syncJobFailures = meter.createCounter('tvl_sync_job_failures_total', {
  description: 'Total number of failed sync jobs',
  unit: '1',
});
// Database Metrics
export const dbQueryDuration = meter.createHistogram('tvl_db_query_duration_seconds', {
  description: 'Database query duration in seconds',
  unit: 's',
});
export const dbConnectionsActive = meter.createObservableGauge('tvl_db_connections_active', {
  description: 'Number of active database connections',
  unit: '1',
});
// Redis Metrics
export const redisCommandDuration = meter.createHistogram('tvl_redis_command_duration_seconds', {
  description: 'Redis command duration in seconds',
  unit: 's',
});
export const cacheHits = meter.createCounter('tvl_cache_hits_total', {
  description: 'Total number of cache hits',
  unit: '1',
});
export const cacheMisses = meter.createCounter('tvl_cache_misses_total', {
  description: 'Total number of cache misses',
  unit: '1',
});
// Worker Queue Metrics
export const queueJobsWaiting = meter.createObservableGauge('tvl_queue_jobs_waiting', {
  description: 'Number of jobs waiting in queue',
  unit: '1',
});
export const queueJobsActive = meter.createObservableGauge('tvl_queue_jobs_active', {
  description: 'Number of jobs currently being processed',
  unit: '1',
});
export const queueJobsFailed = meter.createObservableGauge('tvl_queue_jobs_failed', {
  description: 'Number of failed jobs in DLQ',
  unit: '1',
});
Metric Usage Examples
Recording HTTP Requests:
import { httpRequestsTotal, httpRequestDuration } from './observability/metrics';
// Per-request metrics middleware: counts every request and records its
// wall-clock duration, labelled by method, route and status code.
app.use((req, res, next) => {
  const startedAt = Date.now();
  res.on('finish', () => {
    const elapsedSeconds = (Date.now() - startedAt) / 1000;
    // Both instruments share the same label set; 'finish' fires after the
    // router has run, so req.route is populated for matched routes.
    const labels = {
      method: req.method,
      route: req.route?.path || 'unknown',
      status: res.statusCode.toString(),
    };
    httpRequestsTotal.add(1, labels);
    httpRequestDuration.record(elapsedSeconds, labels);
  });
  next();
});
Recording Business Metrics:
import { bookingsTotal, bookingValue } from './observability/metrics';
/**
 * Persists a new booking and records booking business metrics.
 *
 * @param data - validated booking input (schema defined elsewhere — not visible here)
 * @returns the created booking record
 */
async function createBooking(data: CreateBookingInput): Promise<Booking> {
  const booking = await db.bookings.create({ data });
  // Record metrics
  // Counter labelled by org/source/status — drives the Business Metrics dashboard.
  bookingsTotal.add(1, {
    org_id: booking.org_id,
    source: booking.source,
    status: booking.status,
  });
  // Histogram of booking value — presumably total_amount is already USD; TODO confirm.
  bookingValue.record(booking.total_amount, {
    org_id: booking.org_id,
    source: booking.source,
  });
  return booking;
}
Recording Cache Metrics:
import { cacheHits, cacheMisses } from './observability/metrics';
/**
 * Reads a property through the Redis cache (cache-aside pattern).
 *
 * A corrupt cache entry (invalid JSON) no longer throws: it is deleted
 * and treated as a miss, falling through to the database. The hit
 * counter is only incremented after a successful parse, so metrics
 * reflect usable hits.
 *
 * @param propertyId - property primary key
 * @returns the property, or null when it does not exist
 */
async function getCachedProperty(propertyId: string): Promise<Property | null> {
  const cacheKey = `property:${propertyId}`;
  const cached = await redis.get(cacheKey);
  if (cached) {
    try {
      const parsed = JSON.parse(cached) as Property;
      cacheHits.add(1, { key_type: 'property' });
      return parsed;
    } catch {
      // Corrupt cache entry: drop it and fall through to the database.
      await redis.del(cacheKey);
    }
  }
  cacheMisses.add(1, { key_type: 'property' });
  const property = await db.properties.findUnique({ where: { id: propertyId } });
  if (property) {
    // 300s TTL — keep in sync with any other property cache writers.
    await redis.setex(cacheKey, 300, JSON.stringify(property));
  }
  return property;
}
Observable Gauge Callbacks
For gauge metrics that need periodic collection:
import { queueJobsWaiting, queueJobsActive, queueJobsFailed } from './observability/metrics';
// Register callback for observable gauges
// Callbacks run once per metric-collection cycle (every 60s per the
// PeriodicExportingMetricReader) and report the current queue depth.
// NOTE(review): each callback issues its own bullQueue.getJobCounts()
// round-trip, so every cycle makes three identical Redis calls —
// consider a batch observable callback if this becomes noticeable.
queueJobsWaiting.addCallback(async (observableResult) => {
  const counts = await bullQueue.getJobCounts();
  observableResult.observe(counts.waiting, { queue: 'sync-worker' });
});
queueJobsActive.addCallback(async (observableResult) => {
  const counts = await bullQueue.getJobCounts();
  observableResult.observe(counts.active, { queue: 'sync-worker' });
});
queueJobsFailed.addCallback(async (observableResult) => {
  const counts = await bullQueue.getJobCounts();
  observableResult.observe(counts.failed, { queue: 'sync-worker' });
});
Distributed Tracing
Trace Context Propagation
OpenTelemetry automatically propagates trace context across HTTP requests.
Custom Spans
Creating Custom Spans:
// SpanStatusCode was used below but never imported — fixed.
import { trace, SpanStatusCode } from '@opentelemetry/api';
const tracer = trace.getTracer('tvl-api');
/**
 * Syncs availability blocks for one property from Hostaway into the
 * local database, wrapped in a custom span carrying business attributes.
 *
 * @param propertyId - property primary key
 * @throws Error when the property does not exist; rethrows any
 *         database/connector error after recording it on the span
 */
async function syncPropertyAvailability(propertyId: string): Promise<void> {
  // Create span
  const span = tracer.startSpan('sync_property_availability', {
    attributes: {
      'property.id': propertyId,
      'sync.type': 'availability',
    },
  });
  try {
    // Fetch from database (automatically traced)
    const property = await db.properties.findUnique({ where: { id: propertyId } });
    if (!property) {
      // findUnique returns null for unknown ids; fail explicitly instead
      // of crashing on property.name below.
      throw new Error(`Property not found: ${propertyId}`);
    }
    span.setAttribute('property.name', property.name);
    // Fetch from Hostaway (automatically traced)
    const availability = await hostawayClient.getAvailability(property.external_id);
    span.setAttribute('availability.entries_count', availability.length);
    // Update database (automatically traced)
    await db.blocks.createMany({ data: availability });
    span.setStatus({ code: SpanStatusCode.OK });
  } catch (error) {
    // Record exception so the trace shows the failure, then rethrow for
    // the caller's retry/alerting logic.
    span.recordException(error as Error);
    span.setStatus({
      code: SpanStatusCode.ERROR,
      message: (error as Error).message,
    });
    throw error;
  } finally {
    span.end();
  }
}
Trace Attributes
Standard Attributes:
// HTTP attributes (automatic)
'http.method': 'POST'
'http.url': 'https://api.tvl.com/api/v1/bookings'
'http.status_code': 200
'http.request_content_length': 1234
// Database attributes (automatic)
'db.system': 'postgresql'
'db.name': 'tvl_production'
'db.statement': 'SELECT * FROM properties WHERE id = $1'
'db.operation': 'SELECT'
// Redis attributes (automatic)
'db.system': 'redis'
'db.statement': 'GET property:123'
Custom Attributes:
// Business entity attributes
span.setAttribute('booking.id', bookingId);
span.setAttribute('booking.status', booking.status);
span.setAttribute('booking.source', booking.source);
span.setAttribute('org.id', orgId);
span.setAttribute('user.id', userId);
// External API attributes
span.setAttribute('external.service', 'hostaway');
span.setAttribute('external.endpoint', '/v1/properties');
span.setAttribute('external.response_time_ms', 250);
Trace Sampling
Configuration:
// BatchSpanProcessor was used below but never imported — fixed.
import {
  ParentBasedSampler,
  TraceIdRatioBasedSampler,
  BatchSpanProcessor,
} from '@opentelemetry/sdk-trace-node';
// Sample 10% of root traces; child spans follow their parent's decision
// so a trace is never half-sampled.
const sampler = new ParentBasedSampler({
  // Sample 10% of root spans in production
  root: new TraceIdRatioBasedSampler(0.1),
});
// Use in SDK — the sampler must actually be passed to NodeSDK, otherwise
// every span is sampled (the default is AlwaysOn). The original snippet
// constructed the sampler but never wired it in.
const sdk = new NodeSDK({
  // ... other config
  sampler,
  spanProcessor: new BatchSpanProcessor(traceExporter, {
    maxQueueSize: 2048,
    maxExportBatchSize: 512,
    scheduledDelayMillis: 5000,
  }),
});
Structured Logging
Logger Configuration
File: src/observability/logger.ts
import winston from 'winston';
import { trace, context } from '@opentelemetry/api';
// Base format: millisecond timestamp, full error stacks, JSON output so
// Loki/Grafana can parse every field.
const logFormat = winston.format.combine(
  winston.format.timestamp({ format: 'YYYY-MM-DD HH:mm:ss.SSS' }),
  winston.format.errors({ stack: true }),
  winston.format.json()
);
// Injects the active OpenTelemetry trace context (trace_id/span_id/flags)
// into every log line so logs can be correlated with traces in Tempo.
const traceContextFormat = winston.format((info) => {
  const span = trace.getSpan(context.active());
  if (span) {
    const spanContext = span.spanContext();
    info.trace_id = spanContext.traceId;
    info.span_id = spanContext.spanId;
    info.trace_flags = spanContext.traceFlags;
  }
  return info;
});
/**
 * Application-wide structured logger.
 *
 * Every line is a single JSON object with service/environment/version
 * metadata and, when a span is active, the trace context. Level is
 * controlled by LOG_LEVEL (default 'info').
 */
export const logger = winston.createLogger({
  level: process.env.LOG_LEVEL || 'info',
  format: winston.format.combine(traceContextFormat(), logFormat),
  defaultMeta: {
    service: process.env.OTEL_SERVICE_NAME || 'tvl-api',
    environment: process.env.NODE_ENV || 'development',
    version: process.env.APP_VERSION || '1.0.0',
  },
  transports: [
    // Console transport inherits the logger-level JSON format.
    // Do NOT add colorize() here: ANSI escape codes end up embedded
    // inside JSON field values and break log parsing in Loki.
    new winston.transports.Console(),
  ],
});
// Log levels: error, warn, info, http, debug
Log Structure
Standard Log Fields:
{
  "timestamp": "2025-10-24 10:30:45.123",
  "level": "info",
  "message": "Booking created successfully",
  "service": "tvl-api",
  "environment": "production",
  "version": "1.2.3",
  "trace_id": "4bf92f3577b34da6a3ce929d0e0e4736",
  "span_id": "00f067aa0ba902b7",
  "trace_flags": 1,
  "booking_id": "550e8400-e29b-41d4-a716-446655440000",
  "org_id": "660e8400-e29b-41d4-a716-446655440000",
  "user_id": "770e8400-e29b-41d4-a716-446655440000"
}
Usage Examples
Structured Logging:
import { logger } from './observability/logger';
// Info log with context
logger.info('Booking created successfully', {
  booking_id: booking.id,
  org_id: booking.org_id,
  user_id: currentUser.id,
  total_amount: booking.total_amount,
  check_in: booking.check_in,
  check_out: booking.check_out,
});
// Warning log
logger.warn('Payment capture delayed', {
  payment_id: payment.id,
  booking_id: booking.id,
  delay_seconds: delaySeconds,
  reason: 'Stripe API timeout',
});
// Error log with exception
logger.error('Failed to sync availability', {
  property_id: propertyId,
  connector: 'hostaway',
  error: error.message,
  stack: error.stack,
});
// Debug log (only in development)
logger.debug('Cache hit', {
  cache_key: cacheKey,
  ttl_remaining: ttlRemaining,
});
Log Levels
| Level | Usage | Production Visibility | 
|---|---|---|
| error | Application errors, exceptions | Always logged | 
| warn | Warnings, degraded functionality | Always logged | 
| info | Business events, state changes | Always logged | 
| http | HTTP request/response logs | Sampled (10%) | 
| debug | Detailed debugging information | Disabled | 
Sensitive Data Redaction
Auto-Redact PII:
import { logger } from './observability/logger';
// Substrings that mark a key as sensitive (matched case-insensitively).
const REDACTED_FIELDS = ['password', 'ssn', 'credit_card', 'api_key', 'secret'];
/**
 * Recursively replaces the values of sensitive-looking keys with
 * '[REDACTED]' before an object is logged.
 *
 * Arrays are preserved as arrays (each element redacted recursively);
 * the original spread-based copy converted arrays into plain objects
 * with numeric string keys, mangling any logged list. Plain objects are
 * shallow-copied, so the caller's object is never mutated.
 *
 * NOTE(review): non-plain objects (Date, Map, class instances) are still
 * spread into plain objects by this copy — confirm callers only pass
 * JSON-like data.
 *
 * @param obj - any value; non-objects are returned unchanged
 * @returns a redacted copy of obj
 */
function redactSensitiveData(obj: any): any {
  if (typeof obj !== 'object' || obj === null) return obj;
  if (Array.isArray(obj)) {
    // Keep array shape instead of { ...arr } (which yields an object).
    return obj.map((item) => redactSensitiveData(item));
  }
  const redacted = { ...obj };
  for (const key in redacted) {
    if (REDACTED_FIELDS.some((field) => key.toLowerCase().includes(field))) {
      redacted[key] = '[REDACTED]';
    } else if (typeof redacted[key] === 'object') {
      redacted[key] = redactSensitiveData(redacted[key]);
    }
  }
  return redacted;
}
// Usage
logger.info('User created', redactSensitiveData({
  user_id: user.id,
  email: user.email,
  password: user.password, // Will be redacted
}));
Grafana Dashboards
Dashboard: API Overview
Purpose: High-level API health and performance
Panels:
- Request Rate (Graph) — Query: `rate(tvl_http_requests_total[5m])`; Group by: status, method
- Request Duration P95 (Graph) — Query: `histogram_quantile(0.95, rate(tvl_http_request_duration_seconds_bucket[5m]))`; Group by: route
- Error Rate (Single Stat) — Query: `rate(tvl_http_requests_total{status=~"5.."}[5m]) / rate(tvl_http_requests_total[5m])`; Threshold: Warn > 1%, Critical > 5%
- Active Requests (Gauge) — Query: `tvl_http_requests_active`
- Requests by Route (Table) — Query: `sum by (route) (rate(tvl_http_requests_total[5m]))`; Columns: Route, RPS, P50, P95, Error Rate
Dashboard: Business Metrics
Purpose: Business KPIs and revenue tracking
Panels:
- Bookings Created (Graph) — Query: `rate(tvl_bookings_total[1h])`; Group by: source, org_id
- Booking Value (Graph) — Query: `sum(rate(tvl_booking_value_usd_sum[1h]))`; Group by: org_id (use the histogram's `_sum` series for totals)
- Active Bookings (Single Stat) — Query: `tvl_bookings_active`
- Payment Success Rate (Graph) — Query: `rate(tvl_payments_total{status="succeeded"}[1h]) / rate(tvl_payments_total[1h])`
- Payment Failures (Table) — Query: `sum by (reason) (tvl_payment_failures_total)`; Columns: Reason, Count, Percentage
- Revenue (24h) (Single Stat) — Query: `sum(increase(tvl_payment_amount_usd_sum[24h]))` (histogram `_sum` series)
Dashboard: Worker Jobs
Purpose: Background job processing monitoring
Panels:
- Jobs Waiting (Graph) — Query: `tvl_queue_jobs_waiting`; Group by: queue
- Jobs Active (Graph) — Query: `tvl_queue_jobs_active`; Group by: queue
- Jobs Failed (Single Stat) — Query: `tvl_queue_jobs_failed`; Threshold: Warn > 10, Critical > 100
- Job Duration P95 (Graph) — Query: `histogram_quantile(0.95, rate(tvl_sync_job_duration_seconds_bucket[5m]))`; Group by: job_type
- Job Failure Rate (Graph) — Query: `rate(tvl_sync_job_failures_total[5m]) / rate(tvl_sync_jobs_total[5m])`
Dashboard: Database & Cache
Purpose: Database and Redis performance
Panels:
- Database Query Duration P95 (Graph) — Query: `histogram_quantile(0.95, rate(tvl_db_query_duration_seconds_bucket[5m]))`
- Active DB Connections (Gauge) — Query: `tvl_db_connections_active`
- Cache Hit Rate (Graph) — Query: `rate(tvl_cache_hits_total[5m]) / (rate(tvl_cache_hits_total[5m]) + rate(tvl_cache_misses_total[5m]))`
- Redis Command Duration P95 (Graph) — Query: `histogram_quantile(0.95, rate(tvl_redis_command_duration_seconds_bucket[5m]))`
Dashboard Export/Import
# Export dashboard as JSON
curl -H "Authorization: Bearer <grafana-api-key>" \
  https://tvl.grafana.net/api/dashboards/uid/<dashboard-uid> \
  > dashboard-api-overview.json
# Import dashboard
curl -X POST -H "Authorization: Bearer <grafana-api-key>" \
  -H "Content-Type: application/json" \
  -d @dashboard-api-overview.json \
  https://tvl.grafana.net/api/dashboards/db
Sentry Error Tracking
Configuration
File: src/observability/sentry.ts
import * as Sentry from '@sentry/node';
import { ProfilingIntegration } from '@sentry/profiling-node';
// Sentry initialization: error tracking + performance tracing + profiling.
// NOTE(review): ProfilingIntegration is the @sentry/node v7 API; v8
// renamed it to nodeProfilingIntegration() — confirm the pinned version.
Sentry.init({
  dsn: process.env.SENTRY_DSN,
  environment: process.env.NODE_ENV || 'development',
  release: process.env.APP_VERSION || '1.0.0',
  // Performance monitoring
  // 10% transaction sampling in production, 100% everywhere else.
  tracesSampleRate: process.env.NODE_ENV === 'production' ? 0.1 : 1.0,
  // Profiling
  // Relative to sampled transactions: profiles 10% of them.
  profilesSampleRate: 0.1,
  integrations: [
    new ProfilingIntegration(),
  ],
  // Ignore common errors
  // Transient client/network noise that would drown real errors.
  ignoreErrors: [
    'NetworkError',
    'AbortError',
    'TimeoutError',
  ],
  // BeforeSend hook
  // Strips cookies and the Authorization header so credentials never
  // leave the process in an error report.
  beforeSend(event, hint) {
    // Redact sensitive data
    if (event.request) {
      delete event.request.cookies;
      if (event.request.headers) {
        delete event.request.headers.authorization;
      }
    }
    return event;
  },
});
export { Sentry };
Error Capture
Automatic Error Capture:
import express from 'express';
import { Sentry } from './observability/sentry';
const app = express();
// Sentry request handler (must be first middleware)
// requestHandler attaches request data to captured events;
// tracingHandler starts a transaction per request for performance data.
app.use(Sentry.Handlers.requestHandler());
app.use(Sentry.Handlers.tracingHandler());
// ... your routes
// Sentry error handler (must be after routes)
app.use(Sentry.Handlers.errorHandler());
// Fallback error handler
// Runs after Sentry has captured the error: logs with the stack and
// returns a generic 500 so internals are never leaked to clients.
app.use((err, req, res, next) => {
  logger.error('Unhandled error', {
    error: err.message,
    stack: err.stack,
  });
  res.status(500).json({ error: 'Internal server error' });
});
Manual Error Capture:
import { Sentry } from './observability/sentry';
try {
  await processPayment(paymentId);
} catch (error) {
  // Capture exception with context
  Sentry.captureException(error, {
    tags: {
      payment_id: paymentId,
      org_id: orgId,
    },
    contexts: {
      payment: {
        amount: payment.amount,
        currency: payment.currency,
        status: payment.status,
      },
    },
    level: 'error',
  });
  throw error;
}
Custom Error Events:
Sentry.captureMessage('Payment processing delayed', {
  level: 'warning',
  tags: {
    payment_id: paymentId,
    delay_seconds: delaySeconds,
  },
});
Error Context
User Context:
Sentry.setUser({
  id: user.id,
  email: user.email,
  org_id: user.org_id,
});
Custom Context:
Sentry.setContext('booking', {
  id: booking.id,
  status: booking.status,
  check_in: booking.check_in,
  total_amount: booking.total_amount,
});
Alerting
Alert Rules
Critical Alerts (PagerDuty):
- API Downtime — Condition: `up{job="tvl-api"} == 0`; Duration: 1 minute; Notification: PagerDuty
- High Error Rate — Condition: `rate(tvl_http_requests_total{status=~"5.."}[5m]) / rate(tvl_http_requests_total[5m]) > 0.05`; Duration: 5 minutes; Notification: PagerDuty
- Payment Failures — Condition: `rate(tvl_payment_failures_total[5m]) > 10`; Duration: 5 minutes; Notification: PagerDuty
Warning Alerts (Slack):
- High Latency — Condition: `histogram_quantile(0.95, rate(tvl_http_request_duration_seconds_bucket[5m])) > 1`; Duration: 10 minutes; Notification: #engineering-alerts
- Queue Backlog — Condition: `tvl_queue_jobs_waiting > 1000`; Duration: 15 minutes; Notification: #engineering-alerts
- Failed Jobs — Condition: `tvl_queue_jobs_failed > 100`; Duration: 30 minutes; Notification: #engineering-alerts
- Cache Miss Rate High — Condition: `rate(tvl_cache_misses_total[5m]) / (rate(tvl_cache_hits_total[5m]) + rate(tvl_cache_misses_total[5m])) > 0.5`; Duration: 30 minutes; Notification: #engineering-alerts
Alert Configuration
Grafana Alert Rules (YAML):
# /etc/grafana/provisioning/alerting/rules.yaml
apiVersion: 1
groups:
  - name: tvl-api-alerts
    interval: 1m
    rules:
      - uid: tvl-api-downtime
        title: TVL API Downtime
        condition: A
        data:
          - refId: A
            queryType: ''
            relativeTimeRange:
              from: 300
              to: 0
            datasourceUid: prometheus
            model:
              expr: up{job="tvl-api"} == 0
        annotations:
          summary: TVL API is down
          description: API has been unreachable for 1+ minutes
        labels:
          severity: critical
        for: 1m
        noDataState: NoData
        execErrState: Error
Debugging Workflows
Workflow: High Latency Investigation
Step 1: Identify slow endpoints
# Find slowest endpoints (P95 latency). Note: histogram_quantile needs the
# le label preserved, and topk takes the aggregated expression directly —
# "topk(...) by (route)" is not valid PromQL.
topk(10,
  histogram_quantile(0.95,
    sum by (le, route) (rate(tvl_http_request_duration_seconds_bucket[5m]))
  )
)
Step 2: Examine traces
- Go to Grafana → Explore → Tempo
- Query: { resource.service.name = "tvl-api" && duration > 1s } (TraceQL: filters belong inside one selector, and the service name is a resource attribute)
- Select slow trace
- Identify slowest span
Step 3: Check database queries
# Slow database queries
histogram_quantile(0.95,
  rate(tvl_db_query_duration_seconds_bucket[5m])
) > 0.5
Step 4: Review logs
# Grafana Loki query
{service="tvl-api", level="warn"}
|= "slow query"
| json
| duration > 1000
Workflow: Error Rate Spike
Step 1: Identify error type
# Errors by route
sum by (route, status) (
  rate(tvl_http_requests_total{status=~"5.."}[5m])
)
Step 2: Check Sentry
- Go to Sentry dashboard
- Filter by time range
- Group by error message
- Examine stack traces
Step 3: Correlate with deployment
# Check deployment logs
{service="tvl-api"}
|= "deployment"
| json
| __error__=""
Step 4: Review external dependencies
# External API errors
rate(tvl_sync_job_failures_total{reason="external_api_error"}[5m])
Sources
- /mnt/c/GitHub/claude-test/docs/01-architecture/deployment-views.md
- /mnt/c/GitHub/claude-test/docs/01-architecture/logical-architecture.md
- OpenTelemetry documentation
- Grafana observability best practices