Skip to content

Commit dac9c83

Browse files
authored
chore(webapp,run-engine): downgrade boundary log noise to warn (#3462)
## Summary

Several boundary catches and customer-input validation paths were logging at `error` level for failures the system already handles gracefully — disconnect on auth failure, return undefined, skip retries, etc. This batch routes them to `warn` (which stays in stdout) or counts them as OTel metrics, so visibility is preserved without surfacing them as alerts.

## Changes

**New helper / pattern:**

- `apiBuilder.server.ts` — `logBoundaryError(message, error, url)` inspects the inner error type at loader/action boundary catches; downgrades to `warn` for `AbortError`, `ServiceValidationError`, and `EngineServiceValidationError`.
- `platform.v3.server.ts` — `platform_client.failures_total` OTel counter with `{function, kind}` labels; helper `recordPlatformFailure(fn, kind)` replaces the previous error-level logging across all `BillingClient` wrappers.

**Log-level downgrades:**

- `handleSocketIo.server.ts` — `Worker authentication failed` → warn (system disconnects on failure; refs TRI-8863)
- `waitpointSystem.ts` — when `runStatus === "CANCELED"` in the suspended-without-checkpoint branch, skip the throw and warn instead (benign cancel-vs-resume race, nothing to resume)
- `runAttemptSystem.ts` — `flushedMetadata` parse/validate failures → warn (customer-side data shape, system returns gracefully)
- `batch-queue/index.ts` — final-attempt failures with `result.skipRetries` → warn (callbacks already opted out of retry, e.g. queue size limit hit)
- `queryPerformanceMonitor.server.ts` — slow queries → warn (observability signal, not an application error)
- `timeoutDeployment.server.ts` — deployment-state mismatch in the timeout job → warn (timeout-vs-completion race)

**Inner error preservation:**

- `waitpointCompletionPacket.server.ts` — `logger.error(uploadError)` before throwing the `ServiceValidationError` wrapper, so the underlying upload error stays visible.
## Why

The pattern across all of these is the same: a boundary log treated any thrown/returned error as `error` regardless of cause, even when the cause was an expected, system-handled condition (client disconnect, customer quota, race condition, schema validation of customer data). That made the logs noisy and made it harder to spot real bugs. Where the underlying signal is still useful operationally (slow queries, billing call failures), we route it to OTel metrics with low-cardinality labels so dashboards and alerts can be tuned independently of error logs.

## Test plan

- [ ] `pnpm run typecheck --filter webapp`
- [ ] `pnpm run build --filter @internal/run-engine`
- [ ] Trigger a run on hello-world and verify task lifecycle is unaffected
- [ ] Cancel a suspended run and verify the cancel-while-suspended branch in `waitpointSystem.ts` returns `{status: "skipped"}` instead of throwing
- [ ] Confirm `platform_client.failures_total` counter shows up in metrics with `{function, kind}` labels when the billing client errors
1 parent 1a7943c commit dac9c83

11 files changed

Lines changed: 180 additions & 124 deletions

File tree

apps/webapp/app/runEngine/concerns/waitpointCompletionPacket.server.ts

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ import { type IOPacket, packetRequiresOffloading, tryCatch } from "@trigger.dev/
22
import type { AuthenticatedEnvironment } from "~/services/apiAuth.server";
33
import { env } from "~/env.server";
44
import { uploadPacketToObjectStore } from "~/v3/objectStore.server";
5+
import { logger } from "~/services/logger.server";
56
import { ServiceValidationError } from "~/v3/services/common.server";
67

78
function packetExtensionForDataType(dataType: string): string {
@@ -53,6 +54,11 @@ export async function processWaitpointCompletionPacket(
5354
);
5455

5556
if (uploadError) {
57+
logger.error("Failed to upload large waitpoint to object store", {
58+
error: uploadError,
59+
filename,
60+
environmentId: environment.id,
61+
});
5662
throw new ServiceValidationError("Failed to upload large waitpoint to object store", 500);
5763
}
5864

apps/webapp/app/services/platform.v3.server.ts

Lines changed: 51 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ import { newProjectPath, organizationBillingPath } from "~/utils/pathBuilder";
3131
import { singleton } from "~/utils/singleton";
3232
import { RedisCacheStore } from "./unkey/redisCacheStore.server";
3333
import { $replica } from "~/db.server";
34+
import { metrics } from "@opentelemetry/api";
3435

3536
function initializeClient() {
3637
if (isCloud() && process.env.BILLING_API_URL && process.env.BILLING_API_KEY) {
@@ -43,6 +44,23 @@ function initializeClient() {
4344
}
4445

4546
const client = singleton("billingClient", initializeClient);
47+
// Failures from @trigger.dev/platform billing client calls are tracked via
48+
// this metric (with low-cardinality {function, kind} labels) rather than
49+
// logged. Every task invocation hits these paths, so per-call logs were too
50+
// noisy; dashboard the counter for visibility instead.
51+
const platformClientMeter = metrics.getMeter("trigger.dev/platform-client");
52+
const platformClientFailuresCounter = platformClientMeter.createCounter(
53+
"platform_client.failures_total",
54+
{
55+
description:
56+
"Failures returned or thrown by @trigger.dev/platform billing client calls",
57+
}
58+
);
59+
60+
function recordPlatformFailure(fn: string, kind: "caught" | "no_success") {
61+
platformClientFailuresCounter.add(1, { function: fn, kind });
62+
}
63+
4664

4765
function initializePlatformCache() {
4866
const ctx = new DefaultStatefulContext();
@@ -206,7 +224,7 @@ export async function getCurrentPlan(orgId: string) {
206224
firstDayOfNextMonth.setUTCHours(0, 0, 0, 0);
207225

208226
if (!result.success) {
209-
logger.error("Error getting current plan - no success", { orgId, error: result.error });
227+
recordPlatformFailure("getCurrentPlan", "no_success");
210228
return undefined;
211229
}
212230

@@ -222,7 +240,7 @@ export async function getCurrentPlan(orgId: string) {
222240

223241
return { ...result, usage };
224242
} catch (e) {
225-
logger.error("Error getting current plan - caught error", { orgId, error: e });
243+
recordPlatformFailure("getCurrentPlan", "caught");
226244
return undefined;
227245
}
228246
}
@@ -233,13 +251,13 @@ export async function getLimits(orgId: string) {
233251
try {
234252
const result = await client.currentPlan(orgId);
235253
if (!result.success) {
236-
logger.error("Error getting limits - no success", { orgId, error: result.error });
254+
recordPlatformFailure("getLimits", "no_success");
237255
return undefined;
238256
}
239257

240258
return result.v3Subscription?.plan?.limits;
241259
} catch (e) {
242-
logger.error("Error getting limits - caught error", { orgId, error: e });
260+
recordPlatformFailure("getLimits", "caught");
243261
return undefined;
244262
}
245263
}
@@ -315,7 +333,7 @@ export async function customerPortalUrl(orgId: string, orgSlug: string) {
315333
returnUrl: `${env.APP_ORIGIN}${organizationBillingPath({ slug: orgSlug })}`,
316334
});
317335
} catch (e) {
318-
logger.error("Error getting customer portal Url", { orgId, error: e });
336+
recordPlatformFailure("customerPortalUrl", "caught");
319337
return undefined;
320338
}
321339
}
@@ -326,12 +344,12 @@ export async function getPlans() {
326344
try {
327345
const result = await client.plans();
328346
if (!result.success) {
329-
logger.error("Error getting plans - no success", { error: result.error });
347+
recordPlatformFailure("getPlans", "no_success");
330348
return undefined;
331349
}
332350
return result;
333351
} catch (e) {
334-
logger.error("Error getting plans - caught error", { error: e });
352+
recordPlatformFailure("getPlans", "caught");
335353
return undefined;
336354
}
337355
}
@@ -408,12 +426,12 @@ export async function setConcurrencyAddOn(organizationId: string, amount: number
408426
try {
409427
const result = await client.setAddOn(organizationId, { type: "concurrency", amount });
410428
if (!result.success) {
411-
logger.error("Error setting concurrency add on - no success", { error: result.error });
429+
recordPlatformFailure("setConcurrencyAddOn", "no_success");
412430
return undefined;
413431
}
414432
return result;
415433
} catch (e) {
416-
logger.error("Error setting concurrency add on - caught error", { error: e });
434+
recordPlatformFailure("setConcurrencyAddOn", "caught");
417435
return undefined;
418436
}
419437
}
@@ -424,12 +442,12 @@ export async function setSeatsAddOn(organizationId: string, amount: number) {
424442
try {
425443
const result = await client.setAddOn(organizationId, { type: "seats", amount });
426444
if (!result.success) {
427-
logger.error("Error setting seats add on - no success", { error: result.error });
445+
recordPlatformFailure("setSeatsAddOn", "no_success");
428446
return undefined;
429447
}
430448
return result;
431449
} catch (e) {
432-
logger.error("Error setting seats add on - caught error", { error: e });
450+
recordPlatformFailure("setSeatsAddOn", "caught");
433451
return undefined;
434452
}
435453
}
@@ -440,12 +458,12 @@ export async function setBranchesAddOn(organizationId: string, amount: number) {
440458
try {
441459
const result = await client.setAddOn(organizationId, { type: "branches", amount });
442460
if (!result.success) {
443-
logger.error("Error setting branches add on - no success", { error: result.error });
461+
recordPlatformFailure("setBranchesAddOn", "no_success");
444462
return undefined;
445463
}
446464
return result;
447465
} catch (e) {
448-
logger.error("Error setting branches add on - caught error", { error: e });
466+
recordPlatformFailure("setBranchesAddOn", "caught");
449467
return undefined;
450468
}
451469
}
@@ -456,12 +474,12 @@ export async function getUsage(organizationId: string, { from, to }: { from: Dat
456474
try {
457475
const result = await client.usage(organizationId, { from, to });
458476
if (!result.success) {
459-
logger.error("Error getting usage - no success", { error: result.error });
477+
recordPlatformFailure("getUsage", "no_success");
460478
return undefined;
461479
}
462480
return result;
463481
} catch (e) {
464-
logger.error("Error getting usage - caught error", { error: e });
482+
recordPlatformFailure("getUsage", "caught");
465483
return undefined;
466484
}
467485
}
@@ -490,12 +508,12 @@ export async function getUsageSeries(organizationId: string, params: UsageSeries
490508
try {
491509
const result = await client.usageSeries(organizationId, params);
492510
if (!result.success) {
493-
logger.error("Error getting usage series - no success", { error: result.error });
511+
recordPlatformFailure("getUsageSeries", "no_success");
494512
return undefined;
495513
}
496514
return result;
497515
} catch (e) {
498-
logger.error("Error getting usage series - caught error", { error: e });
516+
recordPlatformFailure("getUsageSeries", "caught");
499517
return undefined;
500518
}
501519
}
@@ -514,12 +532,12 @@ export async function reportInvocationUsage(
514532
additionalData,
515533
});
516534
if (!result.success) {
517-
logger.error("Error reporting invocation - no success", { error: result.error });
535+
recordPlatformFailure("reportInvocationUsage", "no_success");
518536
return undefined;
519537
}
520538
return result;
521539
} catch (e) {
522-
logger.error("Error reporting invocation - caught error", { error: e });
540+
recordPlatformFailure("reportInvocationUsage", "caught");
523541
return undefined;
524542
}
525543
}
@@ -550,12 +568,12 @@ export async function getEntitlement(
550568
try {
551569
const response = await client.getEntitlement(organizationId);
552570
if (!response.success) {
553-
logger.error("Error getting entitlement - no success", { error: response.error });
571+
recordPlatformFailure("getEntitlement", "no_success");
554572
return undefined;
555573
}
556574
return response;
557575
} catch (e) {
558-
logger.error("Error getting entitlement - caught error", { error: e });
576+
recordPlatformFailure("getEntitlement", "caught");
559577
return undefined;
560578
}
561579
});
@@ -602,7 +620,7 @@ export async function getBillingAlerts(
602620
if (!client) return undefined;
603621
const result = await client.getBillingAlerts(organizationId);
604622
if (!result.success) {
605-
logger.error("Error getting billing alert", { error: result.error, organizationId });
623+
recordPlatformFailure("getBillingAlert", "no_success");
606624
throw new Error("Error getting billing alert");
607625
}
608626
return result;
@@ -615,7 +633,7 @@ export async function setBillingAlert(
615633
if (!client) return undefined;
616634
const result = await client.updateBillingAlerts(organizationId, alert);
617635
if (!result.success) {
618-
logger.error("Error setting billing alert", { error: result.error, organizationId });
636+
recordPlatformFailure("setBillingAlert", "no_success");
619637
throw new Error("Error setting billing alert");
620638
}
621639
return result;
@@ -628,11 +646,7 @@ export async function generateRegistryCredentials(
628646
if (!client) return undefined;
629647
const result = await client.generateRegistryCredentials(projectId, region);
630648
if (!result.success) {
631-
logger.error("Error generating registry credentials", {
632-
error: result.error,
633-
projectId,
634-
region,
635-
});
649+
recordPlatformFailure("generateRegistryCredentials", "no_success");
636650
throw new Error("Failed to generate registry credentials");
637651
}
638652

@@ -651,13 +665,7 @@ export async function enqueueBuild(
651665
if (!client) return undefined;
652666
const result = await client.enqueueBuild(projectId, { deploymentId, artifactKey, options });
653667
if (!result.success) {
654-
logger.error("Error enqueuing build", {
655-
error: result.error,
656-
projectId,
657-
deploymentId,
658-
artifactKey,
659-
options,
660-
});
668+
recordPlatformFailure("enqueueBuild", "no_success");
661669
throw new Error("Failed to enqueue build");
662670
}
663671

@@ -672,12 +680,12 @@ export async function getPrivateLinks(
672680
const [error, result] = await tryCatch(client.getPrivateLinks(organizationId));
673681

674682
if (error) {
675-
logger.error("Error getting private links", { organizationId, error });
683+
recordPlatformFailure("getPrivateLinks", "caught");
676684
return undefined;
677685
}
678686

679687
if (!result.success) {
680-
logger.error("Error getting private links - no success", { organizationId, error: result.error });
688+
recordPlatformFailure("getPrivateLinks", "no_success");
681689
return undefined;
682690
}
683691

@@ -693,12 +701,12 @@ export async function createPrivateLink(
693701
const [error, result] = await tryCatch(client.createPrivateLink(organizationId, body));
694702

695703
if (error) {
696-
logger.error("Error creating private link", { organizationId, error });
704+
recordPlatformFailure("createPrivateLink", "caught");
697705
throw error;
698706
}
699707

700708
if (!result.success) {
701-
logger.error("Error creating private link - no success", { organizationId, error: result.error });
709+
recordPlatformFailure("createPrivateLink", "no_success");
702710
throw new Error(result.error ?? "Failed to create private link");
703711
}
704712

@@ -714,12 +722,12 @@ export async function deletePrivateLink(
714722
const [error, result] = await tryCatch(client.deletePrivateLink(organizationId, connectionId));
715723

716724
if (error) {
717-
logger.error("Error deleting private link", { organizationId, connectionId, error });
725+
recordPlatformFailure("deletePrivateLink", "caught");
718726
throw error;
719727
}
720728

721729
if (!result.success) {
722-
logger.error("Error deleting private link - no success", { organizationId, connectionId, error: result.error });
730+
recordPlatformFailure("deletePrivateLink", "no_success");
723731
throw new Error(result.error ?? "Failed to delete private link");
724732
}
725733
}
@@ -732,12 +740,12 @@ export async function getPrivateLinkRegions(
732740
const [error, result] = await tryCatch(client.getPrivateLinkRegions(organizationId));
733741

734742
if (error) {
735-
logger.error("Error getting private link regions", { organizationId, error });
743+
recordPlatformFailure("getPrivateLinkRegions", "caught");
736744
return undefined;
737745
}
738746

739747
if (!result.success) {
740-
logger.error("Error getting private link regions - no success", { organizationId, error: result.error });
748+
recordPlatformFailure("getPrivateLinkRegions", "no_success");
741749
return undefined;
742750
}
743751

0 commit comments

Comments
 (0)