From ca84754bd28888f855f79b498b4fd6e3f6986cb9 Mon Sep 17 00:00:00 2001 From: Stringadmin Date: Tue, 9 Jun 2026 12:02:29 +0800 Subject: [PATCH] fix: feed provider health into circuit breaker --- .env.example | 1 + src/providerHealthMonitor.js | 23 ++++++++++++++++++----- src/routes/ai.js | 10 ++++++++-- 3 files changed, 27 insertions(+), 7 deletions(-) diff --git a/.env.example b/.env.example index 03cf184..9596af2 100644 --- a/.env.example +++ b/.env.example @@ -25,6 +25,7 @@ PG_POOL_MAX=10 TASK_PROVIDER_POLL_MAX_CONCURRENCY=8 TASK_PROVIDER_POLL_SLOT_TTL_MS=30000 TASK_PROVIDER_POLL_REQUEST_TIMEOUT_MS=25000 +GRSAI_IMAGE_SUBMIT_TIMEOUT_MS=30000 # CORS (comma separated allowed origins, * for all) CORS_ORIGINS=* diff --git a/src/providerHealthMonitor.js b/src/providerHealthMonitor.js index 0b7cd61..7bde689 100644 --- a/src/providerHealthMonitor.js +++ b/src/providerHealthMonitor.js @@ -7,6 +7,7 @@ */ const { pool } = require("./db"); +const { recordProviderSuccess, recordProviderFailure, getAllBreakerStats } = require("./providerCircuitBreaker"); const CHECK_INTERVAL_MS = 5 * 60 * 1000; const DASHSCOPE_TEST_MODEL = "qwen-max"; @@ -21,6 +22,15 @@ const providerHealthCache = { grsai: { status: "unknown", lastCheck: null, lastError: null, details: null }, }; +function recordProbeOutcome(provider, result, latencyMs) { + if (!provider) return; + if (result?.ok) { + recordProviderSuccess(provider, latencyMs); + } else { + recordProviderFailure(provider); + } +} + async function getDashScopeKey() { const { rows } = await pool.query( "SELECT id, api_key FROM api_keys WHERE provider LIKE '%dashscope%' AND enabled = 1 ORDER BY id LIMIT 1" @@ -120,8 +130,10 @@ async function runHealthCheck() { // ── DashScope ── const dashKey = await getDashScopeKey(); if (dashKey) { + const startedAt = Date.now(); try { const result = await probeDashScope(dashKey); + recordProbeOutcome("dashscope", result, Date.now() - startedAt); const prev = providerHealthCache.dashscope.status; providerHealthCache.dashscope = { status: result.status, @@ -144,6 +156,7 @@ async function runHealthCheck() { } } } catch (err) { + recordProviderFailure("dashscope"); providerHealthCache.dashscope = { status: "timeout", lastCheck: new Date().toISOString(), @@ -164,8 +177,10 @@ async function runHealthCheck() { // ── GrsAI ── const grsaiKey = await getGrsaiKey(); if (grsaiKey) { + const startedAt = Date.now(); try { const result = await probeGrsai(grsaiKey); + recordProbeOutcome("grsai", result, Date.now() - startedAt); const prev = providerHealthCache.grsai.status; providerHealthCache.grsai = { status: result.status, @@ -186,6 +201,7 @@ async function runHealthCheck() { } } } catch (err) { + recordProviderFailure("grsai"); providerHealthCache.grsai = { status: "timeout", lastCheck: new Date().toISOString(), @@ -204,10 +220,7 @@ async function runHealthCheck() { } // ── Circuit breaker summary ── - try { - const cb = require("./providerCircuitBreaker"); - providerHealthCache.circuitBreaker = cb.getProviderStatusMap ? cb.getProviderStatusMap() : null; - } catch {} + providerHealthCache.circuitBreaker = getAllBreakerStats(); // ── Admin low-balance alert ── try { @@ -256,4 +269,4 @@ module.exports = { stopProviderHealthMonitor, getProviderHealthCache, runHealthCheck, -}; \ No newline at end of file +}; diff --git a/src/routes/ai.js b/src/routes/ai.js index d71cf4e..3666f34 100644 --- a/src/routes/ai.js +++ b/src/routes/ai.js @@ -4,7 +4,7 @@ const crypto = require("node:crypto"); const { requireAuth, keyManager, preauthorizeCall, pool, withTransaction, deductImageGenerationCredits } = require("./context"); const { putObject, isOssConfigured } = require("../ossClient"); const { buildImageProviderDebug, resolveImageProviderCandidates, resolveVideoProvider, resolveTextProvider, getPostUrl } = require("../aiProviderRouter"); -const { shouldSkipProvider, recordProviderSuccess, recordProviderFailure } = require("../providerCircuitBreaker"); +const { shouldSkipProvider, recordProviderSuccess, recordProviderFailure, getAdaptiveTimeout } = require("../providerCircuitBreaker"); const { isEnterpriseVideoBillingUser, markEnterpriseVideoCreditsAccepted, @@ -60,6 +60,7 @@ function toViapiAccessibleUrl(url) { const SUPER_RESOLVE_POLL_INTERVAL_MS = 3000; const SUPER_RESOLVE_MAX_POLL_ATTEMPTS = 200; const IMAGE_PROVIDER_SUBMIT_TIMEOUT_MS = 90_000; +const GRSAI_IMAGE_SUBMIT_TIMEOUT_MS = Number(process.env.GRSAI_IMAGE_SUBMIT_TIMEOUT_MS || 30_000); const GEMINI_IMAGE_SUBMIT_TIMEOUT_MS = 180_000; const DASHSCOPE_VIDEO_STYLE_ENDPOINT = "https://dashscope.aliyuncs.com/api/v1/services/aigc/video-generation/video-synthesis"; const DASHSCOPE_IMAGE_EDIT_ENDPOINT = "https://dashscope.aliyuncs.com/api/v1/services/aigc/image2image/image-synthesis"; @@ -2035,7 +2036,12 @@ async function submitImageToProvider(taskDbId, providerConfig, slotResult, param const { headers, body } = buildImageRequest(providerConfig, params, slotResult.apiKey); await updateTaskInDb(taskDbId, { status: "running", progress: 10 }); - const submitTimeout = providerConfig.transport === "gemini-image" ? GEMINI_IMAGE_SUBMIT_TIMEOUT_MS : IMAGE_PROVIDER_SUBMIT_TIMEOUT_MS; + const defaultSubmitTimeout = providerConfig.transport === "gemini-image" + ? GEMINI_IMAGE_SUBMIT_TIMEOUT_MS + : providerConfig.transport === "grsai-image" + ? GRSAI_IMAGE_SUBMIT_TIMEOUT_MS + : IMAGE_PROVIDER_SUBMIT_TIMEOUT_MS; + const submitTimeout = getAdaptiveTimeout(providerConfig.provider, defaultSubmitTimeout); const response = await fetchWithTimeout(url, { method: "POST", headers, body: JSON.stringify(body) }, submitTimeout); if (!response.ok) { const errText = await response.text().catch(() => "provider error");