273 lines
8.7 KiB
JavaScript
273 lines
8.7 KiB
JavaScript
/**
 * Provider Health Monitor — periodic health checks for DashScope and other providers.
 *
 * - Every 5 minutes, probes DashScope with a lightweight text call.
 * - If Arrearage or an auth failure is detected, logs an error and inserts an admin notification.
 * - Tracks provider health status in memory for the /api/admin/providers/status endpoint.
 */
|
|
|
|
const { pool } = require("./db");
|
|
const { recordProviderSuccess, recordProviderFailure, getAllBreakerStats } = require("./providerCircuitBreaker");
|
|
|
|
// Delay between two scheduled health-check runs: five minutes, in milliseconds.
const CHECK_INTERVAL_MS = 300_000;

// Model name sent in the DashScope probe request.
const DASHSCOPE_TEST_MODEL = "qwen-max";

// Chat-completions endpoint the DashScope probe posts to.
const DASHSCOPE_URL = "https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions";

// Balance (in cents) below which an admin account is reported as low.
const LOW_BALANCE_THRESHOLD = 500;

// Handle of the active interval timer; null while the monitor is not running.
let timerId = null;

// In-memory health cache for the status endpoint.
// Each provider gets its own fresh entry, so updating one never touches the other.
const providerHealthCache = Object.fromEntries(
  ["dashscope", "grsai"].map((providerName) => [
    providerName,
    { status: "unknown", lastCheck: null, lastError: null, details: null },
  ])
);
|
|
|
|
/**
 * Report the outcome of one probe to the provider circuit-breaker helpers.
 *
 * @param {string} provider - Provider identifier; nothing is recorded when it is falsy.
 * @param {{ok?: boolean}|null|undefined} result - Probe result; a truthy `ok` counts as success.
 * @param {number} latencyMs - Probe duration, forwarded only for successful probes.
 * @returns {void}
 */
function recordProbeOutcome(provider, result, latencyMs) {
  if (!provider) {
    return;
  }

  const succeeded = Boolean(result?.ok);
  if (!succeeded) {
    recordProviderFailure(provider);
    return;
  }

  recordProviderSuccess(provider, latencyMs);
}
|
|
|
|
/**
 * Look up the API key used for DashScope probes.
 *
 * Picks the enabled `api_keys` row with the lowest id whose provider contains "dashscope".
 *
 * @returns {Promise<string|null>} The stored key, or null when no matching row exists.
 */
async function getDashScopeKey() {
  const sql =
    "SELECT id, api_key FROM api_keys WHERE provider LIKE '%dashscope%' AND enabled = 1 ORDER BY id LIMIT 1";
  const { rows: matches } = await pool.query(sql);
  return matches.length ? matches[0].api_key : null;
}
|
|
|
|
/**
 * Look up the API key used for GrsAI probes.
 *
 * Picks the enabled `api_keys` row with the lowest id whose provider is exactly "grsai".
 *
 * @returns {Promise<string|null>} The stored key, or null when no matching row exists.
 */
async function getGrsaiKey() {
  const sql =
    "SELECT id, api_key FROM api_keys WHERE provider = 'grsai' AND enabled = 1 ORDER BY id LIMIT 1";
  const { rows: matches } = await pool.query(sql);
  return matches.length ? matches[0].api_key : null;
}
|
|
|
|
/**
 * Probe DashScope with a minimal chat-completion request and classify the response.
 *
 * Classification of non-2xx responses:
 * - 400 / 401 / 403 with error code "Arrearage"           -> status "arrears"
 * - 401 or 403 (any code), or error code "AccessDenied"   -> status "denied"
 * - any other 400                                         -> status "error" (code from the body)
 * - every other non-2xx status                            -> status "error", code "http_error"
 *
 * @param {string} apiKey - DashScope API key, sent as a Bearer token.
 * @returns {Promise<{ok: boolean, status: string, error: string|null, code?: string}>}
 *   Probe result; `code` is only present on failures.
 * @throws Rejects when fetch itself fails (network error, or abort after the 30 s timeout).
 */
async function probeDashScope(apiKey) {
  const payload = {
    model: DASHSCOPE_TEST_MODEL,
    messages: [{ role: "user", content: "ping" }],
    stream: false,
    max_tokens: 4, // keep the probe as cheap as possible
    enable_thinking: false,
  };
  const res = await fetch(DASHSCOPE_URL, {
    method: "POST",
    headers: { "Content-Type": "application/json", Authorization: `Bearer ${apiKey}` },
    body: JSON.stringify(payload),
    signal: AbortSignal.timeout(30000),
  });
  // Always drain the body so the connection can be released, even on success.
  const text = await res.text();

  if (res.ok) {
    return { ok: true, status: "healthy", error: null };
  }

  // 401 is included so that a rejected API key is reported as an auth failure
  // instead of disappearing into the generic "http_error" bucket.
  const isAuthStatus = res.status === 401 || res.status === 403;
  if (res.status !== 400 && !isAuthStatus) {
    return { ok: false, status: "error", error: `DashScope 返回 HTTP ${res.status}`, code: "http_error" };
  }

  let errorCode = "";
  try {
    // Optional chaining tolerates bodies that are valid JSON but not objects (e.g. "null").
    errorCode = JSON.parse(text)?.error?.code || "";
  } catch {
    // Non-JSON error body: classify by HTTP status alone.
  }

  if (errorCode === "Arrearage") {
    return { ok: false, status: "arrears", error: "DashScope 账户欠费,所有 qwen 模型不可用", code: errorCode };
  }
  if (errorCode === "AccessDenied" || isAuthStatus) {
    return { ok: false, status: "denied", error: "DashScope 访问被拒绝", code: errorCode };
  }
  return { ok: false, status: "error", error: `DashScope 返回 HTTP ${res.status}: ${errorCode}`, code: errorCode };
}
|
|
|
|
/**
 * Probe GrsAI with a minimal chat-completion request.
 *
 * @param {string} apiKey - GrsAI API key, sent as a Bearer token.
 * @returns {Promise<{ok: boolean, status: string, error: string|null}>}
 *   "healthy" for any 2xx response; otherwise "error" with the HTTP status and
 *   up to 200 characters of the response body in `error`.
 * @throws Rejects when fetch itself fails (network error, or abort after the 30 s timeout).
 */
async function probeGrsai(apiKey) {
  // GrsAI uses the same OpenAI-compatible request shape as the DashScope probe.
  const GRSAI_BASE = "https://grsai.dakka.com.cn";
  const url = `${GRSAI_BASE}/v1/chat/completions`;
  const payload = {
    model: "gemini-3.1-pro",
    messages: [{ role: "user", content: "ping" }],
    stream: false,
    max_tokens: 4, // keep the probe as cheap as possible
  };
  const res = await fetch(url, {
    method: "POST",
    headers: { "Content-Type": "application/json", Authorization: `Bearer ${apiKey}` },
    body: JSON.stringify(payload),
    signal: AbortSignal.timeout(30000),
  });

  if (!res.ok) {
    const errText = await res.text().catch(() => "");
    return { ok: false, status: "error", error: `GrsAI 返回 HTTP ${res.status}: ${errText.slice(0, 200)}` };
  }

  // The success payload is not needed, but an unread body can keep the underlying
  // connection busy until garbage collection, so discard it explicitly.
  try {
    await res.body?.cancel();
  } catch {
    // Best-effort cleanup: a failed cancel must not turn a healthy probe into a failure.
  }

  return { ok: true, status: "healthy", error: null };
}
|
|
|
|
/**
 * Insert a "provider_health" web notification for every enabled admin user.
 *
 * Rows are inserted one after another; a failing insert rejects the returned
 * promise and leaves the remaining admins without a notification.
 *
 * @param {string} title - Notification title.
 * @param {string} description - Notification body text.
 * @returns {Promise<void>}
 */
async function notifyAdmin(title, description) {
  const adminLookup = await pool.query(
    "SELECT id FROM users WHERE role = 'admin' AND enabled = 1"
  );
  const recipients = adminLookup.rows;

  if (recipients.length === 0) {
    console.error("[providerHealthMonitor] No admin users found for notification");
    return;
  }

  const insertSql = `INSERT INTO web_notifications (user_id, type, title, description, metadata_json)
       VALUES ($1, 'provider_health', $2, $3, '{}')`;

  for (const { id: adminId } of recipients) {
    await pool.query(insertSql, [adminId, title, description]);
  }
}
|
|
|
|
/**
 * Send an admin notification without letting a notification failure escape.
 *
 * @param {string} title - Notification title.
 * @param {string} description - Notification body text.
 * @returns {Promise<void>} Always resolves; failures are logged.
 */
async function notifyAdminSafely(title, description) {
  try {
    await notifyAdmin(title, description);
  } catch (err) {
    console.error("[providerHealthMonitor] admin notification failed:", err.message);
  }
}

/**
 * Probe a single provider, refresh its cache entry and notify admins on state changes.
 *
 * @param {string} cacheKey - Key in providerHealthCache, also used as the circuit-breaker provider id.
 * @param {string} label - Human-readable provider name used in log lines and notifications.
 * @param {() => Promise<string|null>} loadKey - Resolves the provider API key, or null when none is stored.
 * @param {(apiKey: string) => Promise<{ok: boolean, status: string, error: string|null}>} probe
 *   Performs the actual probe request.
 * @returns {Promise<void>} Rejects only when `loadKey` rejects.
 */
async function checkProvider(cacheKey, label, loadKey, probe) {
  const apiKey = await loadKey();
  if (!apiKey) {
    providerHealthCache[cacheKey] = {
      status: "no_key",
      lastCheck: new Date().toISOString(),
      lastError: `No ${label} API key found in database`,
      details: null,
    };
    return;
  }

  const prev = providerHealthCache[cacheKey].status;
  const startedAt = Date.now();
  let result;
  try {
    result = await probe(apiKey);
    recordProbeOutcome(cacheKey, result, Date.now() - startedAt);
    providerHealthCache[cacheKey] = {
      status: result.status,
      lastCheck: new Date().toISOString(),
      lastError: result.error,
      details: result,
    };
  } catch (err) {
    // NOTE(review): every thrown probe error is labelled "timeout", including
    // non-timeout network errors; kept as-is because consumers may rely on the value.
    recordProviderFailure(cacheKey);
    providerHealthCache[cacheKey] = {
      status: "timeout",
      lastCheck: new Date().toISOString(),
      lastError: err.message,
      details: null,
    };
    console.error(`[providerHealthMonitor] ${label} probe failed:`, err.message);
    return;
  }

  // Notifications run outside the probe try/catch on purpose: a failing
  // notification insert must not be counted as a provider failure or
  // overwrite the probe result stored above.
  if (!result.ok) {
    console.error(`[providerHealthMonitor] ${label} unhealthy: ${result.error}`);
    // Only notify on state change (healthy/unknown -> unhealthy).
    if (prev === "healthy" || prev === "unknown") {
      await notifyAdminSafely(`${label} 服务异常`, result.error);
    }
    return;
  }

  // Recovery notification: the previous check did not report healthy/unknown.
  if (prev !== "healthy" && prev !== "unknown") {
    console.log(`[providerHealthMonitor] ${label} recovered`);
    await notifyAdminSafely(`${label} 服务恢复正常`, `${label} 已恢复正常可用状态`);
  }
}

/**
 * Run one full health-check pass.
 *
 * Steps, in order: probe DashScope, probe GrsAI, snapshot the circuit-breaker
 * statistics into the cache, then log a warning for every enabled admin whose
 * balance is below LOW_BALANCE_THRESHOLD.
 *
 * @returns {Promise<void>} Rejects if an API-key lookup fails; probe,
 *   notification and low-balance query failures are logged instead.
 */
async function runHealthCheck() {
  // Providers are checked one after another, DashScope first.
  await checkProvider("dashscope", "DashScope", getDashScopeKey, probeDashScope);
  await checkProvider("grsai", "GrsAI", getGrsaiKey, probeGrsai);

  // ── Circuit breaker summary ──
  providerHealthCache.circuitBreaker = getAllBreakerStats();

  // ── Admin low-balance alert ──
  try {
    const { rows } = await pool.query(
      "SELECT id, username, balance_cents FROM users WHERE role = 'admin' AND enabled = 1 AND balance_cents < $1",
      [LOW_BALANCE_THRESHOLD]
    );
    for (const user of rows) {
      console.warn(`[providerHealthMonitor] Admin ${user.username} balance low: ${user.balance_cents} cents`);
    }
  } catch (err) {
    // Best-effort check: report the failure instead of silently dropping it.
    console.error("[providerHealthMonitor] low-balance check failed:", err.message);
  }
}
|
|
|
|
/**
 * Start the periodic provider health monitor.
 *
 * Does nothing when a timer is already active. Otherwise it kicks off one
 * health check immediately (not awaited), schedules further checks every
 * CHECK_INTERVAL_MS and unrefs the timer when the runtime supports it.
 *
 * @returns {void}
 */
function startProviderHealthMonitor() {
  if (timerId) {
    return;
  }

  // Runs a check and logs a rejection, tagged with the phase that triggered it.
  const runAndReport = (phase) =>
    runHealthCheck().catch((err) => {
      console.error(`[providerHealthMonitor] ${phase} run failed:`, err.message);
    });

  runAndReport("initial");

  timerId = setInterval(() => {
    runAndReport("periodic");
  }, CHECK_INTERVAL_MS);

  // Let the process exit even while the interval is still scheduled.
  if (timerId.unref) {
    timerId.unref();
  }

  console.log(`[providerHealthMonitor] started (interval=${CHECK_INTERVAL_MS}ms)`);
}
|
|
|
|
/**
 * Stop the periodic provider health monitor.
 *
 * Clears the interval timer and resets the stored handle; a no-op when the
 * monitor is not running.
 *
 * @returns {void}
 */
function stopProviderHealthMonitor() {
  if (!timerId) {
    return;
  }

  clearInterval(timerId);
  timerId = null;
  console.log("[providerHealthMonitor] stopped");
}
|
|
|
|
/**
 * Expose the in-memory provider health cache.
 *
 * @returns {object} The live cache object itself (not a copy), so callers see
 *   later updates and must not mutate it.
 */
function getProviderHealthCache() {
  const liveCache = providerHealthCache;
  return liveCache;
}
|
|
|
|
module.exports = {
|
|
startProviderHealthMonitor,
|
|
stopProviderHealthMonitor,
|
|
getProviderHealthCache,
|
|
runHealthCheck,
|
|
};
|