Files
omniai-server/src/providerHealthMonitor.js
T

273 lines
8.7 KiB
JavaScript

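/**
* Provider Health Monitor — periodic health checks for DashScope and GrsAI.
*
* - Every 5 minutes, probes each provider with a minimal (max_tokens = 4) chat-completion call
* - If a probe reports a failure (e.g. DashScope Arrearage or access denied), logs an error and,
*   when the provider was previously healthy or unknown, inserts an admin notification;
*   a recovery notification is inserted when an unhealthy provider becomes healthy again
* - Tracks provider health status in-memory for the /api/admin/providers/status endpoint
*/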
const { pool } = require("./db");
const { recordProviderSuccess, recordProviderFailure, getAllBreakerStats } = require("./providerCircuitBreaker");
// Delay between periodic health checks: 5 minutes, expressed in milliseconds.
const CHECK_INTERVAL_MS = 5 * 60 * 1000;
// Model name sent in the DashScope probe request.
const DASHSCOPE_TEST_MODEL = "qwen-max";
// DashScope compatible-mode chat-completions endpoint that the probe POSTs to.
const DASHSCOPE_URL = "https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions";
const LOW_BALANCE_THRESHOLD = 500; // cents — a warning is logged for enabled admins whose balance_cents is below this
// Handle returned by setInterval while the monitor is running; null when it is stopped.
let timerId = null;
// In-memory health cache for the status endpoint.
// Each provider entry is replaced wholesale by runHealthCheck after every check;
// runHealthCheck also attaches a `circuitBreaker` key holding the breaker stats.
const providerHealthCache = {
dashscope: { status: "unknown", lastCheck: null, lastError: null, details: null },
grsai: { status: "unknown", lastCheck: null, lastError: null, details: null },
};
/**
 * Forward the outcome of a health probe to the provider circuit breaker.
 *
 * @param {string} provider - Provider identifier; a falsy value makes this a no-op.
 * @param {{ ok?: boolean }|null|undefined} result - Probe result; only `ok` is inspected.
 * @param {number} latencyMs - Probe duration, forwarded only when the probe succeeded.
 * @returns {void}
 */
function recordProbeOutcome(provider, result, latencyMs) {
  if (!provider) {
    return;
  }

  const probeSucceeded = Boolean(result?.ok);
  if (!probeSucceeded) {
    recordProviderFailure(provider);
    return;
  }

  recordProviderSuccess(provider, latencyMs);
}
/**
 * Look up the API key used for DashScope probes.
 *
 * Selects the lowest-id enabled row of `api_keys` whose provider name
 * contains "dashscope".
 *
 * @returns {Promise<string|null>} The stored key, or null when no row matches.
 */
async function getDashScopeKey() {
  const lookupSql =
    "SELECT id, api_key FROM api_keys WHERE provider LIKE '%dashscope%' AND enabled = 1 ORDER BY id LIMIT 1";
  const { rows: matches } = await pool.query(lookupSql);
  return matches.length > 0 ? matches[0].api_key : null;
}
/**
 * Look up the API key used for GrsAI probes.
 *
 * Selects the lowest-id enabled row of `api_keys` whose provider is exactly "grsai".
 *
 * @returns {Promise<string|null>} The stored key, or null when no row matches.
 */
async function getGrsaiKey() {
  const lookupSql =
    "SELECT id, api_key FROM api_keys WHERE provider = 'grsai' AND enabled = 1 ORDER BY id LIMIT 1";
  const { rows: matches } = await pool.query(lookupSql);
  return matches.length > 0 ? matches[0].api_key : null;
}
/**
 * Probe DashScope with a minimal chat-completion request and classify the outcome.
 *
 * Classification:
 * - HTTP 400/401/403 with error code "Arrearage"        → status "arrears"
 * - error code "AccessDenied", or any HTTP 401/403       → status "denied"
 * - any other HTTP 400                                   → status "error" (code from the body)
 * - any other non-2xx response                           → status "error" (code "http_error")
 * - 2xx                                                  → status "healthy"
 *
 * @param {string} apiKey - DashScope API key, sent as a Bearer token.
 * @returns {Promise<{ok: boolean, status: string, error: string|null, code?: string}>}
 *   Probe result; `code` is only present on failures.
 * @throws Rejects when the request itself fails (network error, or the 30 s
 *   abort timeout elapsing) or when the response body cannot be read.
 */
async function probeDashScope(apiKey) {
  const body = {
    model: DASHSCOPE_TEST_MODEL,
    messages: [{ role: "user", content: "ping" }],
    stream: false,
    max_tokens: 4,
    enable_thinking: false,
  };
  const res = await fetch(DASHSCOPE_URL, {
    method: "POST",
    headers: { "Content-Type": "application/json", Authorization: "Bearer " + apiKey },
    body: JSON.stringify(body),
    signal: AbortSignal.timeout(30000),
  });
  // Always read the body so the underlying connection is released.
  const text = await res.text();

  // 401 is included so that an invalid or revoked key is classified as an
  // access problem instead of falling through to the generic http_error path.
  const isClientAuthOrBillingStatus =
    res.status === 400 || res.status === 401 || res.status === 403;
  if (isClientAuthOrBillingStatus) {
    let errorCode = "";
    try {
      // JSON.parse can legally yield null or a primitive (e.g. body "null"),
      // so every step of the lookup is optional-chained.
      errorCode = JSON.parse(text)?.error?.code || "";
    } catch {
      // Non-JSON error body: classify on the HTTP status alone.
    }
    if (errorCode === "Arrearage") {
      return { ok: false, status: "arrears", error: "DashScope 账户欠费,所有 qwen 模型不可用", code: errorCode };
    }
    if (errorCode === "AccessDenied" || res.status === 401 || res.status === 403) {
      return { ok: false, status: "denied", error: "DashScope 访问被拒绝", code: errorCode };
    }
    return { ok: false, status: "error", error: `DashScope 返回 HTTP ${res.status}: ${errorCode}`, code: errorCode };
  }
  if (!res.ok) {
    return { ok: false, status: "error", error: `DashScope 返回 HTTP ${res.status}`, code: "http_error" };
  }
  return { ok: true, status: "healthy", error: null };
}
/**
 * Probe GrsAI with a minimal chat-completion request.
 *
 * @param {string} apiKey - GrsAI API key, sent as a Bearer token.
 * @returns {Promise<{ok: boolean, status: string, error: string|null}>}
 *   `{ ok: true, status: "healthy" }` for a 2xx response; otherwise
 *   `status: "error"` with the HTTP status and up to 200 characters of the body.
 * @throws Rejects when the request itself fails (network error or the 30 s
 *   abort timeout elapsing).
 */
async function probeGrsai(apiKey) {
  // GrsAI uses the same OpenAI-compatible endpoint
  const GRSAI_BASE = "https://grsai.dakka.com.cn";
  const url = `${GRSAI_BASE}/v1/chat/completions`;
  const body = {
    model: "gemini-3.1-pro",
    messages: [{ role: "user", content: "ping" }],
    stream: false,
    max_tokens: 4,
  };
  const res = await fetch(url, {
    method: "POST",
    headers: { "Content-Type": "application/json", Authorization: "Bearer " + apiKey },
    body: JSON.stringify(body),
    signal: AbortSignal.timeout(30000),
  });
  if (!res.ok) {
    const errText = await res.text().catch(() => "");
    return { ok: false, status: "error", error: `GrsAI 返回 HTTP ${res.status}: ${errText.slice(0, 200)}` };
  }
  // The payload is irrelevant on success, but an unread body keeps the
  // connection checked out until garbage collection. Cancel it explicitly;
  // this is best-effort and must not turn a healthy probe into a failure.
  try {
    await res.body?.cancel();
  } catch {
    // Ignore: the probe outcome has already been decided by the status code.
  }
  return { ok: true, status: "healthy", error: null };
}
/**
 * Insert a `provider_health` web notification for every enabled admin user.
 *
 * Rows are inserted one admin at a time, in the order the lookup returned them.
 * When no enabled admin exists, an error is logged and nothing is inserted.
 *
 * @param {string} title - Notification title.
 * @param {string} description - Notification body text.
 * @returns {Promise<void>}
 */
async function notifyAdmin(title, description) {
  // Resolve the recipients first.
  const adminLookup = await pool.query(
    "SELECT id FROM users WHERE role = 'admin' AND enabled = 1"
  );
  const recipients = adminLookup.rows;
  if (recipients.length === 0) {
    console.error("[providerHealthMonitor] No admin users found for notification");
    return;
  }

  const insertSql = [
    "INSERT INTO web_notifications (user_id, type, title, description, metadata_json)",
    "VALUES ($1, 'provider_health', $2, $3, '{}')",
  ].join("\n");
  for (const recipient of recipients) {
    await pool.query(insertSql, [recipient.id, title, description]);
  }
}
/**
 * Best-effort wrapper around notifyAdmin: a failure to store the notification
 * is logged and never propagated, so it cannot be mistaken for a provider failure.
 *
 * @param {string} title - Notification title.
 * @param {string} description - Notification body text.
 * @returns {Promise<void>}
 */
async function notifyAdminSafely(title, description) {
  try {
    await notifyAdmin(title, description);
  } catch (err) {
    console.error("[providerHealthMonitor] failed to notify admins:", err?.message);
  }
}

/**
 * Run one health check for a single provider and update its cache entry.
 *
 * @param {object} options
 * @param {string} options.cacheKey - Key in providerHealthCache, also used as the circuit-breaker provider name.
 * @param {string} options.label - Human-readable provider name used in logs and notifications.
 * @param {() => Promise<string|null>} options.getKey - Resolves the provider API key (null when none is configured).
 * @param {(apiKey: string) => Promise<{ok: boolean, status: string, error: string|null}>} options.probe - Performs the probe.
 * @returns {Promise<void>}
 */
async function checkProvider({ cacheKey, label, getKey, probe }) {
  const apiKey = await getKey();
  if (!apiKey) {
    providerHealthCache[cacheKey] = {
      status: "no_key",
      lastCheck: new Date().toISOString(),
      lastError: `No ${label} API key found in database`,
      details: null,
    };
    return;
  }

  const startedAt = Date.now();
  let result;
  let previousStatus;
  try {
    result = await probe(apiKey);
    recordProbeOutcome(cacheKey, result, Date.now() - startedAt);
    previousStatus = providerHealthCache[cacheKey].status;
    providerHealthCache[cacheKey] = {
      status: result.status,
      lastCheck: new Date().toISOString(),
      lastError: result.error,
      details: result,
    };
  } catch (err) {
    // The probe threw (network error, abort timeout, ...). This is cached as
    // "timeout" and counted as a breaker failure; no notification is sent here.
    recordProviderFailure(cacheKey);
    providerHealthCache[cacheKey] = {
      status: "timeout",
      lastCheck: new Date().toISOString(),
      lastError: err.message,
      details: null,
    };
    console.error(`[providerHealthMonitor] ${label} probe failed:`, err.message);
    return;
  }

  // Notifications run outside the probe try/catch on purpose: a failed insert
  // must not overwrite the fresh probe result or count as a provider failure.
  const wasHealthyOrUnknown = previousStatus === "healthy" || previousStatus === "unknown";
  if (!result.ok) {
    console.error(`[providerHealthMonitor] ${label} unhealthy: ${result.error}`);
    // Only notify on state change (healthy → unhealthy)
    if (wasHealthyOrUnknown) {
      await notifyAdminSafely(`${label} 服务异常`, result.error);
    }
    return;
  }
  // Recovery notification
  if (!wasHealthyOrUnknown) {
    console.log(`[providerHealthMonitor] ${label} recovered`);
    await notifyAdminSafely(`${label} 服务恢复正常`, `${label} 已恢复正常可用状态`);
  }
}

/**
 * Run one full health-check cycle.
 *
 * Steps, in order:
 * 1. Check DashScope, then GrsAI (sequentially), updating providerHealthCache
 *    and notifying admins on healthy/unknown → unhealthy and unhealthy → healthy transitions.
 * 2. Store the current circuit-breaker stats under providerHealthCache.circuitBreaker.
 * 3. Log a warning for every enabled admin whose balance is below LOW_BALANCE_THRESHOLD.
 *
 * @returns {Promise<void>}
 * @throws Rejects if an API-key lookup fails; probe, notification and
 *   low-balance query failures are handled internally.
 */
async function runHealthCheck() {
  // ── DashScope ──
  await checkProvider({
    cacheKey: "dashscope",
    label: "DashScope",
    getKey: getDashScopeKey,
    probe: probeDashScope,
  });

  // ── GrsAI ──
  await checkProvider({
    cacheKey: "grsai",
    label: "GrsAI",
    getKey: getGrsaiKey,
    probe: probeGrsai,
  });

  // ── Circuit breaker summary ──
  providerHealthCache.circuitBreaker = getAllBreakerStats();

  // ── Admin low-balance alert ──
  try {
    const { rows } = await pool.query(
      "SELECT id, username, balance_cents FROM users WHERE role = 'admin' AND enabled = 1 AND balance_cents < $1",
      [LOW_BALANCE_THRESHOLD]
    );
    for (const user of rows) {
      console.warn(`[providerHealthMonitor] Admin ${user.username} balance low: ${user.balance_cents} cents`);
    }
  } catch (err) {
    // Still best-effort, but no longer silent: a broken query should be visible in logs.
    console.error("[providerHealthMonitor] low-balance check failed:", err?.message);
  }
}
/**
 * Start the periodic provider health monitor.
 *
 * Does nothing when a timer is already registered. Otherwise it kicks off one
 * health check immediately (not awaited) and schedules another every
 * CHECK_INTERVAL_MS. The timer is unref'd when supported, so it does not by
 * itself keep the process alive.
 *
 * @returns {void}
 */
function startProviderHealthMonitor() {
  if (timerId) {
    return;
  }

  // Shared runner: failures are logged with the phase they happened in.
  const runAndReport = (phase) => {
    runHealthCheck().catch((err) => {
      console.error(`[providerHealthMonitor] ${phase} run failed:`, err.message);
    });
  };

  runAndReport("initial");
  timerId = setInterval(() => {
    runAndReport("periodic");
  }, CHECK_INTERVAL_MS);
  if (timerId.unref) {
    timerId.unref();
  }
  console.log(`[providerHealthMonitor] started (interval=${CHECK_INTERVAL_MS}ms)`);
}
/**
 * Stop the periodic provider health monitor.
 *
 * Clears the interval and resets the stored handle; a no-op when the monitor
 * is not running.
 *
 * @returns {void}
 */
function stopProviderHealthMonitor() {
  if (!timerId) {
    return;
  }
  clearInterval(timerId);
  timerId = null;
  console.log("[providerHealthMonitor] stopped");
}
/**
 * Return the module's in-memory provider health cache.
 *
 * The shared object itself is returned (no copy is made), so callers observe
 * later updates and any mutation they perform affects the cache.
 *
 * @returns {object} The shared `providerHealthCache` object.
 */
function getProviderHealthCache() {
return providerHealthCache;
}
// Public API: monitor lifecycle (start/stop), read access to the health cache,
// and runHealthCheck for running a single check cycle directly.
module.exports = {
startProviderHealthMonitor,
stopProviderHealthMonitor,
getProviderHealthCache,
runHealthCheck,
};