Compare commits
2 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 9436036d65 | |||
| ea91155f9e |
+339
-264
@@ -5,8 +5,6 @@ const { EventEmitter } = require("node:events");
|
||||
const { pool } = require("./db");
|
||||
const { refundTaskBillingOnFailure } = require("./billing");
|
||||
const { putObject, isOssConfigured } = require("./ossClient");
|
||||
const keyManager = require("./keyManager");
|
||||
const { resolveImageProviderCandidates, resolveVideoProvider } = require("./aiProviderRouter");
|
||||
|
||||
const taskEvents = new EventEmitter();
|
||||
taskEvents.setMaxListeners(200);
|
||||
@@ -15,10 +13,15 @@ const activePollers = new Map();
|
||||
const POLL_INTERVAL_MS = 3000;
|
||||
const MAX_POLL_ATTEMPTS = 120;
|
||||
const GRS_IMAGE_MAX_POLL_ATTEMPTS = Number(process.env.GRSAI_IMAGE_MAX_POLL_ATTEMPTS || 60);
|
||||
const STALE_TASK_TIMEOUT_MINUTES = Math.max(10, Number(process.env.STALE_GENERATION_TASK_MINUTES || 120));
|
||||
const RESULT_PERSIST_RETRY_LIMIT = Math.max(1, Number(process.env.RESULT_PERSIST_RETRY_LIMIT || 5));
|
||||
const RESULT_PERSIST_RETRY_BATCH_SIZE = Math.max(1, Number(process.env.RESULT_PERSIST_RETRY_BATCH_SIZE || 25));
|
||||
const TASK_STARTUP_RECOVERY_LIMIT = Math.max(1, Number(process.env.TASK_STARTUP_RECOVERY_LIMIT || 50));
|
||||
const TASK_EVENT_CHANNEL = "generation_task_events";
|
||||
const TASK_EVENT_ORIGIN = `${process.pid}-${crypto.randomUUID()}`;
|
||||
const POLLER_OWNER_ID = `${process.pid}-${crypto.randomUUID()}`;
|
||||
const POLLER_OWNER_STALE_MS = Number(process.env.TASK_POLLER_OWNER_STALE_MS || 20_000);
|
||||
const POLLER_RECOVERY_INTERVAL_MS = Number(process.env.TASK_POLLER_RECOVERY_INTERVAL_MS || 30_000);
|
||||
let taskEventListenerClient = null;
|
||||
let taskEventListenerStarting = null;
|
||||
let pollerStoreReady = null;
|
||||
let pollerRecoveryTimer = null;
|
||||
|
||||
function normalizeTaskProgress(value) {
|
||||
const numeric = Number(value);
|
||||
@@ -36,6 +39,156 @@ function formatTaskEvent(row) {
|
||||
};
|
||||
}
|
||||
|
||||
/**
 * Delivers a task event to in-process listeners on the `task:<taskId>` channel
 * of the module-level `taskEvents` emitter.
 *
 * Events that are nullish or carry a falsy `taskId` are ignored.
 *
 * @param {{ taskId: (number|string) }} event - Task event payload.
 * @returns {void}
 */
function emitTaskEvent(event) {
  if (!event?.taskId) return;
  taskEvents.emit(`task:${event.taskId}`, event);
}
|
||||
|
||||
/**
 * Publishes a task event both locally and through PostgreSQL `pg_notify`.
 *
 * The event is first emitted in-process via `emitTaskEvent`, then sent on
 * `TASK_EVENT_CHANNEL` wrapped as `{ origin, event }`. The `origin` field
 * carries `TASK_EVENT_ORIGIN` so receivers can identify the sender.
 * A failed notify is logged and never rethrown (best effort).
 *
 * @param {{ taskId: (number|string) }} event - Task event payload.
 * @returns {Promise<void>}
 */
async function publishTaskEvent(event) {
  if (!event?.taskId) return;

  // Local listeners are served before the database round trip.
  emitTaskEvent(event);

  try {
    await pool.query("SELECT pg_notify($1, $2)", [
      TASK_EVENT_CHANNEL,
      JSON.stringify({ origin: TASK_EVENT_ORIGIN, event }),
    ]);
  } catch (err) {
    console.error(`[aiTaskWorker] task event publish failed for task ${event.taskId}:`, err.message);
  }
}
|
||||
|
||||
/**
 * Reduces a provider configuration to an allow-listed set of fields.
 *
 * Only the keys listed in `allowedKeys` are copied, and only when their value
 * is not `undefined`; every other property of the input is dropped.
 *
 * @param {object} providerConfig - Provider configuration to filter.
 * @returns {object} A new object with the allow-listed fields, or `{}` when
 *   the input is falsy or not an object.
 */
function serializeProviderConfig(providerConfig) {
  if (!providerConfig || typeof providerConfig !== "object") return {};

  const allowedKeys = [
    "provider",
    "transport",
    "protocol",
    "baseUrl",
    "endpoint",
    "resultEndpoint",
    "model",
    "requestedModel",
  ];

  const result = {};
  for (const key of allowedKeys) {
    if (providerConfig[key] !== undefined) result[key] = providerConfig[key];
  }
  return result;
}
|
||||
|
||||
/**
 * Parses a stored provider configuration into a plain object.
 *
 * Accepts either an already-parsed object, which is returned as-is, or a JSON
 * string. Anything that is falsy, unparseable, or does not decode to a
 * non-array object yields `{}`, so callers can read properties without checks.
 *
 * @param {string|object|null|undefined} value - Raw stored configuration.
 * @returns {object} The parsed configuration, or `{}` as a fallback.
 */
function parseProviderConfig(value) {
  // Arrays are objects too, but they are never a valid provider configuration.
  const isRecord = (candidate) =>
    Boolean(candidate) && typeof candidate === "object" && !Array.isArray(candidate);

  if (!value) return {};
  if (typeof value === "object") return isRecord(value) ? value : {};

  try {
    const parsed = JSON.parse(value);
    return isRecord(parsed) ? parsed : {};
  } catch {
    return {};
  }
}
|
||||
|
||||
/**
 * Ensures the `generation_task_pollers` table and its heartbeat index exist.
 *
 * The DDL promise is memoized in the module-level `pollerStoreReady`, so the
 * statements are issued at most once per successful run. When the query fails
 * the memo is cleared, letting the next call retry, and the error is rethrown.
 *
 * @returns {Promise<*>} Resolves once the schema statements have completed.
 */
async function ensureTaskPollerStore() {
  if (pollerStoreReady) return pollerStoreReady;

  pollerStoreReady = pool.query(`
    CREATE TABLE IF NOT EXISTS generation_task_pollers (
      task_id INTEGER PRIMARY KEY REFERENCES generation_tasks(id) ON DELETE CASCADE,
      provider_task_id TEXT NOT NULL,
      task_type TEXT NOT NULL,
      provider_config_json TEXT NOT NULL,
      lease_token TEXT,
      owner_id TEXT,
      owner_heartbeat_at TIMESTAMPTZ,
      created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
      updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
    );
    CREATE INDEX IF NOT EXISTS idx_generation_task_pollers_owner
      ON generation_task_pollers(owner_heartbeat_at);
  `).catch((err) => {
    // Drop the failed memo so a later call can try the DDL again.
    pollerStoreReady = null;
    throw err;
  });

  return pollerStoreReady;
}
|
||||
|
||||
/**
 * Upserts the poller row for a task so polling can be resumed later.
 *
 * Stores the provider task id, task type, the allow-listed provider config
 * (via `serializeProviderConfig`) and the lease token. It also stamps this
 * process (`POLLER_OWNER_ID`) as owner with a fresh heartbeat. An existing row
 * for the same `task_id` is overwritten.
 *
 * @param {number} taskDbId - Primary key of the generation task.
 * @param {object} state
 * @param {string} state.providerTaskId - Task id on the provider side.
 * @param {string} state.type - Task type stored in `task_type`.
 * @param {object} state.providerConfig - Provider configuration to filter and store.
 * @param {string} [state.leaseToken] - Lease token; falsy values are stored as NULL.
 * @returns {Promise<void>}
 */
async function persistPollerState(taskDbId, { providerTaskId, type, providerConfig, leaseToken }) {
  await ensureTaskPollerStore();
  await pool.query(
    `
      INSERT INTO generation_task_pollers (
        task_id, provider_task_id, task_type, provider_config_json, lease_token,
        owner_id, owner_heartbeat_at, updated_at
      )
      VALUES ($1, $2, $3, $4, $5, $6, NOW(), NOW())
      ON CONFLICT (task_id) DO UPDATE SET
        provider_task_id = EXCLUDED.provider_task_id,
        task_type = EXCLUDED.task_type,
        provider_config_json = EXCLUDED.provider_config_json,
        lease_token = EXCLUDED.lease_token,
        owner_id = EXCLUDED.owner_id,
        owner_heartbeat_at = NOW(),
        updated_at = NOW()
    `,
    [
      taskDbId,
      providerTaskId,
      type,
      JSON.stringify(serializeProviderConfig(providerConfig)),
      leaseToken || null,
      POLLER_OWNER_ID,
    ],
  );
}
|
||||
|
||||
/**
 * Refreshes the ownership heartbeat of a task's poller row.
 *
 * Sets `owner_id` to this process (`POLLER_OWNER_ID`) and bumps
 * `owner_heartbeat_at` / `updated_at` to the current time.
 *
 * NOTE(review): the update is not conditioned on the current `owner_id`, so a
 * heartbeat from this process takes ownership back even if another process
 * claimed the row in the meantime — confirm this is intended.
 *
 * @param {number} taskDbId - Primary key of the generation task.
 * @returns {Promise<void>}
 */
async function refreshPollerHeartbeat(taskDbId) {
  await ensureTaskPollerStore();
  await pool.query(
    "UPDATE generation_task_pollers SET owner_id = $1, owner_heartbeat_at = NOW(), updated_at = NOW() WHERE task_id = $2",
    [POLLER_OWNER_ID, taskDbId],
  );
}
|
||||
|
||||
/**
 * Deletes the persisted poller row for a task.
 *
 * @param {number} taskDbId - Primary key of the generation task.
 * @returns {Promise<void>}
 */
async function clearPollerState(taskDbId) {
  await ensureTaskPollerStore();
  await pool.query("DELETE FROM generation_task_pollers WHERE task_id = $1", [taskDbId]);
}
|
||||
|
||||
/**
 * Looks up the API key behind an active (unreleased) lease.
 *
 * Only leases with `released_at IS NULL` whose key has `enabled = 1` match.
 *
 * @param {string} leaseToken - Lease token to resolve.
 * @returns {Promise<string|null>} The API key; `""` when the stored key is the
 *   `"pool-slot"` placeholder; `null` when the token is falsy or no matching
 *   lease/key exists.
 */
async function getLeaseKey(leaseToken) {
  if (!leaseToken) return null;

  const { rows } = await pool.query(
    `
      SELECT k.api_key
      FROM key_leases l
      JOIN api_keys k ON k.id = l.key_id
      WHERE l.lease_token = $1
        AND l.released_at IS NULL
        AND k.enabled = 1
      LIMIT 1
    `,
    [leaseToken],
  );

  const apiKey = rows[0]?.api_key;
  // "pool-slot" is a placeholder value; callers receive an empty key for it.
  return apiKey === "pool-slot" ? "" : apiKey || null;
}
|
||||
|
||||
/**
 * Attempts to take ownership of a task's poller row for this process.
 *
 * The claim succeeds only when the row has no heartbeat or its heartbeat is
 * older than the stale threshold (`POLLER_OWNER_STALE_MS`, minimum 5 seconds).
 *
 * @param {number} taskId - Primary key of the generation task.
 * @returns {Promise<object|null>} The claimed poller row, or `null` when the
 *   row does not exist or is still owned by a live poller.
 */
async function claimPoller(taskId) {
  await ensureTaskPollerStore();

  // A non-numeric env value makes the configured threshold NaN, which would
  // render as "NaN seconds" and fail the interval cast; use the 20 s default.
  const staleMs = Number.isFinite(POLLER_OWNER_STALE_MS) ? POLLER_OWNER_STALE_MS : 20_000;
  const staleInterval = `${Math.max(5, Math.ceil(staleMs / 1000))} seconds`;

  const { rows } = await pool.query(
    `
      UPDATE generation_task_pollers
      SET owner_id = $1, owner_heartbeat_at = NOW(), updated_at = NOW()
      WHERE task_id = $2
        AND (
          owner_heartbeat_at IS NULL
          OR owner_heartbeat_at < NOW() - ($3::text)::interval
        )
      RETURNING *
    `,
    [POLLER_OWNER_ID, taskId, staleInterval],
  );
  return rows[0] || null;
}
|
||||
|
||||
async function createTaskLifecycleNotification(task) {
|
||||
if (!task || !task.user_id || !task.id) return;
|
||||
|
||||
@@ -98,15 +251,14 @@ async function updateTaskInDb(taskId, updates) {
|
||||
|
||||
if (fields.length === 0) return;
|
||||
values.push(taskId);
|
||||
const protectCancelled = nextUpdates.status !== "cancelled" ? " AND status <> 'cancelled'" : "";
|
||||
const { rows } = await pool.query(
|
||||
`UPDATE generation_tasks SET ${fields.join(", ")} WHERE id = $${idx}${protectCancelled} RETURNING *`,
|
||||
`UPDATE generation_tasks SET ${fields.join(", ")} WHERE id = $${idx} RETURNING *`,
|
||||
values,
|
||||
);
|
||||
let updatedTask = rows[0];
|
||||
|
||||
if (updatedTask) {
|
||||
taskEvents.emit(`task:${taskId}`, formatTaskEvent(updatedTask));
|
||||
await publishTaskEvent(formatTaskEvent(updatedTask));
|
||||
}
|
||||
|
||||
if (nextUpdates.status === "completed" && updatedTask?.result_url) {
|
||||
@@ -131,66 +283,20 @@ function persistTaskResultUrlToOssInBackground(task) {
|
||||
|
||||
Promise.resolve()
|
||||
.then(async () => {
|
||||
await persistTaskResultUrlToOss(task);
|
||||
const durableUrl = await persistResultUrlToOss(task);
|
||||
if (!durableUrl || durableUrl === task.result_url) return;
|
||||
|
||||
await pool.query(
|
||||
"UPDATE generation_tasks SET result_url = $1, updated_at = NOW() WHERE id = $2 AND result_url = $3",
|
||||
[durableUrl, task.id, task.result_url],
|
||||
);
|
||||
console.info(`[aiTaskWorker] task ${task.id} result persisted to OSS after completion`);
|
||||
})
|
||||
.catch((error) => {
|
||||
console.warn(`[aiTaskWorker] background result persistence failed for task ${task.id}:`, error.message);
|
||||
});
|
||||
}
|
||||
|
||||
/**
 * Records the outcome of one attempt to persist a task's result URL.
 *
 * Always sets `result_persist_status` and `result_persist_error`, increments
 * `result_persist_attempts` and bumps `updated_at`. On `"succeeded"` it also
 * stamps `result_persisted_at`. When `durableUrl` is given, `result_url` is
 * replaced; when `previousUrl` is given, the update only applies if the row
 * still holds that URL.
 *
 * @param {number} taskId - Primary key of the generation task.
 * @param {string} status - New persistence status.
 * @param {*} [error=null] - Error detail; stringified and cut to 1000 characters.
 * @param {string|null} [durableUrl=null] - Replacement result URL, if any.
 * @param {string|null} [previousUrl=null] - URL the row must still contain.
 * @returns {Promise<void>}
 */
async function markResultPersistence(taskId, status, error = null, durableUrl = null, previousUrl = null) {
  const fields = [
    "result_persist_status = $1",
    "result_persist_attempts = result_persist_attempts + 1",
    "result_persist_error = $2",
    "updated_at = NOW()",
  ];
  const values = [status, error ? String(error).slice(0, 1000) : null];
  // Next free positional placeholder ($3 after the two fixed values).
  let idx = values.length + 1;

  if (status === "succeeded") {
    fields.push("result_persisted_at = NOW()");
  }
  if (durableUrl) {
    fields.push(`result_url = $${idx++}`);
    values.push(durableUrl);
  }

  values.push(taskId);
  let where = `id = $${idx}`;
  if (previousUrl) {
    idx += 1;
    values.push(previousUrl);
    where += ` AND result_url = $${idx}`;
  }

  await pool.query(`UPDATE generation_tasks SET ${fields.join(", ")} WHERE ${where}`, values);
}
|
||||
|
||||
/**
 * Copies a task's result URL to OSS and records the outcome on the task row.
 *
 * - A URL already recognised by `isOwnPersistedResultUrl` is marked succeeded.
 * - When OSS is not configured, or the copy yields no URL, the attempt is
 *   marked failed and `null` is returned.
 * - If the copy throws, the attempt is marked failed as well (so the attempt
 *   counter still advances) and the original error is rethrown.
 *
 * @param {{ id: number, result_url: string }} task - Task row to persist.
 * @returns {Promise<string|null>} The durable URL, or `null` when nothing was persisted.
 */
async function persistTaskResultUrlToOss(task) {
  if (!task?.id || !task?.result_url) return null;

  if (isOwnPersistedResultUrl(task.result_url)) {
    await markResultPersistence(task.id, "succeeded", null, null);
    return task.result_url;
  }

  if (!isOssConfigured()) {
    await markResultPersistence(task.id, "failed", "OSS is not configured");
    return null;
  }

  let durableUrl;
  try {
    durableUrl = await persistResultUrlToOss(task);
  } catch (err) {
    // Record the failed attempt before propagating; without this the attempt
    // counter never moves and the same row would be retried indefinitely.
    await markResultPersistence(task.id, "failed", err?.message || String(err)).catch((markErr) => {
      console.warn(`[aiTaskWorker] failed to record persistence error for task ${task.id}:`, markErr.message);
    });
    throw err;
  }

  if (!durableUrl) {
    await markResultPersistence(task.id, "failed", "Result URL could not be copied to OSS");
    return null;
  }

  await markResultPersistence(task.id, "succeeded", null, durableUrl, task.result_url);
  console.info(`[aiTaskWorker] task ${task.id} result persisted to OSS after completion`);
  return durableUrl;
}
|
||||
|
||||
/**
 * Returns `value` when it is a non-null, non-array object; otherwise `undefined`.
 *
 * @param {*} value - Candidate value.
 * @returns {object|undefined}
 */
function asObject(value) {
  return value && typeof value === "object" && !Array.isArray(value) ? value : undefined;
}
|
||||
@@ -689,27 +795,22 @@ function getMaxPollAttempts(type, providerConfig) {
|
||||
return MAX_POLL_ATTEMPTS;
|
||||
}
|
||||
|
||||
/**
 * Releases the key lease held by a poller, if it has one.
 *
 * A rejected `releaseKey` call is logged as a warning and not rethrown.
 *
 * @param {{ leaseToken?: string, keyManager?: object, taskDbId?: number }} poller
 *   Poller entry; nothing happens when the lease token or key manager is missing.
 * @returns {Promise<void>}
 */
async function releasePollingLease(poller) {
  if (!poller?.leaseToken || !poller?.keyManager) return;

  await poller.keyManager.releaseKey(poller.leaseToken).catch((err) => {
    console.warn(`[aiTaskWorker] release lease failed for task ${poller.taskDbId}:`, err.message);
  });
}
|
||||
|
||||
function startPolling(taskDbId, { providerTaskId, apiKey, type, providerConfig, leaseToken, keyManager, onTaskFailed }) {
|
||||
function startPolling(taskDbId, { providerTaskId, apiKey, type, providerConfig, leaseToken, keyManager, onTaskFailed, skipPersist = false }) {
|
||||
if (activePollers.has(taskDbId)) return;
|
||||
if (!skipPersist) {
|
||||
persistPollerState(taskDbId, { providerTaskId, type, providerConfig, leaseToken }).catch((err) => {
|
||||
console.error(`[aiTaskWorker] failed to persist poller state for task ${taskDbId}:`, err.message);
|
||||
});
|
||||
}
|
||||
|
||||
let attempts = 0;
|
||||
let polling = false;
|
||||
const maxPollAttempts = getMaxPollAttempts(type, providerConfig);
|
||||
const interval = setInterval(async () => {
|
||||
if (polling) return;
|
||||
polling = true;
|
||||
attempts++;
|
||||
|
||||
try {
|
||||
if (attempts > maxPollAttempts) {
|
||||
await stopPolling(taskDbId, { releaseLease: true });
|
||||
clearInterval(interval);
|
||||
activePollers.delete(taskDbId);
|
||||
if (leaseToken && keyManager) await keyManager.releaseKey(leaseToken).catch(() => {});
|
||||
if (typeof onTaskFailed === "function") {
|
||||
const handled = await onTaskFailed("Task timed out").catch((fallbackErr) => {
|
||||
console.error(`[aiTaskWorker] fallback error for task ${taskDbId}:`, fallbackErr.message);
|
||||
@@ -718,14 +819,21 @@ function startPolling(taskDbId, { providerTaskId, apiKey, type, providerConfig,
|
||||
if (handled) return;
|
||||
}
|
||||
await updateTaskInDb(taskDbId, { status: "failed", error: "Task timed out" });
|
||||
await clearPollerState(taskDbId).catch(() => {});
|
||||
return;
|
||||
}
|
||||
|
||||
try {
|
||||
// Check if task was cancelled by user
|
||||
const { rows: [taskRow] } = await pool.query("SELECT status FROM generation_tasks WHERE id = $1", [taskDbId]);
|
||||
if (!taskRow || taskRow.status === "cancelled") {
|
||||
await stopPolling(taskDbId, { releaseLease: true });
|
||||
clearInterval(interval);
|
||||
activePollers.delete(taskDbId);
|
||||
await clearPollerState(taskDbId).catch(() => {});
|
||||
if (leaseToken && keyManager) await keyManager.releaseKey(leaseToken).catch(() => {});
|
||||
return;
|
||||
}
|
||||
await refreshPollerHeartbeat(taskDbId).catch(() => {});
|
||||
|
||||
let result;
|
||||
if (type === "image") {
|
||||
@@ -739,7 +847,9 @@ function startPolling(taskDbId, { providerTaskId, apiKey, type, providerConfig,
|
||||
}
|
||||
|
||||
if (result.status === "completed" || result.status === "failed") {
|
||||
await stopPolling(taskDbId, { releaseLease: true });
|
||||
clearInterval(interval);
|
||||
activePollers.delete(taskDbId);
|
||||
if (leaseToken && keyManager) await keyManager.releaseKey(leaseToken).catch(() => {});
|
||||
if (result.status === "failed" && typeof onTaskFailed === "function") {
|
||||
const handled = await onTaskFailed(result.error || "Task failed").catch((fallbackErr) => {
|
||||
console.error(`[aiTaskWorker] fallback error for task ${taskDbId}:`, fallbackErr.message);
|
||||
@@ -750,228 +860,179 @@ function startPolling(taskDbId, { providerTaskId, apiKey, type, providerConfig,
|
||||
}
|
||||
|
||||
await updateTaskInDb(taskDbId, result);
|
||||
if (result.status === "completed" || result.status === "failed") {
|
||||
await clearPollerState(taskDbId).catch(() => {});
|
||||
}
|
||||
} catch (err) {
|
||||
console.error(`[aiTaskWorker] poll error for task ${taskDbId}:`, err.message);
|
||||
} finally {
|
||||
polling = false;
|
||||
}
|
||||
}, POLL_INTERVAL_MS);
|
||||
|
||||
activePollers.set(taskDbId, { taskDbId, interval, leaseToken, keyManager });
|
||||
activePollers.set(taskDbId, { interval, leaseToken });
|
||||
}
|
||||
|
||||
async function stopPolling(taskDbId, options = {}) {
|
||||
function stopPolling(taskDbId) {
|
||||
const poller = activePollers.get(taskDbId);
|
||||
if (!poller) return;
|
||||
|
||||
if (poller) {
|
||||
clearInterval(poller.interval);
|
||||
activePollers.delete(taskDbId);
|
||||
if (options.releaseLease) {
|
||||
await releasePollingLease(poller);
|
||||
}
|
||||
}
|
||||
|
||||
async function cancelTask(taskId, userId) {
|
||||
const { rows } = await pool.query(
|
||||
`UPDATE generation_tasks
|
||||
SET status = 'cancelled', completed_at = NOW(), updated_at = NOW()
|
||||
WHERE id = $1 AND user_id = $2 AND status IN ('pending', 'running')
|
||||
RETURNING *`,
|
||||
[taskId, userId],
|
||||
);
|
||||
const task = rows[0];
|
||||
if (!task) return null;
|
||||
|
||||
await stopPolling(task.id, { releaseLease: true });
|
||||
taskEvents.emit(`task:${task.id}`, formatTaskEvent(task));
|
||||
return task;
|
||||
clearPollerState(taskDbId).catch(() => {});
|
||||
}
|
||||
|
||||
/**
 * Returns the number of entries currently held in the `activePollers` map.
 *
 * @returns {number}
 */
function getActiveCount() {
  return activePollers.size;
}
|
||||
|
||||
const STALE_TASK_CLEANUP_INTERVAL_MS = 5 * 60 * 1000;
|
||||
let staleTaskCleanupTimer = null;
|
||||
const TASK_RESULT_PERSIST_RETRY_INTERVAL_MS = 5 * 60 * 1000;
|
||||
let taskResultPersistenceRetryTimer = null;
|
||||
let taskStartupRecoveryTimer = null;
|
||||
let taskStaleCleanupRunning = false;
|
||||
let taskResultPersistenceRetryRunning = false;
|
||||
let taskStartupRecoveryRunning = false;
|
||||
|
||||
/**
 * Parses a task's stored parameters into a plain object.
 *
 * Accepts an already-parsed object, which is returned as-is, or a JSON string.
 * Falsy, unparseable, non-object or array input yields `{}`, so callers can
 * read fields such as `model` or `operation` without extra checks.
 *
 * @param {string|object|null|undefined} paramsJson - Raw stored parameters.
 * @returns {object} The parsed parameters, or `{}` as a fallback.
 */
function parseTaskParams(paramsJson) {
  // Arrays pass a bare typeof check but are not a valid parameter record.
  const isRecord = (candidate) =>
    Boolean(candidate) && typeof candidate === "object" && !Array.isArray(candidate);

  if (!paramsJson) return {};
  if (typeof paramsJson === "object") return isRecord(paramsJson) ? paramsJson : {};

  try {
    const parsed = JSON.parse(paramsJson);
    return isRecord(parsed) ? parsed : {};
  } catch {
    return {};
  }
}
|
||||
|
||||
/**
 * Derives the provider configuration to use when resuming polling for a task.
 *
 * The decision is based on `task.type` and the `model` / `operation` fields of
 * the task's stored parameters:
 * - video: two fixed DashScope cases, two Aliyun models that are not
 *   recoverable here (`null`), otherwise `resolveVideoProvider(model)`;
 * - image: a fixed DashScope config for super-resolution / edit operations,
 *   otherwise the first candidate from `resolveImageProviderCandidates(model)`;
 * - any other type: `null`.
 *
 * @param {{ type: string, params_json: (string|object) }} task - Task row.
 * @returns {object|null} Provider configuration, or `null` when none applies.
 */
function resolveProviderConfigForRecovery(task) {
  const params = parseTaskParams(task.params_json);

  if (task.type === "video") {
    if (params.model === "video-style-transform" || params.operation === "video-style-super-resolution") {
      return { provider: "dashscope", protocol: "wan-i2v", baseUrl: "https://dashscope.aliyuncs.com" };
    }
    if (params.model === "aliyun-video-super-resolve" || params.model === "aliyun-erase-subtitles") {
      return null;
    }
    return resolveVideoProvider(params.model);
  }

  if (task.type === "image") {
    if (params.operation === "image-super-resolution" || params.operation === "image-edit") {
      return { provider: "dashscope", transport: "dashscope-image" };
    }
    const candidates = resolveImageProviderCandidates(params.model);
    // Tolerate a router that returns nothing instead of an empty list.
    return candidates?.[0] || null;
  }

  return null;
}
|
||||
|
||||
/**
 * Builds a minimal user descriptor from a task row.
 *
 * @param {{ user_id: number, enterprise_id?: (number|null) }} task - Task row.
 * @returns {{ id: number, enterpriseId: (number|null), accountType: string }}
 *   `accountType` is `"enterprise"` when `enterprise_id` is truthy, otherwise
 *   `"personal"`; a missing `enterprise_id` becomes `null`.
 */
function normalizeRecoveryUser(task) {
  return {
    id: task.user_id,
    enterpriseId: task.enterprise_id ?? null,
    accountType: task.enterprise_id ? "enterprise" : "personal",
  };
}
|
||||
|
||||
async function runStaleTaskCleanup() {
|
||||
if (taskStaleCleanupRunning) return;
|
||||
taskStaleCleanupRunning = true;
|
||||
try {
|
||||
async function recoverRunnablePollers() {
|
||||
await ensureTaskPollerStore();
|
||||
const staleInterval = `${Math.max(5, Math.ceil(POLLER_OWNER_STALE_MS / 1000))} seconds`;
|
||||
const { rows } = await pool.query(
|
||||
`UPDATE generation_tasks
|
||||
SET status = 'failed',
|
||||
error = 'Task timed out and was released automatically',
|
||||
completed_at = NOW(),
|
||||
updated_at = NOW()
|
||||
WHERE status IN ('pending', 'running')
|
||||
AND GREATEST(updated_at, COALESCE(last_poll_at, created_at)) < NOW() - ($1::int * INTERVAL '1 minute')
|
||||
RETURNING *`,
|
||||
[STALE_TASK_TIMEOUT_MINUTES],
|
||||
`
|
||||
SELECT p.task_id
|
||||
FROM generation_task_pollers p
|
||||
JOIN generation_tasks t ON t.id = p.task_id
|
||||
WHERE t.status IN ('pending', 'running')
|
||||
AND (
|
||||
p.owner_heartbeat_at IS NULL
|
||||
OR p.owner_heartbeat_at < NOW() - ($1::text)::interval
|
||||
)
|
||||
ORDER BY p.owner_heartbeat_at NULLS FIRST, p.updated_at ASC
|
||||
LIMIT 20
|
||||
`,
|
||||
[staleInterval],
|
||||
);
|
||||
|
||||
for (const row of rows) {
|
||||
await stopPolling(row.id, { releaseLease: true });
|
||||
taskEvents.emit(`task:${row.id}`, formatTaskEvent(row));
|
||||
await refundTaskBillingOnFailure(row.id).catch((err) => {
|
||||
console.error(`[aiTaskWorker] stale task refund error for task ${row.id}:`, err.message);
|
||||
});
|
||||
const taskId = row.task_id;
|
||||
if (activePollers.has(taskId)) continue;
|
||||
const poller = await claimPoller(taskId);
|
||||
if (!poller || activePollers.has(taskId)) continue;
|
||||
|
||||
const apiKey = await getLeaseKey(poller.lease_token);
|
||||
if (apiKey == null) {
|
||||
console.warn(`[aiTaskWorker] cannot recover task ${taskId}: active lease not found`);
|
||||
continue;
|
||||
}
|
||||
|
||||
console.info(`[aiTaskWorker] recovering poller for task ${taskId}`);
|
||||
startPolling(taskId, {
|
||||
providerTaskId: poller.provider_task_id,
|
||||
apiKey,
|
||||
type: poller.task_type,
|
||||
providerConfig: parseProviderConfig(poller.provider_config_json),
|
||||
leaseToken: poller.lease_token,
|
||||
keyManager: require("./keyManager"),
|
||||
skipPersist: true,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
// --- Periodic stale task cleanup ---
|
||||
// Runs every 5 minutes, marks tasks stuck in 'pending'/'running' for too long as 'failed'.
|
||||
// This catches cases where the worker crashed, the provider API never responded,
|
||||
// or the cancel request failed silently on the client side.
|
||||
const STALE_TASK_CLEANUP_INTERVAL_MS = 5 * 60 * 1000;
|
||||
let staleTaskCleanupTimer = null;
|
||||
|
||||
async function runStaleTaskCleanup() {
|
||||
try {
|
||||
const { rows } = await pool.query(
|
||||
`UPDATE generation_tasks
|
||||
SET status = 'failed', error = '任务超时自动释放', updated_at = NOW()
|
||||
WHERE status IN ('pending', 'running')
|
||||
AND GREATEST(updated_at, COALESCE(last_poll_at, created_at)) < NOW() - INTERVAL '10 minutes'
|
||||
RETURNING id`,
|
||||
);
|
||||
for (const row of rows) {
|
||||
await publishTaskEvent({
|
||||
taskId: row.id,
|
||||
status: "failed",
|
||||
progress: null,
|
||||
resultUrl: null,
|
||||
error: "任务超时自动释放",
|
||||
});
|
||||
// Also stop any active poller for this task
|
||||
const poller = activePollers.get(row.id);
|
||||
if (poller) {
|
||||
clearInterval(poller.interval);
|
||||
activePollers.delete(row.id);
|
||||
}
|
||||
await clearPollerState(row.id).catch(() => {});
|
||||
}
|
||||
if (rows.length > 0) {
|
||||
console.log(`[aiTaskWorker] Cleaned up ${rows.length} stale task(s)`);
|
||||
}
|
||||
} catch (err) {
|
||||
console.error("[aiTaskWorker] Stale task cleanup failed:", err.message);
|
||||
} finally {
|
||||
taskStaleCleanupRunning = false;
|
||||
}
|
||||
}
|
||||
|
||||
async function runResultPersistenceRetry() {
|
||||
if (taskResultPersistenceRetryRunning) return;
|
||||
taskResultPersistenceRetryRunning = true;
|
||||
async function startTaskEventListener() {
|
||||
if (taskEventListenerClient) return;
|
||||
if (taskEventListenerStarting) return taskEventListenerStarting;
|
||||
|
||||
taskEventListenerStarting = (async () => {
|
||||
const client = await pool.connect();
|
||||
let released = false;
|
||||
|
||||
const releaseClient = () => {
|
||||
if (released) return;
|
||||
released = true;
|
||||
taskEventListenerClient = null;
|
||||
try {
|
||||
const { rows } = await pool.query(
|
||||
`SELECT *
|
||||
FROM generation_tasks
|
||||
WHERE status = 'completed'
|
||||
AND result_url IS NOT NULL
|
||||
AND result_url ~* '^https?://'
|
||||
AND result_url !~* '/users/[^/]+/generation-results/'
|
||||
AND result_persist_status IN ('pending', 'failed')
|
||||
AND result_persist_attempts < $1
|
||||
ORDER BY updated_at ASC
|
||||
LIMIT $2`,
|
||||
[RESULT_PERSIST_RETRY_LIMIT, RESULT_PERSIST_RETRY_BATCH_SIZE],
|
||||
);
|
||||
client.release();
|
||||
} catch {}
|
||||
};
|
||||
|
||||
for (const row of rows) {
|
||||
await persistTaskResultUrlToOss(row);
|
||||
}
|
||||
|
||||
if (rows.length > 0) {
|
||||
console.log(`[aiTaskWorker] Retried OSS result persistence for ${rows.length} task(s)`);
|
||||
}
|
||||
client.on("notification", (message) => {
|
||||
if (message.channel !== TASK_EVENT_CHANNEL || !message.payload) return;
|
||||
try {
|
||||
const payload = JSON.parse(message.payload);
|
||||
if (payload?.origin === TASK_EVENT_ORIGIN) return;
|
||||
emitTaskEvent(payload?.event || payload);
|
||||
} catch (err) {
|
||||
console.error("[aiTaskWorker] Result persistence retry failed:", err.message);
|
||||
} finally {
|
||||
taskResultPersistenceRetryRunning = false;
|
||||
console.error("[aiTaskWorker] task event notification parse failed:", err.message);
|
||||
}
|
||||
}
|
||||
|
||||
async function runTaskStartupRecovery() {
|
||||
if (taskStartupRecoveryRunning) return;
|
||||
taskStartupRecoveryRunning = true;
|
||||
try {
|
||||
const { rows } = await pool.query(
|
||||
`SELECT gt.*, u.enterprise_id
|
||||
FROM generation_tasks gt
|
||||
JOIN users u ON u.id = gt.user_id
|
||||
WHERE gt.status = 'running'
|
||||
AND gt.provider_task_id IS NOT NULL
|
||||
AND GREATEST(gt.updated_at, COALESCE(gt.last_poll_at, gt.created_at)) >= NOW() - ($1::int * INTERVAL '1 minute')
|
||||
ORDER BY gt.updated_at DESC
|
||||
LIMIT $2`,
|
||||
[STALE_TASK_TIMEOUT_MINUTES, TASK_STARTUP_RECOVERY_LIMIT],
|
||||
);
|
||||
|
||||
let recovered = 0;
|
||||
for (const task of rows) {
|
||||
if (activePollers.has(task.id)) continue;
|
||||
|
||||
let providerConfig;
|
||||
try {
|
||||
providerConfig = resolveProviderConfigForRecovery(task);
|
||||
} catch (err) {
|
||||
console.warn(`[aiTaskWorker] task ${task.id} recovery skipped: ${err.message}`);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!providerConfig?.provider) continue;
|
||||
const slotResult = await keyManager.acquireKey(providerConfig.provider, normalizeRecoveryUser(task), null, { waitTimeoutMs: 0 });
|
||||
if (!slotResult) {
|
||||
console.warn(`[aiTaskWorker] task ${task.id} recovery waiting for provider capacity: ${providerConfig.provider}`);
|
||||
continue;
|
||||
}
|
||||
|
||||
startPolling(task.id, {
|
||||
providerTaskId: task.provider_task_id,
|
||||
apiKey: slotResult.apiKey,
|
||||
type: task.type,
|
||||
providerConfig,
|
||||
leaseToken: slotResult.leaseToken,
|
||||
keyManager,
|
||||
});
|
||||
recovered += 1;
|
||||
}
|
||||
|
||||
if (recovered > 0) {
|
||||
console.log(`[aiTaskWorker] Recovered ${recovered} running task poller(s) after startup`);
|
||||
}
|
||||
} catch (err) {
|
||||
console.error("[aiTaskWorker] Startup task recovery failed:", err.message);
|
||||
client.on("error", (err) => {
|
||||
console.error("[aiTaskWorker] task event listener error:", err.message);
|
||||
releaseClient();
|
||||
setTimeout(() => {
|
||||
startTaskEventListener().catch((restartErr) => {
|
||||
console.error("[aiTaskWorker] task event listener restart failed:", restartErr.message);
|
||||
});
|
||||
}, 5000).unref?.();
|
||||
});
|
||||
|
||||
await client.query(`LISTEN ${TASK_EVENT_CHANNEL}`);
|
||||
taskEventListenerClient = client;
|
||||
console.log(`[aiTaskWorker] listening for task events on ${TASK_EVENT_CHANNEL}`);
|
||||
})();
|
||||
|
||||
try {
|
||||
await taskEventListenerStarting;
|
||||
} finally {
|
||||
taskStartupRecoveryRunning = false;
|
||||
taskEventListenerStarting = null;
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Stops listening for cross-process task events and returns the dedicated
 * client to the pool.
 *
 * The module-level client reference is cleared first, so repeated calls are
 * no-ops. A failing `UNLISTEN` is ignored on purpose: the client is released
 * either way.
 *
 * @returns {Promise<void>}
 */
async function stopTaskEventListener() {
  const client = taskEventListenerClient;
  taskEventListenerClient = null;
  if (!client) return;

  try {
    await client.query(`UNLISTEN ${TASK_EVENT_CHANNEL}`);
  } catch {
    // Best effort: the connection may already be unusable during shutdown.
  }
  client.release();
}
|
||||
|
||||
function startStaleTaskCleanup() {
|
||||
if (staleTaskCleanupTimer) return;
|
||||
staleTaskCleanupTimer = setInterval(runStaleTaskCleanup, STALE_TASK_CLEANUP_INTERVAL_MS);
|
||||
taskResultPersistenceRetryTimer = setInterval(runResultPersistenceRetry, TASK_RESULT_PERSIST_RETRY_INTERVAL_MS);
|
||||
taskStartupRecoveryTimer = setTimeout(runTaskStartupRecovery, 5_000);
|
||||
// Run once shortly after startup
|
||||
setTimeout(runStaleTaskCleanup, 10_000);
|
||||
setTimeout(runResultPersistenceRetry, 15_000);
|
||||
}
|
||||
|
||||
function stopStaleTaskCleanup() {
|
||||
@@ -979,20 +1040,30 @@ function stopStaleTaskCleanup() {
|
||||
clearInterval(staleTaskCleanupTimer);
|
||||
staleTaskCleanupTimer = null;
|
||||
}
|
||||
if (taskResultPersistenceRetryTimer) {
|
||||
clearInterval(taskResultPersistenceRetryTimer);
|
||||
taskResultPersistenceRetryTimer = null;
|
||||
}
|
||||
if (taskStartupRecoveryTimer) {
|
||||
clearTimeout(taskStartupRecoveryTimer);
|
||||
taskStartupRecoveryTimer = null;
|
||||
}
|
||||
|
||||
/**
 * Starts periodic recovery of pollers whose owner stopped heartbeating.
 *
 * Runs one recovery pass immediately (after making sure the poller table
 * exists) and then repeats it on a timer. Calling it again while the timer is
 * active does nothing. Failures are logged and never thrown.
 *
 * @returns {void}
 */
function startPollerRecovery() {
  if (pollerRecoveryTimer) return;

  ensureTaskPollerStore()
    .then(() => recoverRunnablePollers())
    .catch((err) => console.error("[aiTaskWorker] initial poller recovery failed:", err.message));

  // setInterval coerces NaN, zero and negative delays to 1 ms, which would
  // turn a bad env value into a tight database loop; use the 30 s default.
  const intervalMs =
    Number.isFinite(POLLER_RECOVERY_INTERVAL_MS) && POLLER_RECOVERY_INTERVAL_MS > 0
      ? POLLER_RECOVERY_INTERVAL_MS
      : 30_000;

  pollerRecoveryTimer = setInterval(() => {
    recoverRunnablePollers().catch((err) => {
      console.error("[aiTaskWorker] poller recovery failed:", err.message);
    });
  }, intervalMs);
}
|
||||
|
||||
/**
 * Stops the periodic poller recovery timer, if one is running.
 *
 * @returns {void}
 */
function stopPollerRecovery() {
  if (!pollerRecoveryTimer) return;
  clearInterval(pollerRecoveryTimer);
  pollerRecoveryTimer = null;
}
|
||||
|
||||
module.exports = {
|
||||
startPolling,
|
||||
stopPolling,
|
||||
cancelTask,
|
||||
updateTaskInDb,
|
||||
getActiveCount,
|
||||
extractProviderTaskId,
|
||||
@@ -1002,6 +1073,10 @@ module.exports = {
|
||||
parseKlingCredential,
|
||||
createKlingJwt,
|
||||
taskEvents,
|
||||
startTaskEventListener,
|
||||
stopTaskEventListener,
|
||||
startPollerRecovery,
|
||||
stopPollerRecovery,
|
||||
startStaleTaskCleanup,
|
||||
stopStaleTaskCleanup,
|
||||
};
|
||||
|
||||
@@ -353,18 +353,6 @@ async function migrateGenerationTasksBillingColumns(client) {
|
||||
);
|
||||
}
|
||||
|
||||
/**
 * Migration: adds result-persistence tracking to `generation_tasks`.
 *
 * Adds the status, attempt counter, error and timestamp columns through
 * `addColumnIfMissing`, then creates a partial index over
 * `(result_persist_status, updated_at)` restricted to completed tasks that
 * have a result URL.
 *
 * @param {{ query: Function }} client - Database client used for the index DDL.
 * @returns {Promise<void>}
 */
async function migrateGenerationTaskResultPersistence(client) {
  await addColumnIfMissing("generation_tasks", "result_persist_status TEXT NOT NULL DEFAULT 'pending'");
  await addColumnIfMissing("generation_tasks", "result_persist_attempts INTEGER NOT NULL DEFAULT 0");
  await addColumnIfMissing("generation_tasks", "result_persist_error TEXT");
  await addColumnIfMissing("generation_tasks", "result_persisted_at TIMESTAMPTZ");
  await client.query(`
    CREATE INDEX IF NOT EXISTS idx_generation_tasks_result_persist_retry
      ON generation_tasks(result_persist_status, updated_at)
      WHERE status = 'completed' AND result_url IS NOT NULL
  `);
}
|
||||
|
||||
async function ensureModelPriceSeed() {
|
||||
const columns = await getColumnNames("model_prices");
|
||||
const useMills = columns.includes("input_price_mills");
|
||||
@@ -971,7 +959,6 @@ async function ensureSchema() {
|
||||
await runMigration("030_generation_tasks_user_status_index", migrateGenerationTasksUserStatusIndex);
|
||||
await runMigration("031_generation_tasks_billing_columns", migrateGenerationTasksBillingColumns);
|
||||
await runMigration("032_ecommerce_video_history", migrateEcommerceVideoHistorySchema);
|
||||
await runMigration("033_generation_task_result_persistence", migrateGenerationTaskResultPersistence);
|
||||
await ensureModelPriceSeed();
|
||||
}
|
||||
|
||||
|
||||
+6
-1
@@ -144,7 +144,9 @@ async function main() {
|
||||
startSettlementWorker()
|
||||
startProviderHealthMonitor()
|
||||
|
||||
const { startStaleTaskCleanup } = require('./aiTaskWorker')
|
||||
const { startStaleTaskCleanup, startTaskEventListener, startPollerRecovery } = require('./aiTaskWorker')
|
||||
await startTaskEventListener()
|
||||
startPollerRecovery()
|
||||
startStaleTaskCleanup()
|
||||
|
||||
server = app.listen(PORT, HOST, () => {
|
||||
@@ -183,6 +185,9 @@ function gracefulShutdown(signal) {
|
||||
console.log('[shutdown] Server closed, cleaning up...')
|
||||
const { stopProviderHealthMonitor } = require('./providerHealthMonitor')
|
||||
stopProviderHealthMonitor()
|
||||
const { stopTaskEventListener, stopPollerRecovery } = require('./aiTaskWorker')
|
||||
stopPollerRecovery()
|
||||
void stopTaskEventListener()
|
||||
const { pool } = require('./db')
|
||||
pool.end().then(() => {
|
||||
console.log('[shutdown] Database pool closed')
|
||||
|
||||
+2
-3
@@ -284,7 +284,7 @@ async function releaseLeaseInternal(leaseToken, user, options = {}) {
|
||||
const { rows } = await client.query(
|
||||
`
|
||||
WITH candidate AS (
|
||||
SELECT l.id, l.key_id, l.user_id, l.enterprise_id, k.provider
|
||||
SELECT l.id, l.key_id, k.provider
|
||||
FROM key_leases l
|
||||
JOIN api_keys k ON k.id = l.key_id
|
||||
WHERE l.lease_token = $1 AND l.released_at IS NULL
|
||||
@@ -298,7 +298,6 @@ async function releaseLeaseInternal(leaseToken, user, options = {}) {
|
||||
RETURNING id, key_id
|
||||
)
|
||||
SELECT r.id, r.key_id, c.provider
|
||||
, c.user_id, c.enterprise_id
|
||||
FROM released r
|
||||
JOIN candidate c ON c.key_id = r.key_id
|
||||
`,
|
||||
@@ -340,7 +339,7 @@ async function releaseLeaseInternal(leaseToken, user, options = {}) {
|
||||
INSERT INTO usage_logs (user_id, enterprise_id, provider, key_id, action)
|
||||
VALUES ($1, $2, (SELECT provider FROM api_keys WHERE id = $3), $4, $5)
|
||||
`,
|
||||
[userId ?? lease.user_id, enterpriseId ?? lease.enterprise_id, lease.key_id, lease.key_id, "release"],
|
||||
[userId, enterpriseId, lease.key_id, lease.key_id, "release"],
|
||||
);
|
||||
|
||||
return {
|
||||
|
||||
+6
-4
@@ -16,7 +16,6 @@ const {
|
||||
} = require("../enterpriseVideoBilling");
|
||||
const {
|
||||
startPolling,
|
||||
cancelTask,
|
||||
updateTaskInDb,
|
||||
extractProviderTaskId,
|
||||
extractImageUrl,
|
||||
@@ -1771,9 +1770,12 @@ function registerAiRoutes(router) {
|
||||
if (!Number.isFinite(taskId)) return res.status(400).json({ error: "Invalid task id" });
|
||||
|
||||
try {
|
||||
const task = await cancelTask(taskId, req.user.id);
|
||||
if (!task) return res.status(404).json({ error: "Task not found or not in active state" });
|
||||
res.json({ id: task.id, status: task.status });
|
||||
const { rows } = await pool.query(
|
||||
"UPDATE generation_tasks SET status = 'cancelled', updated_at = NOW() WHERE id = $1 AND user_id = $2 AND status IN ('pending', 'running') RETURNING id, status",
|
||||
[taskId, req.user.id],
|
||||
);
|
||||
if (rows.length === 0) return res.status(404).json({ error: "Task not found or not in active state" });
|
||||
res.json({ id: rows[0].id, status: rows[0].status });
|
||||
} catch (err) {
|
||||
console.error("[ai/task-cancel] error:", err.message);
|
||||
res.status(500).json({ error: "取消任务失败" });
|
||||
|
||||
Reference in New Issue
Block a user