fix: Add fetch timeouts to prevent batch embedding SIGKILL (#4370)
Root cause: fetch() calls had no timeout and could hang indefinitely during batch status polling, eventually triggering SIGKILL.

Changes:
- Add fetchWithTimeout() helper built on AbortController
- Default 30s timeout for status checks (OPENAI_FETCH_TIMEOUT_MS)
- 60s timeout for file upload/download operations
- Add retry logic with exponential backoff for network failures
- Wrap batch status polling in retryAsync for transient errors

This ensures:
1. No indefinite hangs - all network calls time out
2. Graceful recovery from temporary network issues
3. Clear error messages on timeout
4. The process completes or fails within the expected timeframe

Fixes #4370
This commit is contained in:
parent
374a6339f1
commit
01f7518f91
@ -34,11 +34,36 @@ export type OpenAiBatchOutputLine = {
|
||||
// API path of the embeddings endpoint. NOTE(review): presumably passed as the
// batch job's target endpoint — its use is outside this view; confirm.
export const OPENAI_BATCH_ENDPOINT = "/v1/embeddings";
|
||||
// NOTE(review): looks like the completion window requested when a batch is
// created — the call site is not visible here; confirm.
const OPENAI_BATCH_COMPLETION_WINDOW = "24h";
|
||||
// NOTE(review): presumably the cap on requests per submitted batch — where it
// is enforced is not visible here; confirm.
const OPENAI_BATCH_MAX_REQUESTS = 50000;
|
||||
// Default deadline, in milliseconds, that fetchWithTimeout applies when the
// caller does not pass `timeoutMs`.
const OPENAI_FETCH_TIMEOUT_MS = 30000; // 30 second timeout for individual fetch calls
|
||||
|
||||
/**
 * Returns the client's base URL in a form that is safe to prefix onto a path.
 *
 * @param openAi - Client whose optional `baseUrl` is read.
 * @returns `baseUrl` with a single trailing `/` removed, or `""` when
 *   `baseUrl` is null or undefined.
 */
function getOpenAiBaseUrl(openAi: OpenAiEmbeddingClient): string {
  const configured = openAi.baseUrl;
  if (configured == null) {
    return "";
  }
  // Only one trailing slash is dropped; "x//" becomes "x/".
  return configured.endsWith("/") ? configured.slice(0, -1) : configured;
}
|
||||
|
||||
/**
 * Performs a `fetch` that is aborted when no response arrives within a deadline.
 *
 * The deadline covers only the wait for `fetch` to settle; the timer is cleared
 * as soon as it does, so reading the response body afterwards is not bounded.
 *
 * @param url - URL to request.
 * @param options - Standard `RequestInit` fields plus an optional `timeoutMs`
 *   (defaults to `OPENAI_FETCH_TIMEOUT_MS`). A caller-supplied `signal` is
 *   honoured in addition to the timeout. `timeoutMs` is not forwarded to `fetch`.
 * @returns The `Response` that `fetch` resolved with.
 * @throws Error with message `fetch timeout after <ms>ms: <url>` when the
 *   deadline fires. Any other rejection, including an abort triggered by the
 *   caller's own signal, is rethrown unchanged.
 */
async function fetchWithTimeout(
  url: string,
  options: RequestInit & { timeoutMs?: number },
): Promise<Response> {
  // Split our private option and the caller's signal away from the real init.
  const { timeoutMs: requestedTimeoutMs, signal: callerSignal, ...init } = options;
  const timeoutMs = requestedTimeoutMs ?? OPENAI_FETCH_TIMEOUT_MS;
  const controller = new AbortController();
  // Records whether the abort came from our timer, so that a caller-initiated
  // abort is never misreported as a timeout.
  let timedOut = false;

  // Chain the caller's signal onto ours instead of overwriting it.
  const forwardAbort = (): void => controller.abort();
  if (callerSignal) {
    if (callerSignal.aborted) {
      controller.abort();
    } else {
      callerSignal.addEventListener("abort", forwardAbort, { once: true });
    }
  }

  const timeoutId = setTimeout(() => {
    timedOut = true;
    controller.abort();
  }, timeoutMs);

  try {
    return await fetch(url, { ...init, signal: controller.signal });
  } catch (err) {
    if (timedOut) {
      throw new Error(`fetch timeout after ${timeoutMs}ms: ${url}`);
    }
    throw err;
  } finally {
    // Single cleanup point for both the success and the failure path.
    clearTimeout(timeoutId);
    callerSignal?.removeEventListener("abort", forwardAbort);
  }
}
|
||||
|
||||
function getOpenAiHeaders(
|
||||
openAi: OpenAiEmbeddingClient,
|
||||
params: { json: boolean },
|
||||
@ -79,10 +104,11 @@ async function submitOpenAiBatch(params: {
|
||||
`memory-embeddings.${hashText(String(Date.now()))}.jsonl`,
|
||||
);
|
||||
|
||||
const fileRes = await fetch(`${baseUrl}/files`, {
|
||||
const fileRes = await fetchWithTimeout(`${baseUrl}/files`, {
|
||||
method: "POST",
|
||||
headers: getOpenAiHeaders(params.openAi, { json: false }),
|
||||
body: form,
|
||||
timeoutMs: 60000, // 60s for file upload
|
||||
});
|
||||
if (!fileRes.ok) {
|
||||
const text = await fileRes.text();
|
||||
@ -95,7 +121,7 @@ async function submitOpenAiBatch(params: {
|
||||
|
||||
const batchRes = await retryAsync(
|
||||
async () => {
|
||||
const res = await fetch(`${baseUrl}/batches`, {
|
||||
const res = await fetchWithTimeout(`${baseUrl}/batches`, {
|
||||
method: "POST",
|
||||
headers: getOpenAiHeaders(params.openAi, { json: true }),
|
||||
body: JSON.stringify({
|
||||
@ -137,7 +163,7 @@ async function fetchOpenAiBatchStatus(params: {
|
||||
batchId: string;
|
||||
}): Promise<OpenAiBatchStatus> {
|
||||
const baseUrl = getOpenAiBaseUrl(params.openAi);
|
||||
const res = await fetch(`${baseUrl}/batches/${params.batchId}`, {
|
||||
const res = await fetchWithTimeout(`${baseUrl}/batches/${params.batchId}`, {
|
||||
headers: getOpenAiHeaders(params.openAi, { json: true }),
|
||||
});
|
||||
if (!res.ok) {
|
||||
@ -152,8 +178,9 @@ async function fetchOpenAiFileContent(params: {
|
||||
fileId: string;
|
||||
}): Promise<string> {
|
||||
const baseUrl = getOpenAiBaseUrl(params.openAi);
|
||||
const res = await fetch(`${baseUrl}/files/${params.fileId}/content`, {
|
||||
const res = await fetchWithTimeout(`${baseUrl}/files/${params.fileId}/content`, {
|
||||
headers: getOpenAiHeaders(params.openAi, { json: true }),
|
||||
timeoutMs: 60000, // 60s for file download
|
||||
});
|
||||
if (!res.ok) {
|
||||
const text = await res.text();
|
||||
@ -208,10 +235,24 @@ async function waitForOpenAiBatch(params: {
|
||||
while (true) {
|
||||
const status =
|
||||
current ??
|
||||
(await fetchOpenAiBatchStatus({
|
||||
openAi: params.openAi,
|
||||
batchId: params.batchId,
|
||||
}));
|
||||
(await retryAsync(
|
||||
async () =>
|
||||
await fetchOpenAiBatchStatus({
|
||||
openAi: params.openAi,
|
||||
batchId: params.batchId,
|
||||
}),
|
||||
{
|
||||
attempts: 3,
|
||||
minDelayMs: 500,
|
||||
maxDelayMs: 2000,
|
||||
jitter: 0.2,
|
||||
shouldRetry: (err) => {
|
||||
// Retry on network errors and timeout errors
|
||||
const message = err instanceof Error ? err.message : String(err);
|
||||
return message.includes("timeout") || message.includes("fetch failed");
|
||||
},
|
||||
},
|
||||
));
|
||||
const state = status.status ?? "unknown";
|
||||
if (state === "completed") {
|
||||
if (!status.output_file_id) {
|
||||
|
||||
Loading…
Reference in New Issue
Block a user