fix: Add fetch timeouts to prevent batch embedding SIGKILL (#4370)
Root cause: fetch() calls had no timeout and could hang indefinitely during batch status polling, eventually triggering SIGKILL.

Changes:
- Add fetchWithTimeout() helper built on AbortController
- Default 30s timeout for status checks (OPENAI_FETCH_TIMEOUT_MS)
- 60s timeout for file upload/download operations
- Add retry logic with exponential backoff for network failures
- Wrap batch status polling in retryAsync for transient errors

This ensures:
1. No indefinite hangs - all network calls time out
2. Graceful recovery from temporary network issues
3. Clear error messages on timeout
4. The process completes or fails within the expected timeframe

Fixes #4370
This commit is contained in:
parent
374a6339f1
commit
01f7518f91
@ -34,11 +34,36 @@ export type OpenAiBatchOutputLine = {
|
|||||||
export const OPENAI_BATCH_ENDPOINT = "/v1/embeddings";
|
export const OPENAI_BATCH_ENDPOINT = "/v1/embeddings";
|
||||||
const OPENAI_BATCH_COMPLETION_WINDOW = "24h";
|
const OPENAI_BATCH_COMPLETION_WINDOW = "24h";
|
||||||
const OPENAI_BATCH_MAX_REQUESTS = 50000;
|
const OPENAI_BATCH_MAX_REQUESTS = 50000;
|
||||||
|
const OPENAI_FETCH_TIMEOUT_MS = 30000; // 30 second timeout for individual fetch calls
|
||||||
|
|
||||||
function getOpenAiBaseUrl(openAi: OpenAiEmbeddingClient): string {
|
function getOpenAiBaseUrl(openAi: OpenAiEmbeddingClient): string {
|
||||||
return openAi.baseUrl?.replace(/\/$/, "") ?? "";
|
return openAi.baseUrl?.replace(/\/$/, "") ?? "";
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Calls `fetch` and aborts the request if it has not settled within a deadline.
 *
 * @param url - Request URL, passed to `fetch` unchanged.
 * @param options - Regular `fetch` init plus an optional `timeoutMs`.
 *   `timeoutMs` falls back to `OPENAI_FETCH_TIMEOUT_MS` when omitted and is
 *   stripped before the init object is handed to `fetch`. If `options.signal`
 *   is supplied, aborting it also aborts this request.
 * @returns The `Response` that `fetch` resolved with.
 * @throws Error with the message `fetch timeout after <timeoutMs>ms: <url>`
 *   when the deadline fires first. Any other rejection (network failure,
 *   caller-initiated abort) is rethrown unchanged.
 *
 * NOTE(review): the timer is cleared as soon as `fetch` settles, so reading the
 * body afterwards (`res.text()`, `res.json()`) is not covered by this deadline.
 */
async function fetchWithTimeout(
  url: string,
  options: RequestInit & { timeoutMs?: number },
): Promise<Response> {
  // Split our own option and the caller's signal from what goes to fetch().
  const { timeoutMs: requestedTimeoutMs, signal: callerSignal, ...init } = options;
  const timeoutMs = requestedTimeoutMs ?? OPENAI_FETCH_TIMEOUT_MS;

  const controller = new AbortController();
  // Record *why* the request was aborted so a caller-initiated abort is never
  // reported as a timeout.
  let timedOut = false;
  const timeoutId = setTimeout(() => {
    timedOut = true;
    controller.abort();
  }, timeoutMs);

  // Chain the caller's signal onto ours instead of silently replacing it.
  const forwardAbort = (): void => controller.abort();
  if (callerSignal) {
    if (callerSignal.aborted) {
      controller.abort();
    } else {
      callerSignal.addEventListener("abort", forwardAbort, { once: true });
    }
  }

  try {
    return await fetch(url, { ...init, signal: controller.signal });
  } catch (err) {
    if (timedOut) {
      // Message text is relied on by retry predicates that match "timeout".
      throw new Error(`fetch timeout after ${timeoutMs}ms: ${url}`);
    }
    throw err;
  } finally {
    // Runs on success and failure alike, so neither the timer nor the
    // listener outlives the request.
    clearTimeout(timeoutId);
    callerSignal?.removeEventListener("abort", forwardAbort);
  }
}
|
||||||
|
|
||||||
function getOpenAiHeaders(
|
function getOpenAiHeaders(
|
||||||
openAi: OpenAiEmbeddingClient,
|
openAi: OpenAiEmbeddingClient,
|
||||||
params: { json: boolean },
|
params: { json: boolean },
|
||||||
@ -79,10 +104,11 @@ async function submitOpenAiBatch(params: {
|
|||||||
`memory-embeddings.${hashText(String(Date.now()))}.jsonl`,
|
`memory-embeddings.${hashText(String(Date.now()))}.jsonl`,
|
||||||
);
|
);
|
||||||
|
|
||||||
const fileRes = await fetch(`${baseUrl}/files`, {
|
const fileRes = await fetchWithTimeout(`${baseUrl}/files`, {
|
||||||
method: "POST",
|
method: "POST",
|
||||||
headers: getOpenAiHeaders(params.openAi, { json: false }),
|
headers: getOpenAiHeaders(params.openAi, { json: false }),
|
||||||
body: form,
|
body: form,
|
||||||
|
timeoutMs: 60000, // 60s for file upload
|
||||||
});
|
});
|
||||||
if (!fileRes.ok) {
|
if (!fileRes.ok) {
|
||||||
const text = await fileRes.text();
|
const text = await fileRes.text();
|
||||||
@ -95,7 +121,7 @@ async function submitOpenAiBatch(params: {
|
|||||||
|
|
||||||
const batchRes = await retryAsync(
|
const batchRes = await retryAsync(
|
||||||
async () => {
|
async () => {
|
||||||
const res = await fetch(`${baseUrl}/batches`, {
|
const res = await fetchWithTimeout(`${baseUrl}/batches`, {
|
||||||
method: "POST",
|
method: "POST",
|
||||||
headers: getOpenAiHeaders(params.openAi, { json: true }),
|
headers: getOpenAiHeaders(params.openAi, { json: true }),
|
||||||
body: JSON.stringify({
|
body: JSON.stringify({
|
||||||
@ -137,7 +163,7 @@ async function fetchOpenAiBatchStatus(params: {
|
|||||||
batchId: string;
|
batchId: string;
|
||||||
}): Promise<OpenAiBatchStatus> {
|
}): Promise<OpenAiBatchStatus> {
|
||||||
const baseUrl = getOpenAiBaseUrl(params.openAi);
|
const baseUrl = getOpenAiBaseUrl(params.openAi);
|
||||||
const res = await fetch(`${baseUrl}/batches/${params.batchId}`, {
|
const res = await fetchWithTimeout(`${baseUrl}/batches/${params.batchId}`, {
|
||||||
headers: getOpenAiHeaders(params.openAi, { json: true }),
|
headers: getOpenAiHeaders(params.openAi, { json: true }),
|
||||||
});
|
});
|
||||||
if (!res.ok) {
|
if (!res.ok) {
|
||||||
@ -152,8 +178,9 @@ async function fetchOpenAiFileContent(params: {
|
|||||||
fileId: string;
|
fileId: string;
|
||||||
}): Promise<string> {
|
}): Promise<string> {
|
||||||
const baseUrl = getOpenAiBaseUrl(params.openAi);
|
const baseUrl = getOpenAiBaseUrl(params.openAi);
|
||||||
const res = await fetch(`${baseUrl}/files/${params.fileId}/content`, {
|
const res = await fetchWithTimeout(`${baseUrl}/files/${params.fileId}/content`, {
|
||||||
headers: getOpenAiHeaders(params.openAi, { json: true }),
|
headers: getOpenAiHeaders(params.openAi, { json: true }),
|
||||||
|
timeoutMs: 60000, // 60s for file download
|
||||||
});
|
});
|
||||||
if (!res.ok) {
|
if (!res.ok) {
|
||||||
const text = await res.text();
|
const text = await res.text();
|
||||||
@ -208,10 +235,24 @@ async function waitForOpenAiBatch(params: {
|
|||||||
while (true) {
|
while (true) {
|
||||||
const status =
|
const status =
|
||||||
current ??
|
current ??
|
||||||
(await fetchOpenAiBatchStatus({
|
(await retryAsync(
|
||||||
openAi: params.openAi,
|
async () =>
|
||||||
batchId: params.batchId,
|
await fetchOpenAiBatchStatus({
|
||||||
}));
|
openAi: params.openAi,
|
||||||
|
batchId: params.batchId,
|
||||||
|
}),
|
||||||
|
{
|
||||||
|
attempts: 3,
|
||||||
|
minDelayMs: 500,
|
||||||
|
maxDelayMs: 2000,
|
||||||
|
jitter: 0.2,
|
||||||
|
shouldRetry: (err) => {
|
||||||
|
// Retry on network errors and timeout errors
|
||||||
|
const message = err instanceof Error ? err.message : String(err);
|
||||||
|
return message.includes("timeout") || message.includes("fetch failed");
|
||||||
|
},
|
||||||
|
},
|
||||||
|
));
|
||||||
const state = status.status ?? "unknown";
|
const state = status.status ?? "unknown";
|
||||||
if (state === "completed") {
|
if (state === "completed") {
|
||||||
if (!status.output_file_id) {
|
if (!status.output_file_id) {
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user