- Add document analysis for PDFs, text, code files (up to 20MB)
- Add PostgreSQL storage for task persistence (survives restarts)
- Add Redis for conversation caching (24hr TTL)
- Create storage.ts abstraction layer with fallback to memory
- Update scheduler to persist tasks to database
- Update config with DATABASE_URL and REDIS_URL support
- Add railway.toml for Railway deployment
- Update README with new architecture and features

https://claude.ai/code/session_015VqJ7gN4vaxtYfYc92UjLs
121 lines
2.8 KiB
TypeScript
121 lines
2.8 KiB
TypeScript
/**
 * AssureBot - Document Analysis
 *
 * Extract text from various document formats for AI analysis.
 */
|
|
|
|
/**
 * Outcome of a text-extraction attempt on a single document.
 *
 * Failures are reported in-band: unsupported formats and PDF parse errors
 * still produce a result whose `text` is a bracketed placeholder message.
 */
export type DocumentResult = {
  /** Extracted text, or a bracketed placeholder such as `[Unsupported document format: ...]`. */
  text: string;
  /** Page count; only populated by the PDF extractor. */
  pageCount?: number;
  /**
   * Label for how the content was handled: `"text"`, `"pdf"`, `"pdf-error"`,
   * `"unsupported"`, or the file extension for recognised code files.
   */
  format: string;
  /** True when the text exceeded the length limit and was cut, with a truncation marker appended. */
  truncated: boolean;
};
|
|
|
|
/** Upper bound on the length of extracted text kept per document (~12k tokens). */
const MAX_TEXT_LENGTH = 50_000;
|
|
|
|
/** Extensions (lower-case, no dot) decoded as plain text and labelled `"text"`. */
const PLAIN_TEXT_EXTENSIONS: ReadonlySet<string> = new Set([
  "txt", "md", "json", "xml", "csv", "log",
]);

/** Source-code and config extensions; decoded as plain text and labelled with the extension itself. */
const CODE_EXTENSIONS: ReadonlySet<string> = new Set([
  "js", "ts", "jsx", "tsx", "py", "rb", "go", "rs", "java",
  "c", "cpp", "h", "hpp", "cs", "php", "swift", "kt", "scala",
  "sh", "bash", "zsh", "yaml", "yml", "toml", "ini", "env",
  "sql", "graphql", "html", "css", "scss", "less",
]);

/** MIME types outside `text/*` whose payload is nevertheless textual. */
const TEXTUAL_MIME_TYPES: ReadonlySet<string> = new Set([
  "application/json",
  "application/xml",
]);

/**
 * Return the lower-cased extension of a file name, or `undefined` when there is none.
 *
 * Only the final path segment is inspected, and a name without any dot
 * (e.g. `README`) has no extension. A leading-dot name such as `.env`
 * yields `"env"`.
 */
function getFileExtension(filename?: string): string | undefined {
  if (!filename) return undefined;
  const lastSeparator = Math.max(filename.lastIndexOf("/"), filename.lastIndexOf("\\"));
  const baseName = filename.slice(lastSeparator + 1);
  const dot = baseName.lastIndexOf(".");
  if (dot === -1) return undefined;
  const ext = baseName.slice(dot + 1).toLowerCase();
  return ext === "" ? undefined : ext;
}

/**
 * Extract text from a buffer based on its MIME type and, optionally, its file name.
 *
 * Dispatch order: plain text (by MIME type or extension), then PDF, then
 * known code/config extensions. Anything else yields a placeholder result
 * with format `"unsupported"` instead of throwing.
 *
 * @param buffer - Raw document bytes.
 * @param mimeType - Reported MIME type; matched case-insensitively, ignoring any `;` parameters.
 * @param filename - Optional file name used to derive an extension as a fallback signal.
 * @returns The extraction result; for code files `format` is the file extension.
 */
export async function extractText(
  buffer: Buffer,
  mimeType: string,
  filename?: string
): Promise<DocumentResult> {
  const ext = getFileExtension(filename);
  // Strip parameters such as "; charset=utf-8" and normalise case before matching.
  const mime = (mimeType.split(";", 1)[0] ?? "").trim().toLowerCase();

  // Plain text files
  if (
    mime.startsWith("text/") ||
    TEXTUAL_MIME_TYPES.has(mime) ||
    (ext !== undefined && PLAIN_TEXT_EXTENSIONS.has(ext))
  ) {
    return extractPlainText(buffer);
  }

  // PDF
  if (mime === "application/pdf" || ext === "pdf") {
    return extractPdf(buffer);
  }

  // Code files (treat as text, keeping the extension as the format label)
  if (ext !== undefined && CODE_EXTENSIONS.has(ext)) {
    return extractPlainText(buffer, ext);
  }

  // Unsupported format: report in-band using the MIME type exactly as received.
  return {
    text: `[Unsupported document format: ${mimeType}${ext ? ` (.${ext})` : ""}]`,
    format: "unsupported",
    truncated: false,
  };
}
|
|
|
|
/**
 * Decode a buffer as UTF-8 text, truncating it when it exceeds the length limit.
 *
 * @param buffer - Raw bytes to decode as UTF-8.
 * @param format - Label stored in the result's `format` field; defaults to `"text"`.
 * @param maxLength - Maximum string length kept before the truncation marker
 *   is appended; defaults to `MAX_TEXT_LENGTH`.
 * @returns The decoded (possibly truncated) text with its format label and truncation flag.
 */
function extractPlainText(
  buffer: Buffer,
  format = "text",
  maxLength = MAX_TEXT_LENGTH
): DocumentResult {
  const decoded = buffer.toString("utf-8");

  if (decoded.length <= maxLength) {
    return { text: decoded, format, truncated: false };
  }

  let end = Math.max(0, maxLength);
  // String length counts UTF-16 code units. If the cut would land between the
  // two halves of a surrogate pair, drop the dangling high surrogate so the
  // result never ends in half a character.
  const lastUnit = decoded.charCodeAt(end - 1);
  if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
    end -= 1;
  }

  return {
    text: decoded.slice(0, end) + "\n\n[... truncated ...]",
    format,
    truncated: true,
  };
}
|
|
|
|
/**
 * Extract text from a PDF using pdf-parse.
 *
 * Never throws: any failure while loading the parser or parsing the buffer is
 * reported in-band as a result with format `"pdf-error"` and a bracketed
 * message in `text`.
 *
 * @param buffer - Raw PDF bytes.
 * @returns The extracted (possibly truncated) text and page count, or an error placeholder.
 */
async function extractPdf(buffer: Buffer): Promise<DocumentResult> {
  try {
    // Dynamic import to avoid bundling issues
    const { default: pdfParse } = await import("pdf-parse");
    const data = await pdfParse(buffer);

    let text = data.text;
    let truncated = false;

    if (text.length > MAX_TEXT_LENGTH) {
      let end = MAX_TEXT_LENGTH;
      // Avoid cutting a UTF-16 surrogate pair in half, which would leave a
      // lone surrogate at the end of the kept text.
      const lastUnit = text.charCodeAt(end - 1);
      if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
        end -= 1;
      }
      text = text.slice(0, end) + "\n\n[... truncated ...]";
      truncated = true;
    }

    return {
      text,
      pageCount: data.numpages,
      format: "pdf",
      truncated,
    };
  } catch (err) {
    // Deliberate best-effort: surface the failure as a placeholder result.
    const msg = err instanceof Error ? err.message : String(err);
    return {
      text: `[Failed to parse PDF: ${msg}]`,
      format: "pdf-error",
      truncated: false,
    };
  }
}
|
|
|
|
/**
 * Summarize document metadata for logging.
 *
 * Produces a comma-separated line: the upper-cased format, the page count
 * (when present and non-zero), the text length, and a `truncated` marker
 * when applicable — e.g. `PDF, 3 pages, 1200 chars, truncated`.
 *
 * @param result - Extraction result to describe.
 * @returns A one-line human-readable summary.
 */
export function summarizeDocument(result: DocumentResult): string {
  // Pluralise the unit so single counts read "1 page" / "1 char".
  const withUnit = (count: number, unit: string): string =>
    `${count} ${unit}${count === 1 ? "" : "s"}`;

  const parts = [result.format.toUpperCase()];
  // Truthiness check: a missing or zero page count is left out of the summary.
  if (result.pageCount) parts.push(withUnit(result.pageCount, "page"));
  parts.push(withUnit(result.text.length, "char"));
  if (result.truncated) parts.push("truncated");
  return parts.join(", ");
}
|