openclaw/secure/documents.ts
Claude b5d78db832
feat: add document analysis + PostgreSQL/Redis persistence
- Add document analysis for PDFs, text, code files (up to 20MB)
- Add PostgreSQL storage for task persistence (survives restarts)
- Add Redis for conversation caching (24hr TTL)
- Create storage.ts abstraction layer with fallback to memory
- Update scheduler to persist tasks to database
- Update config with DATABASE_URL and REDIS_URL support
- Add railway.toml for Railway deployment
- Update README with new architecture and features

https://claude.ai/code/session_015VqJ7gN4vaxtYfYc92UjLs
2026-01-30 07:13:06 +00:00

121 lines
2.8 KiB
TypeScript

/**
* AssureBot - Document Analysis
*
* Extract text from various document formats for AI analysis.
*/
export type DocumentResult = {
text: string;
pageCount?: number;
format: string;
truncated: boolean;
};
const MAX_TEXT_LENGTH = 50000; // ~12k tokens
/**
* Extract text from a buffer based on mime type
*/
export async function extractText(
buffer: Buffer,
mimeType: string,
filename?: string
): Promise<DocumentResult> {
const ext = filename?.split(".").pop()?.toLowerCase();
// Plain text files
if (
mimeType.startsWith("text/") ||
ext === "txt" ||
ext === "md" ||
ext === "json" ||
ext === "xml" ||
ext === "csv" ||
ext === "log"
) {
return extractPlainText(buffer);
}
// PDF
if (mimeType === "application/pdf" || ext === "pdf") {
return extractPdf(buffer);
}
// Code files (treat as text)
const codeExtensions = [
"js", "ts", "jsx", "tsx", "py", "rb", "go", "rs", "java",
"c", "cpp", "h", "hpp", "cs", "php", "swift", "kt", "scala",
"sh", "bash", "zsh", "yaml", "yml", "toml", "ini", "env",
"sql", "graphql", "html", "css", "scss", "less"
];
if (ext && codeExtensions.includes(ext)) {
return extractPlainText(buffer, ext);
}
// Unsupported format
return {
text: `[Unsupported document format: ${mimeType}${ext ? ` (.${ext})` : ""}]`,
format: "unsupported",
truncated: false,
};
}
/**
* Extract plain text
*/
function extractPlainText(buffer: Buffer, format = "text"): DocumentResult {
let text = buffer.toString("utf-8");
let truncated = false;
if (text.length > MAX_TEXT_LENGTH) {
text = text.slice(0, MAX_TEXT_LENGTH) + "\n\n[... truncated ...]";
truncated = true;
}
return { text, format, truncated };
}
/**
* Extract text from PDF using pdf-parse
*/
async function extractPdf(buffer: Buffer): Promise<DocumentResult> {
try {
// Dynamic import to avoid bundling issues
const pdfParse = await import("pdf-parse").then(m => m.default);
const data = await pdfParse(buffer);
let text = data.text;
let truncated = false;
if (text.length > MAX_TEXT_LENGTH) {
text = text.slice(0, MAX_TEXT_LENGTH) + "\n\n[... truncated ...]";
truncated = true;
}
return {
text,
pageCount: data.numpages,
format: "pdf",
truncated,
};
} catch (err) {
const msg = err instanceof Error ? err.message : String(err);
return {
text: `[Failed to parse PDF: ${msg}]`,
format: "pdf-error",
truncated: false,
};
}
}
/**
* Summarize document metadata for logging
*/
export function summarizeDocument(result: DocumentResult): string {
const parts = [result.format.toUpperCase()];
if (result.pageCount) parts.push(`${result.pageCount} pages`);
parts.push(`${result.text.length} chars`);
if (result.truncated) parts.push("truncated");
return parts.join(", ");
}