openclaw/secure/documents.ts
Claude 095d476acc
feat: add persistent personality, Piston API sandbox, and storage layer
- Add personality engine with learning from conversations
  - Tracks user preferences, interests, and communication style
  - Persists to Redis (cache) + PostgreSQL (durable)
  - Generates personalized system prompts per user

- Add Piston API fallback for sandbox execution
  - Auto-detects backend: Docker → Piston API → none
  - Supports 15+ languages via free cloud execution
  - Works on Railway and other managed platforms

- Add storage layer with layered persistence
  - PostgreSQL for tasks, user profiles, personality traits
  - Redis for conversation cache and fast profile access
  - Graceful fallback to in-memory when not configured

- Update scheduler with task persistence
  - Loads tasks from PostgreSQL on startup
  - Saves task status after execution

- Add document analysis (PDF, text files)
- Add railway-template.json for one-click deployment
- Enable sandbox by default (Piston fallback)
- Add OpenRouter support (100+ models)

https://claude.ai/code/session_015VqJ7gN4vaxtYfYc92UjLs
2026-01-30 08:03:08 +00:00

121 lines
2.8 KiB
TypeScript

/**
* AssureBot - Document Analysis
*
* Extract text from various document formats for AI analysis.
*/
/**
 * Outcome of a text-extraction attempt on a single document.
 *
 * Failures and unsupported inputs are reported through this same shape
 * (with a bracketed placeholder in `text`) rather than by throwing.
 */
export type DocumentResult = {
/**
 * Extracted text. For unsupported formats or PDF parse failures this holds a
 * bracketed placeholder message instead of document content.
 */
text: string;
/** Number of pages; only populated by the PDF extractor on success. */
pageCount?: number;
/**
 * How the input was handled: "text", the file extension for code files,
 * "pdf", "pdf-error", or "unsupported".
 */
format: string;
/** True when the text exceeded MAX_TEXT_LENGTH and was cut, with a truncation marker appended. */
truncated: boolean;
};
// Maximum length (as measured by `String.length`) of extracted text that is
// kept. Longer text is cut at this length and a truncation marker is appended
// afterwards, so the returned string can be slightly longer than this value.
const MAX_TEXT_LENGTH = 50000; // ~12k tokens
/**
 * File extensions (lowercase, without the dot) that are read as plain text and
 * reported with format "text".
 */
const PLAIN_TEXT_EXTENSIONS: ReadonlySet<string> = new Set([
  "txt", "md", "json", "xml", "csv", "log",
]);

/**
 * Source-code and config file extensions that are read as plain text and
 * reported with the extension itself as the format.
 */
const CODE_EXTENSIONS: ReadonlySet<string> = new Set([
  "js", "ts", "jsx", "tsx", "py", "rb", "go", "rs", "java",
  "c", "cpp", "h", "hpp", "cs", "php", "swift", "kt", "scala",
  "sh", "bash", "zsh", "yaml", "yml", "toml", "ini", "env",
  "sql", "graphql", "html", "css", "scss", "less",
]);

/**
 * Return the lowercase extension of `filename` (without the dot), or
 * `undefined` when there is none.
 *
 * Only the final path segment is inspected, and a name without any dot
 * (e.g. "Makefile", "log") has no extension. Dotfiles keep their historical
 * behaviour: ".env" yields "env".
 */
function fileExtension(filename: string | undefined): string | undefined {
  if (!filename) return undefined;
  const lastSeparator = Math.max(filename.lastIndexOf("/"), filename.lastIndexOf("\\"));
  const baseName = filename.slice(lastSeparator + 1);
  const dotIndex = baseName.lastIndexOf(".");
  // No dot at all, or a trailing dot with nothing after it.
  if (dotIndex === -1 || dotIndex === baseName.length - 1) return undefined;
  return baseName.slice(dotIndex + 1).toLowerCase();
}

/**
 * Extract text from a buffer based on mime type, falling back to the file
 * extension when the mime type is not conclusive.
 *
 * @param buffer - Raw document bytes.
 * @param mimeType - Declared media type; compared case-insensitively with any
 *   `;` parameters ignored.
 * @param filename - Optional original file name, used only for its extension.
 * @returns The extraction result. Unsupported inputs resolve to a result with
 *   format "unsupported" and a bracketed placeholder as text.
 */
export async function extractText(
  buffer: Buffer,
  mimeType: string,
  filename?: string
): Promise<DocumentResult> {
  const ext = fileExtension(filename);
  // "Text/Plain; charset=utf-8" -> "text/plain"
  const mime = (mimeType.split(";")[0] ?? "").trim().toLowerCase();

  // Plain text files
  if (mime.startsWith("text/") || (ext !== undefined && PLAIN_TEXT_EXTENSIONS.has(ext))) {
    return extractPlainText(buffer);
  }

  // PDF
  if (mime === "application/pdf" || ext === "pdf") {
    return extractPdf(buffer);
  }

  // Code files (treat as text, but report the extension as the format)
  if (ext !== undefined && CODE_EXTENSIONS.has(ext)) {
    return extractPlainText(buffer, ext);
  }

  // Unsupported format: echo the mime type exactly as the caller supplied it.
  const extensionNote = ext !== undefined ? ` (.${ext})` : "";
  return {
    text: `[Unsupported document format: ${mimeType}${extensionNote}]`,
    format: "unsupported",
    truncated: false,
  };
}
/**
 * Cut `text` to at most `maxLength` UTF-16 code units and append a truncation
 * marker when anything was removed.
 *
 * The cut never lands between the two halves of a surrogate pair, so the
 * result does not end in a lone high surrogate.
 */
function truncateText(
  text: string,
  maxLength: number
): { text: string; truncated: boolean } {
  if (text.length <= maxLength) {
    return { text, truncated: false };
  }
  let end = Math.max(0, maxLength);
  const lastUnit = text.charCodeAt(end - 1);
  // 0xD800-0xDBFF is the high-surrogate range; its partner would be cut off.
  if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
    end -= 1;
  }
  return { text: text.slice(0, end) + "\n\n[... truncated ...]", truncated: true };
}

/**
 * Extract plain text by decoding the buffer as UTF-8.
 *
 * @param buffer - Raw file bytes.
 * @param format - Format label to report in the result.
 * @param maxLength - Maximum number of UTF-16 code units of text to keep.
 * @returns The decoded (and possibly truncated) text. A leading byte-order
 *   mark is removed.
 */
function extractPlainText(
  buffer: Buffer,
  format = "text",
  maxLength = MAX_TEXT_LENGTH
): DocumentResult {
  let decoded = buffer.toString("utf-8");
  // Buffer#toString keeps a UTF-8 BOM as U+FEFF; it is not document content.
  if (decoded.charCodeAt(0) === 0xfeff) {
    decoded = decoded.slice(1);
  }
  const { text, truncated } = truncateText(decoded, maxLength);
  return { text, format, truncated };
}

/**
 * Extract text from PDF using pdf-parse.
 *
 * @param buffer - Raw PDF bytes.
 * @param maxLength - Maximum number of UTF-16 code units of text to keep.
 * @returns On success, the text with format "pdf" and the page count. On any
 *   failure (import or parse), a result with format "pdf-error" whose text is
 *   a bracketed error message; this function does not throw.
 */
async function extractPdf(
  buffer: Buffer,
  maxLength = MAX_TEXT_LENGTH
): Promise<DocumentResult> {
  try {
    // Dynamic import to avoid bundling issues
    const { default: pdfParse } = await import("pdf-parse");
    const data = await pdfParse(buffer);
    const { text, truncated } = truncateText(data.text, maxLength);
    return {
      text,
      pageCount: data.numpages,
      format: "pdf",
      truncated,
    };
  } catch (err) {
    const msg = err instanceof Error ? err.message : String(err);
    return {
      text: `[Failed to parse PDF: ${msg}]`,
      format: "pdf-error",
      truncated: false,
    };
  }
}
/**
 * Format a count with its noun, adding "s" unless the count is exactly 1.
 */
function pluralize(count: number, noun: string): string {
  return `${count} ${noun}${count === 1 ? "" : "s"}`;
}

/**
 * Summarize document metadata for logging.
 *
 * Produces a comma-separated line such as "PDF, 3 pages, 1204 chars, truncated".
 *
 * @param result - Extraction result to describe.
 * @returns The upper-cased format, the page count (only when it is a positive
 *   number), the text length, and "truncated" when the text was cut.
 */
export function summarizeDocument(result: DocumentResult): string {
  const parts = [result.format.toUpperCase()];
  // A missing or zero page count is omitted.
  if (result.pageCount !== undefined && result.pageCount > 0) {
    parts.push(pluralize(result.pageCount, "page"));
  }
  parts.push(pluralize(result.text.length, "char"));
  if (result.truncated) {
    parts.push("truncated");
  }
  return parts.join(", ");
}