Search
Code165
import { generateSnippet, cosineSimilarity } from "./utils.ts";// OpenAI embeddings functionexport const generateEmbeddings = async (content: string): Promise<number[] | null> => { const OPENAI_API_KEY = Deno.env.get("OPENAI_API_KEY"); const OPENAI_API_URL = "https://api.openai.com/v1/embeddings"; if (!OPENAI_API_KEY) { console.warn("OPENAI_API_KEY not found - embeddings disabled"); return null; } } catch (error) { const errorMessage = error instanceof Error ? error.message : String(error); console.error("OpenAI embeddings failed:", errorMessage); return null; }};// No caching for query embeddings - we want to benchmark actual API performanceexport const searchStrategy: SearchStrategy = { name: "openai-cosine", description: "Semantic search using OpenAI embeddings with direct cosine similarity (fastest for small datasets)", search: async (query: string, pages: Page[], options: SearchOptions = {}): Promise<SearchResult[]> => { const limit = options.limit || 10; // Generate query embedding (no caching - benchmarking actual API performance) const queryEmbedStart = performance.now(); const queryEmbedding = await generateEmbeddings(query); if (enableTiming) { timings.queryEmbedding = performance.now() - queryEmbedStart; for (const page of pages) { const pageEmbedding = page.embeddings; if (!pageEmbedding || pageEmbedding.length !== queryEmbedding.length) { continue; // Skip pages without embeddings or wrong dimension } // Initialize cache table only in Val Town (bypass SQLite when running locally)const CACHE_TABLE = "groq_docs_cache_v3"; // Updated table with hash and embeddingsif (IS_VALTOWN) { try { metadata TEXT, contentHash TEXT, embeddings TEXT, cachedAt INTEGER NOT NULL )`); metadata: any | null; contentHash: string | null; embeddings: number[] | null;} | null> => { if (!IS_VALTOWN) { try { const result = await sqlite.execute({ sql: `SELECT content, charCount, tokenCount, frontmatter, metadata, contentHash, embeddings FROM ${CACHE_TABLE} WHERE url = ?`, args: [url] }); metadata: rowObj.metadata ? JSON.parse(rowObj.metadata as string) : null, contentHash: rowObj.contentHash as string | null, embeddings: rowObj.embeddings ? JSON.parse(rowObj.embeddings as string) : null, }; }};export const setCache = async (url: string, data: { content: string; charCount: number; tokenCount: number | null; frontmatter: any; metadata?: any; contentHash?: string; embeddings?: number[] | null }): Promise<string | null> => { if (!IS_VALTOWN) { return "Cache unavailable - running locally"; try { await sqlite.execute({ sql: `INSERT OR REPLACE INTO ${CACHE_TABLE} (url, content, charCount, tokenCount, frontmatter, metadata, contentHash, embeddings, cachedAt) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)`, args: [ url, data.metadata ? JSON.stringify(data.metadata) : null, data.contentHash || null, data.embeddings ? JSON.stringify(data.embeddings) : null, Date.now(), ]// Embeddings and Search utilities// TODO: Replace fake implementations with actual embeddings API and vector search// Generate embeddings for content (fake default implementation)export const generateEmbeddings = async (content: string): Promise<number[] | null> => { // TODO: Replace with actual embeddings API when available // For now, return a fake embedding vector console.debug("Generating fake embeddings (replace with actual API when available)"); // Generate a fake embedding of fixed size (e.g., 384 dimensions) // Using a simple hash-based approach to create deterministic fake embeddings const fakeEmbedding: number[] = []; const hash = content.split('').reduce((acc, char) => acc + char.charCodeAt(0), 0);// Search pages by query (fake implementation using keyword matching)// TODO: Replace with actual vector similarity search using embeddingsexport const searchPages = async ( query: string, content: string; title?: string; embeddings?: number[] | null; metadata?: any; }>, const minScore = options.minScore || 0; // TODO: Use embeddings for semantic search // For now, use simple keyword matching };// Vector similarity search (for future use with embeddings)export const vectorSearch = async ( queryEmbedding: number[], url: string; title?: string; embeddings: number[] | null; }>, options: { for (const page of pages) { if (!page.embeddings) { continue; } const similarity = cosineSimilarity(queryEmbedding, page.embeddings); if (similarity >= minSimilarity) { "slug": "val-vibes", "link": "/blog/val-vibes", "description": "How to build semantic search with embeddings for Val Town within Val Town itself", "pubDate": "Tue, 18 Jun 2024 00:00:00 GMT", "author": "JP Posma", "slug": "val-vibes", "link": "/blog/val-vibes", "description": "How to build semantic search with embeddings for Val Town within Val Town itself", "pubDate": "Tue, 18 Jun 2024 00:00:00 GMT", "author": "JP Posma", "slug": "val-vibes", "link": "/blog/val-vibes", "description": "How to build semantic search with embeddings for Val Town within Val Town itself", "pubDate": "Tue, 18 Jun 2024 00:00:00 GMT", "author": "JP Posma", "slug": "val-vibes", "link": "/blog/val-vibes", "description": "How to build semantic search with embeddings for Val Town within Val Town itself", "pubDate": "Tue, 18 Jun 2024 00:00:00 GMT", "author": "JP Posma", - `imageRecognition.labels`: Visual elements detected (people, objects, logos, etc.) - `vectors`: Text embeddings for semantic similarity (using Basilica method)- **Content metadata fields** (may not yet be generally populated): - `description`: Manual content descriptions - `imageRecognition.labels`: Visual elements detected (people, objects, logos, etc.) - `vectors`: Text embeddings for semantic similarity (using Basilica method)- **Content metadata fields** (may not yet be generally populated): - `description`: Manual content descriptions - `imageRecognition.labels`: Visual elements detected (people, objects, logos, etc.) - `vectors`: Text embeddings for semantic similarity (using Basilica method)- **Content metadata fields** (may not yet be generally populated): - `description`: Manual content descriptions