(async () => {
  try {
    // let embeddings = await modelProvider.gen({
    //   embed: true,
    //   value: JSON.stringify(link)
    // });
    // console.log(`Adding link ${counter}:`, link.title, link.url, embeddings);
    console.log(`Adding link ${counter}:`, link.title, link.url);
    counter++;
    // Fields written for each record (the surrounding create call is elided in the source):
      postedAt: new Date(link.postedAt).toLocaleString('en-US', { timeZone: 'UTC' }),
      data: JSON.stringify(link),
      // embeddings: embeddings.embedding.join(','),
    });
  } finally {
async addDocuments({ documents, fields = 'content', modelName = 'text-embedding-3-large' }) {
  const documentsWithoutEmbeddings = documents?.filter(doc => !doc.embedding) || [];
  const documentsWithEmbeddings = documents?.filter(doc => doc.embedding) || [];

  if (documentsWithoutEmbeddings.length > 0) {
    const contents = documentsWithoutEmbeddings.map(doc => doc.content);
    const embeddings = await this.getEmbeddings(contents, modelName);
    documentsWithoutEmbeddings.forEach((doc, index) => {
      doc.embedding = embeddings[index];
      this.documents.push(doc);
    });
  }

  documentsWithEmbeddings.forEach(doc => {
    this.documents.push(doc);
  });
}

async getEmbeddings(texts, modelName) {
  const { embeddings } = await embedMany({
    model: openai.embedding(modelName),
    values: texts,
  });
  return embeddings;
}
  <th>Name</th>
  <th>Content</th>
  <th>embeddingsContent</th>
    </tr>
  </thead>

  <td>{result.name}</td>
  <td>{result.content}</td>
  <td>{result.embeddingsContent}</td>
    </tr>
  ))}

const defaultNameColumn = "Name";
const defaultContentColumn = "Content";
const defaultEmbeddingColumn = "Embeddings";

async function fetchAirtableData(baseId, tableName, nameColumn, contentColumn, embeddingColumn) {
  // ... (the Airtable select/map call is elided in the source)
    content: record.get(contentColumn),
    embedding: record.get(embeddingColumn).split(",").map(parseFloat),
    embeddingsContent: record.get('EmbeddingsContent'),
  }));
  return documents;
}

const documents = await fetchAirtableData(baseId, tableName, nameColumn, contentColumn, embeddingColumn);
console.log('documents:', documents);
await semanticSearch.addDocuments({ documents, fields: 'embeddingsContent' });
const results = await semanticSearch.search({ query, similarityThreshold });
Demo of adding an Embeddings column to Airtable: it calls the embeddings endpoint at yawnxyz/v/ai, stores the embeddings in an Airtable column, then performs search against them.

Usage:
const defaultTableName = "ExampleItems";
const defaultContentColumn = "Content";
const defaultEmbeddingColumn = "Embeddings";

async function fetchAirtableData(baseId, tableName, contentColumn, embeddingColumn) {
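The code that initially populates the Embeddings column isn't fully shown in this excerpt, so the following is only a minimal sketch of that backfill step. It assumes the official airtable npm client and the same embedMany call used elsewhere in this document; the baseId placeholder and the AIRTABLE_API_KEY environment variable name are assumptions, while the comma-joined storage format matches the join(',') / split(',') handling shown in the other snippets.

```ts
// Sketch only: backfill the Embeddings column for rows that don't have one yet.
import Airtable from "npm:airtable";
import { embedMany } from "npm:ai";
import { openai } from "npm:@ai-sdk/openai";

const baseId = "appXXXXXXXXXXXXXX"; // assumption: replace with your Airtable base id
const base = new Airtable({ apiKey: Deno.env.get("AIRTABLE_API_KEY") }).base(baseId);

const records = await base(defaultTableName).select().all();
const missing = records.filter((r) => !r.get(defaultEmbeddingColumn));

if (missing.length > 0) {
  const { embeddings } = await embedMany({
    model: openai.embedding("text-embedding-3-small"),
    values: missing.map((r) => String(r.get(defaultContentColumn) ?? "")),
  });
  // Store each vector as a comma-joined string so it can be parsed back with split(",").
  for (let i = 0; i < missing.length; i++) {
    await base(defaultTableName).update(missing[i].id, {
      [defaultEmbeddingColumn]: embeddings[i].join(","),
    });
  }
}
```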
Use embeddings / Lunr search on Airtable. Embeddings need to have been generated / stored on Airtable beforehand, or this gets very slow / costly.

- Simple usage: https://yawnxyz-buildclubprojectsearch.web.val.run/search?query=cars
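For example, the hosted endpoint can be queried directly over HTTP; the exact shape of the JSON response is an assumption here:

```ts
// Query the hosted search val; the response shape (an array of matches) is an assumption.
const res = await fetch(
  "https://yawnxyz-buildclubprojectsearch.web.val.run/search?query=" + encodeURIComponent("cars"),
);
console.log(await res.json());
```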
import lunr from "https://cdn.skypack.dev/lunr";

// Step 1: Get Embeddings

// Function to get a single embedding
async function getEmbedding(text) {}

// Function to get embeddings for multiple texts
async function getEmbeddings(texts) {
  console.log(`Getting embeddings for texts: ${texts}`);
  const { embeddings } = await embedMany({
    model: openai.embedding('text-embedding-3-small'),
    values: texts,
  });
  console.log(`Embeddings: ${embeddings}`);
  return embeddings;
}

// Step 2: Store Embeddings with Documents
const documents = [
  { id: 1, content: 'cats dogs' },
];

async function prepareDocumentsWithEmbeddings() {
  const contents = documents.map(doc => doc.content);
  const embeddings = await getEmbeddings(contents);
  documents.forEach((doc, index) => {
    doc.embedding = embeddings[index];
  });
  // console.log('Documents with embeddings:', documents);
}

await prepareDocumentsWithEmbeddings();

// Step 3: Nearest Neighbor Search
  if (nearestDocs.length > 0 && cosineSimilarity(nearestDocs[0].embedding, queryEmbedding) >= similarityThreshold) {
    // Remove the embedding field from the search results
    const resultsWithoutEmbeddings = nearestDocs.map(doc => {
      const { embedding, ...rest } = doc;
      return rest;
    });
    console.log('Cosine similarity results:', resultsWithoutEmbeddings);
    return resultsWithoutEmbeddings;
  } else {
    const results = idx.search(query);
    // Remove the embedding field from the search results
    const resultsWithoutEmbeddings = results.map(result => {
      const doc = documents.find(doc => doc.id.toString() === result.ref);
      const { embedding, ...rest } = doc;
      return rest;
    });
    console.log('Lunr search results:', resultsWithoutEmbeddings);
    return resultsWithoutEmbeddings;
  }
}
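The snippet above calls a cosineSimilarity helper that isn't shown in this excerpt; a minimal sketch of such a helper (dot product divided by the product of vector magnitudes) could look like this:

```ts
// Cosine similarity between two equal-length numeric vectors.
// Returns 0 when either vector has zero magnitude, to avoid dividing by zero.
function cosineSimilarity(a: number[], b: number[]): number {
  let dot = 0, normA = 0, normB = 0;
  for (let i = 0; i < a.length; i++) {
    dot += a[i] * b[i];
    normA += a[i] * a[i];
    normB += b[i] * b[i];
  }
  const denom = Math.sqrt(normA) * Math.sqrt(normB);
  return denom === 0 ? 0 : dot / denom;
}
```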
In-memory semantic search; load it up with valtown KV. This is a "dumb" version of vector search for prototyping RAG responses and UIs: it supports both regular keyword search (with Lunr) and vector search (OpenAI embeddings + cosine similarity).

Usage:
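A minimal usage sketch, assuming a SemanticSearch class that exposes the addDocuments and search methods shown elsewhere in this document (the class name and import path are assumptions):

```ts
// Hypothetical import path; adjust to wherever the class actually lives.
import { SemanticSearch } from "./semanticSearch.ts";

const semanticSearch = new SemanticSearch();

// Documents without an `embedding` field are embedded on insert (see addDocuments above).
await semanticSearch.addDocuments({
  documents: [
    { id: 1, content: "cats dogs" },
    { id: 2, content: "sports cars" },
  ],
});

// Falls back to Lunr keyword search when the best cosine score is below the threshold.
const results = await semanticSearch.search({ query: "cars", similarityThreshold: 0.5 });
console.log(results);
```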
This is an example of in-memory search, using a combination of lunr, OpenAI embeddings, and cosine similarity.

Migrated from folder: Libraries/SemanticSearch/embeddingsSearchExample
export default async function semanticSearchPublicVals(query) {
  const allValsBlobEmbeddingsMeta = (await blob.getJSON(`allValsBlob${dimensions}EmbeddingsMeta`)) ?? {};
  const allBatchDataIndexes = _.uniq(Object.values(allValsBlobEmbeddingsMeta).map((item: any) => item.batchDataIndex));

  const embeddingsBatches = [];
  const allBatchDataIndexesPromises = [];
  for (const batchDataIndex of allBatchDataIndexes) {
    const embeddingsBatchBlobName = `allValsBlob${dimensions}EmbeddingsData_${batchDataIndex}`;
    const promise = blob.get(embeddingsBatchBlobName).then((response) => response.arrayBuffer());
    promise.then((data) => {
      embeddingsBatches[batchDataIndex as any] = data;
      console.log(`Loaded ${embeddingsBatchBlobName} (${data.byteLength} bytes)`);
    });
    allBatchDataIndexesPromises.push(promise);
  }

  const openai = new OpenAI();
  const queryEmbedding = (await openai.embeddings.create({
    model: "text-embedding-3-small",
    input: query,
    // ... (rest of the call elided in the source)

  const res = [];
  for (const id in allValsBlobEmbeddingsMeta) {
    const meta = allValsBlobEmbeddingsMeta[id];
    const embedding = new Float32Array(
      embeddingsBatches[meta.batchDataIndex],
      dimensions * 4 * meta.valIndex,
      dimensions,
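The function above is cut off in this excerpt. As a rough illustration (not the original code), the scoring step could compare each stored Float32Array slice against the query embedding and rank by cosine similarity; the helper name and result shape below are assumptions.

```ts
// Illustrative sketch only: rank stored embeddings by cosine similarity to the query
// embedding, reading each vector as a Float32Array view into its batch buffer.
function rankBySimilarity(
  meta: Record<string, { batchDataIndex: number; valIndex: number }>,
  batches: ArrayBuffer[],
  queryEmbedding: number[],
  dimensions: number,
) {
  const scored: { id: string; similarity: number }[] = [];
  for (const id in meta) {
    const m = meta[id];
    const v = new Float32Array(batches[m.batchDataIndex], dimensions * 4 * m.valIndex, dimensions);
    let dot = 0, normA = 0, normB = 0;
    for (let i = 0; i < dimensions; i++) {
      dot += v[i] * queryEmbedding[i];
      normA += v[i] * v[i];
      normB += queryEmbedding[i] * queryEmbedding[i];
    }
    const denom = Math.sqrt(normA) * Math.sqrt(normB);
    scored.push({ id, similarity: denom === 0 ? 0 : dot / denom });
  }
  // Highest similarity first.
  return scored.sort((a, b) => b.similarity - a.similarity);
}
```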