import numpy as np
import pandas as pd
from GoogleEmbeddings import Embeddings
from tinydb import TinyDB, Query
from langchain_core.documents import Document
from langchain_core.runnables import RunnableLambda


class TinyDBRetriever():
    """Retrieve question/answer records from a TinyDB file by cosine similarity.

    Every record in the ``_default`` table is read with the keys
    ``'question'``, ``'answer'`` and ``'question-embedded'``. A query is
    embedded through ``Embeddings`` and compared with each stored
    ``'question-embedded'`` vector; the ``k`` best matches are returned as
    ``Document`` objects.
    """

    def __init__(self, tinydb_filepath: str, google_api_key: str, k: int):
        """Store the configuration; no file or network access happens here.

        Args:
            tinydb_filepath: Path handed to ``TinyDB`` on every lookup.
            google_api_key: Key handed to ``Embeddings`` on every query.
            k: Maximum number of documents returned per query.
        """
        self.tinydb_filepath = tinydb_filepath
        self.google_api_key = google_api_key
        self.k = k

    def embedQuery(self, query: str):
        """Return ``Embeddings(api_key=...).embed_query(query)``.

        NOTE(review): the result is used as a numeric vector by
        ``getSimilarityScores``; presumably a ``list[float]`` -- confirm
        against the ``GoogleEmbeddings`` module.
        """
        embeddings = Embeddings(api_key=self.google_api_key)
        return embeddings.embed_query(query)

    def getVecSearch(self) -> list[tuple[int, list[float]]]:
        """Return ``(doc_id, embedding)`` for every record in ``_default``.

        Raises:
            KeyError: If a record has no ``'question-embedded'`` key.
        """
        # The context manager closes the database even when a record is
        # malformed and the comprehension raises.
        with TinyDB(self.tinydb_filepath) as db:
            table = db.table('_default')
            return [(t.doc_id, t['question-embedded']) for t in table.all()]

    def getSimilarityScores(
        self,
        query: list[float],
        keys: list[tuple[int, list[float]]],
    ) -> pd.DataFrame:
        """Score every stored vector against ``query`` by cosine similarity.

        Args:
            query: The embedded query vector.
            keys: ``(doc_id, embedding)`` pairs as returned by
                ``getVecSearch``.

        Returns:
            A DataFrame indexed by document id (index name ``'doc_id'``) with
            one column ``'score'``, sorted from best to worst match. An empty
            ``keys`` yields an empty frame with the same layout. A pair whose
            norm product is zero gets a NaN score, which sorts last.
        """
        query_vec = np.asarray(query, dtype=float)
        # Loop-invariant: the query's squared norm is the same for every key.
        query_sq_norm = np.dot(query_vec, query_vec)

        doc_ids = []
        scores = []
        for doc_id, embedding in keys:
            vec = np.asarray(embedding, dtype=float)
            denom = np.sqrt(query_sq_norm * np.dot(vec, vec))
            # Cosine similarity is undefined for a zero vector; emit NaN
            # explicitly rather than relying on a divide-by-zero warning.
            score = np.dot(query_vec, vec) / denom if denom else np.nan
            doc_ids.append(doc_id)
            scores.append(score)

        frame = pd.DataFrame(
            {'score': np.asarray(scores, dtype=float)},
            index=pd.Index(doc_ids, name='doc_id'),
        )
        return frame.sort_values(by='score', ascending=False)

    def _get_relevant_documents(self, query: str) -> list[Document]:
        """Return up to ``k`` documents whose stored question best matches.

        Each ``Document`` carries the record's ``'answer'`` as
        ``page_content`` and its ``'question'`` under
        ``metadata["question"]``. Results are ordered best match first.
        """
        embedded_query = self.embedQuery(query)
        vecsearch = self.getVecSearch()
        top_scores = self.getSimilarityScores(embedded_query, vecsearch)[:self.k]

        docs = []
        with TinyDB(self.tinydb_filepath) as db:
            table = db.table('_default')
            for doc_id in top_scores.index:
                # Fetch each record once; int() strips the numpy integer
                # type the pandas index hands back.
                record = table.get(doc_id=int(doc_id))
                if record is None:
                    # Record disappeared between the scan and this fetch.
                    continue
                docs.append(Document(
                    page_content=record['answer'],
                    metadata={"question": record['question']},
                ))
        return docs

    def as_retriever(self):
        """Wrap ``_get_relevant_documents`` in a ``RunnableLambda``."""
        return RunnableLambda(self._get_relevant_documents)