From 3cd3c5e49a60764aa5c1d1bcd99cb6ca1651dc92 Mon Sep 17 00:00:00 2001
From: ishaansehgal99
Date: Wed, 23 Oct 2024 14:32:02 -0700
Subject: [PATCH 01/10] feat: Abstract Vector Store

---
 ragengine/vector_store/base.py           | 134 +++++++++++++++--
 ragengine/vector_store/chromadb_store.py |  16 ++
 ragengine/vector_store/faiss_store.py    | 180 +----------------------
 3 files changed, 144 insertions(+), 186 deletions(-)
 create mode 100644 ragengine/vector_store/chromadb_store.py

diff --git a/ragengine/vector_store/base.py b/ragengine/vector_store/base.py
index bf3be9624..3c94c752a 100644
--- a/ragengine/vector_store/base.py
+++ b/ragengine/vector_store/base.py
@@ -1,31 +1,143 @@
 from abc import ABC, abstractmethod
 from typing import Dict, List
-
-from ragengine.models import Document
 import hashlib
+import os
+
+from llama_index.core import Document as LlamaDocument
+from llama_index.core.storage.index_store import SimpleIndexStore
+from llama_index.core import (StorageContext, VectorStoreIndex)
+from ragengine.models import Document
+from ragengine.embedding.base import BaseEmbeddingModel
+from ragengine.inference.inference import Inference
+from ragengine.config import PERSIST_DIR
 
 class BaseVectorStore(ABC):
+    def __init__(self, embedding_manager: BaseEmbeddingModel):
+        self.embedding_manager = embedding_manager
+        self.embed_model = self.embedding_manager.model
+        self.index_map = {}
+        self.index_store = SimpleIndexStore()
+        self.llm = Inference()
+
+    @staticmethod
     def generate_doc_id(text: str) -> str:
         """Generates a unique document ID based on the hash of the document text."""
         return hashlib.sha256(text.encode('utf-8')).hexdigest()
 
-    @abstractmethod
     def index_documents(self, index_name: str, documents: List[Document]) -> List[str]:
-        pass
+        """Common indexing logic for all vector stores."""
+        if index_name in self.index_map:
+            return self._append_documents_to_index(index_name, documents)
+        else:
+            return self._create_new_index(index_name, documents)
+
+    def _append_documents_to_index(self, index_name: str, documents: List[Document]) -> List[str]:
+        """Common logic for appending documents to existing index."""
+        print(f"Index {index_name} already exists. Appending documents to existing index.")
+        indexed_doc_ids = set()
+
+        for doc in documents:
+            doc_id = self.generate_doc_id(doc.text)
+            if not self.document_exists(index_name, doc_id):
+                self.add_document_to_index(index_name, doc, doc_id)
+                indexed_doc_ids.add(doc_id)
+            else:
+                print(f"Document {doc_id} already exists in index {index_name}. Skipping.")
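+
+        # Persist only when at least one new document was actually added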
Skipping.") + if indexed_doc_ids: + self._persist(index_name) + return list(indexed_doc_ids) + @abstractmethod - def query(self, index_name: str, query: str, top_k: int, params: dict): + def _create_new_index(self, index_name: str, documents: List[Document]) -> List[str]: + """Create a new index - implementation specific to each vector store.""" pass + + def _create_index_common(self, index_name: str, documents: List[Document], vector_store) -> List[str]: + """Common logic for creating a new index with documents.""" + storage_context = StorageContext.from_defaults(vector_store=vector_store) + llama_docs = [] + indexed_doc_ids = set() + + for doc in documents: + doc_id = self.generate_doc_id(doc.text) + llama_doc = LlamaDocument(id_=doc_id, text=doc.text, metadata=doc.metadata) + llama_docs.append(llama_doc) + indexed_doc_ids.add(doc_id) + + if llama_docs: + index = VectorStoreIndex.from_documents( + llama_docs, + storage_context=storage_context, + embed_model=self.embed_model, + ) + index.set_index_id(index_name) + self.index_map[index_name] = index + self.index_store.add_index_struct(index.index_struct) + self._persist(index_name) + return list(indexed_doc_ids) + + def query(self, index_name: str, query: str, top_k: int, llm_params: dict): + """Common query logic for all vector stores.""" + if index_name not in self.index_map: + raise ValueError(f"No such index: '{index_name}' exists.") + self.llm.set_params(llm_params) + + query_engine = self.index_map[index_name].as_query_engine( + llm=self.llm, + similarity_top_k=top_k + ) + query_result = query_engine.query(query) + return { + "response": query_result.response, + "source_nodes": [ + { + "node_id": node.node_id, + "text": node.text, + "score": node.score, + "metadata": node.metadata + } + for node in query_result.source_nodes + ], + "metadata": query_result.metadata, + } - @abstractmethod def add_document_to_index(self, index_name: str, document: Document, doc_id: str): - pass + """Common logic for adding a single document.""" + if index_name not in self.index_map: + raise ValueError(f"No such index: '{index_name}' exists.") + llama_doc = LlamaDocument(text=document.text, metadata=document.metadata, id_=doc_id) + self.index_map[index_name].insert(llama_doc) - @abstractmethod def list_all_indexed_documents(self) -> Dict[str, Dict[str, Dict[str, str]]]: - pass + """Common logic for listing all documents.""" + return { + index_name: { + doc_info.ref_doc_id: { + "text": doc_info.text, + "hash": doc_info.hash + } for doc_name, doc_info in vector_store_index.docstore.docs.items() + } + for index_name, vector_store_index in self.index_map.items() + } - @abstractmethod def document_exists(self, index_name: str, doc_id: str) -> bool: - pass + """Common logic for checking document existence.""" + if index_name not in self.index_map: + print(f"No such index: '{index_name}' exists in vector store.") + return False + return doc_id in self.index_map[index_name].ref_doc_info + + def _persist_all(self): + """Common persistence logic.""" + self.index_store.persist(os.path.join(PERSIST_DIR, "store.json")) + for idx in self.index_store.index_structs(): + self._persist(idx.index_id) + + def _persist(self, index_name: str): + """Common persistence logic for individual index.""" + self.index_store.persist(os.path.join(PERSIST_DIR, "store.json")) + assert index_name in self.index_map, f"No such index: '{index_name}' exists." 
+        storage_context = self.index_map[index_name].storage_context
+        storage_context.persist(persist_dir=os.path.join(PERSIST_DIR, index_name))
diff --git a/ragengine/vector_store/chromadb_store.py b/ragengine/vector_store/chromadb_store.py
new file mode 100644
index 000000000..745054c2e
--- /dev/null
+++ b/ragengine/vector_store/chromadb_store.py
@@ -0,0 +1,16 @@
+from typing import List
+from ragengine.models import Document
+
+import chromadb
+from llama_index.vector_stores.chroma import ChromaVectorStore
+from .base import BaseVectorStore
+
+class ChromaDBVectorStoreHandler(BaseVectorStore):
+    def __init__(self, embedding_manager):
+        super().__init__(embedding_manager)
+        self.chroma_client = chromadb.EphemeralClient()
+
+    def _create_new_index(self, index_name: str, documents: List[Document]) -> List[str]:
+        chroma_collection = self.chroma_client.create_collection(index_name)
+        vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
+        return self._create_index_common(index_name, documents, vector_store)
diff --git a/ragengine/vector_store/faiss_store.py b/ragengine/vector_store/faiss_store.py
index ddd5b670d..9c4485c44 100644
--- a/ragengine/vector_store/faiss_store.py
+++ b/ragengine/vector_store/faiss_store.py
@@ -1,186 +1,16 @@
-import os
-from typing import Dict, List
+from typing import List
+from ragengine.models import Document
 
 import faiss
-from llama_index.core import Document as LlamaDocument
-from llama_index.core import (StorageContext, VectorStoreIndex)
-from llama_index.core.storage.index_store import SimpleIndexStore
 from llama_index.vector_stores.faiss import FaissVectorStore
-
-from ragengine.models import Document
-from ragengine.inference.inference import Inference
-
-from ragengine.config import PERSIST_DIR
-
 from .base import BaseVectorStore
-from ragengine.embedding.base import BaseEmbeddingModel
-
 
 class FaissVectorStoreHandler(BaseVectorStore):
-    def __init__(self, embedding_manager: BaseEmbeddingModel):
-        self.embedding_manager = embedding_manager
-        self.embed_model = self.embedding_manager.model
+    def __init__(self, embedding_manager):
+        super().__init__(embedding_manager)
         self.dimension = self.embedding_manager.get_embedding_dimension()
-        # TODO: Consider allowing user custom indexing method (would require configmap?) e.g.
-        """
-        # Choose the FAISS index type based on the provided index_method
-        if index_method == 'FlatL2':
-            faiss_index = faiss.IndexFlatL2(self.dimension)  # L2 (Euclidean distance) index
-        elif index_method == 'FlatIP':
-            faiss_index = faiss.IndexFlatIP(self.dimension)  # Inner product (cosine similarity) index
-        elif index_method == 'IVFFlat':
-            quantizer = faiss.IndexFlatL2(self.dimension)  # Quantizer for IVF
-            faiss_index = faiss.IndexIVFFlat(quantizer, self.dimension, 100)  # IVF with flat quantization
-        elif index_method == 'HNSW':
-            faiss_index = faiss.IndexHNSWFlat(self.dimension, 32)  # HNSW index with 32 neighbors
-        else:
-            raise ValueError(f"Unknown index method: {index_method}")
-        """
-        self.index_map = {}  # Used to store the in-memory index via namespace (e.g. index_name -> VectorStoreIndex)
-        self.index_store = SimpleIndexStore()  # Use to store global index metadata
-        self.llm = Inference()
-
-    def index_documents(self, index_name: str, documents: List[Document]) -> List[str]:
-        """
-        Called by the /index endpoint to index documents into the specified index.
-
-        If the index already exists, appends new documents to it.
-        Otherwise, creates a new index with the provided documents.
-
-        Args:
-            index_name (str): The name of the index to update or create.
-            documents (List[Document]): A list of documents to index.
-
-        Returns:
-            List[str]: A list of document IDs that were successfully indexed.
-        """
-        if index_name in self.index_map:
-            return self._append_documents_to_index(index_name, documents)
-        else:
-            return self._create_new_index(index_name, documents)
-
-    def _append_documents_to_index(self, index_name: str, documents: List[Document]) -> List[str]:
-        """
-        Appends documents to an existing index.
-
-        Args:
-            index_name (str): The name of the existing index.
-            documents (List[Document]): A list of documents to append.
-
-        Returns:
-            List[str]: A list of document IDs that were successfully indexed.
-        """
-        print(f"Index {index_name} already exists. Appending documents to existing index.")
-        indexed_doc_ids = set()
-
-        for doc in documents:
-            doc_id = BaseVectorStore.generate_doc_id(doc.text)
-            if not self.document_exists(index_name, doc_id):
-                self.add_document_to_index(index_name, doc, doc_id)
-                indexed_doc_ids.add(doc_id)
-            else:
-                print(f"Document {doc_id} already exists in index {index_name}. Skipping.")
-
-        if indexed_doc_ids:
-            self._persist(index_name)
-        return list(indexed_doc_ids)
 
     def _create_new_index(self, index_name: str, documents: List[Document]) -> List[str]:
-        """
-        Creates a new index with the provided documents.
-
-        Args:
-            index_name (str): The name of the new index to create.
-            documents (List[Document]): A list of documents to index.
-
-        Returns:
-            List[str]: A list of document IDs that were successfully indexed.
-        """
         faiss_index = faiss.IndexFlatL2(self.dimension)
         vector_store = FaissVectorStore(faiss_index=faiss_index)
-        storage_context = StorageContext.from_defaults(vector_store=vector_store)
-
-        llama_docs = []
-        indexed_doc_ids = set()
-
-        for doc in documents:
-            doc_id = BaseVectorStore.generate_doc_id(doc.text)
-            llama_doc = LlamaDocument(id_=doc_id, text=doc.text, metadata=doc.metadata)
-            llama_docs.append(llama_doc)
-            indexed_doc_ids.add(doc_id)
-
-        if llama_docs:
-            index = VectorStoreIndex.from_documents(
-                llama_docs,
-                storage_context=storage_context,
-                embed_model=self.embed_model,
-                # use_async=True  # TODO: Indexing Process Performed Async
-            )
-            index.set_index_id(index_name)
-            self.index_map[index_name] = index
-            self.index_store.add_index_struct(index.index_struct)
-            self._persist(index_name)
-        return list(indexed_doc_ids)
-
-    def add_document_to_index(self, index_name: str, document: Document, doc_id: str):
-        """Inserts a single document into the existing FAISS index."""
-        if index_name not in self.index_map:
-            raise ValueError(f"No such index: '{index_name}' exists.")
-        llama_doc = LlamaDocument(text=document.text, metadata=document.metadata, id_=doc_id)
-        self.index_map[index_name].insert(llama_doc)
-
-    def query(self, index_name: str, query: str, top_k: int, llm_params: dict):
-        """Queries the FAISS vector store."""
-        if index_name not in self.index_map:
-            raise ValueError(f"No such index: '{index_name}' exists.")
-        self.llm.set_params(llm_params)
-
-        query_engine = self.index_map[index_name].as_query_engine(llm=self.llm, similarity_top_k=top_k)
-        query_result = query_engine.query(query)
-        return {
-            "response": query_result.response,
-            "source_nodes": [
-                {
-                    "node_id": node.node_id,
-                    "text": node.text,
-                    "score": node.score,
-                    "metadata": node.metadata
-                }
-                for node in query_result.source_nodes
-            ],
-            "metadata": query_result.metadata,
-        }
-
-    def list_all_indexed_documents(self) -> Dict[str, Dict[str, Dict[str, str]]]:
- """Lists all documents in the vector store.""" - return { - index_name: { - doc_info.ref_doc_id: { - "text": doc_info.text, "hash": doc_info.hash - } for doc_name, doc_info in vector_store_index.docstore.docs.items() - } - for index_name, vector_store_index in self.index_map.items() - } - - def document_exists(self, index_name: str, doc_id: str) -> bool: - """Checks if a document exists in the vector store.""" - if index_name not in self.index_map: - print(f"No such index: '{index_name}' exists in vector store.") - return False - return doc_id in self.index_map[index_name].ref_doc_info - - def _persist_all(self): - self.index_store.persist(os.path.join(PERSIST_DIR, "store.json")) # Persist global index store - for idx in self.index_store.index_structs(): - self._persist(idx.index_id) - - def _persist(self, index_name: str): - """Saves the existing FAISS index to disk.""" - self.index_store.persist(os.path.join(PERSIST_DIR, "store.json")) # Persist global index store - assert index_name in self.index_map, f"No such index: '{index_name}' exists." - - # Persist each index's storage context separately - storage_context = self.index_map[index_name].storage_context - storage_context.persist( - persist_dir=os.path.join(PERSIST_DIR, index_name) - ) + return self._create_index_common(index_name, documents, vector_store) From d5df60be613f1efdf1544a126f3722d6cf2d8284 Mon Sep 17 00:00:00 2001 From: ishaansehgal99 Date: Thu, 24 Oct 2024 14:32:42 -0700 Subject: [PATCH 02/10] feat: mongodb --- ragengine/vector_store/azuremongodb_store.py | 24 ++++++++++++++++++++ 1 file changed, 24 insertions(+) create mode 100644 ragengine/vector_store/azuremongodb_store.py diff --git a/ragengine/vector_store/azuremongodb_store.py b/ragengine/vector_store/azuremongodb_store.py new file mode 100644 index 000000000..3754ef2d3 --- /dev/null +++ b/ragengine/vector_store/azuremongodb_store.py @@ -0,0 +1,24 @@ +from typing import List +import os +from ragengine.models import Document + +import pymongo +from llama_index.vector_stores.azurecosmosmongo import ( + AzureCosmosDBMongoDBVectorSearch, +) + +from .base import BaseVectorStore + +class AzureCosmosDBMongoDBVectorStoreHandler(BaseVectorStore): + def __init__(self, embedding_manager): + super().__init__(embedding_manager) + self.connection_string = os.environ.get("AZURE_COSMOSDB_MONGODB_URI") + self.mongodb_client = pymongo.MongoClient(self.connection_string) + + def _create_new_index(self, index_name: str, documents: List[Document]) -> List[str]: + vector_store = AzureCosmosDBMongoDBVectorSearch( + mongodb_client=self.mongodb_client, + db_name="kaito_ragengine", + collection_name=index_name, + ) + return self._create_index_common(index_name, documents, vector_store) From 590977782d84116f6fdbe2a26cb47255ba9fc342 Mon Sep 17 00:00:00 2001 From: ishaansehgal99 Date: Thu, 24 Oct 2024 14:46:41 -0700 Subject: [PATCH 03/10] feat: add logger --- ragengine/vector_store/base.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/ragengine/vector_store/base.py b/ragengine/vector_store/base.py index 3c94c752a..ed7e69259 100644 --- a/ragengine/vector_store/base.py +++ b/ragengine/vector_store/base.py @@ -1,3 +1,4 @@ +import logging from abc import ABC, abstractmethod from typing import Dict, List import hashlib @@ -12,6 +13,10 @@ from ragengine.inference.inference import Inference from ragengine.config import PERSIST_DIR +# Configure logging +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + class BaseVectorStore(ABC): def 
         self.embedding_manager = embedding_manager
@@ -34,7 +39,7 @@ def index_documents(self, index_name: str, documents: List[st
 
     def _append_documents_to_index(self, index_name: str, documents: List[Document]) -> List[str]:
         """Common logic for appending documents to existing index."""
-        print(f"Index {index_name} already exists. Appending documents to existing index.")
+        logger.info(f"Index {index_name} already exists. Appending documents to existing index.")
         indexed_doc_ids = set()
 
         for doc in documents:
@@ -43,7 +48,7 @@ def _append_documents_to_index(self, index_name: str, documents: List[Document])
             self.add_document_to_index(index_name, doc, doc_id)
             indexed_doc_ids.add(doc_id)
         else:
-            print(f"Document {doc_id} already exists in index {index_name}. Skipping.")
+            logger.info(f"Document {doc_id} already exists in index {index_name}. Skipping.")
 
         if indexed_doc_ids:
             self._persist(index_name)
@@ -117,7 +122,7 @@ def list_all_indexed_documents(self) -> Dict[str, Dict[str, Dict[str, str]]]:
             doc_info.ref_doc_id: {
                 "text": doc_info.text,
                 "hash": doc_info.hash
-            } for doc_name, doc_info in vector_store_index.docstore.docs.items()
+            } for _, doc_info in vector_store_index.docstore.docs.items()
             }
             for index_name, vector_store_index in self.index_map.items()
         }
@@ -125,18 +130,20 @@ def list_all_indexed_documents(self) -> Dict[str, Dict[str, Dict[str, str]]]:
 
     def document_exists(self, index_name: str, doc_id: str) -> bool:
         """Common logic for checking document existence."""
         if index_name not in self.index_map:
-            print(f"No such index: '{index_name}' exists in vector store.")
+            logger.warning(f"No such index: '{index_name}' exists in vector store.")
             return False
         return doc_id in self.index_map[index_name].ref_doc_info
 
     def _persist_all(self):
         """Common persistence logic."""
+        logger.info("Persisting all indexes.")
         self.index_store.persist(os.path.join(PERSIST_DIR, "store.json"))
         for idx in self.index_store.index_structs():
             self._persist(idx.index_id)
 
     def _persist(self, index_name: str):
         """Common persistence logic for individual index."""
+        logger.info(f"Persisting index {index_name}.")
         self.index_store.persist(os.path.join(PERSIST_DIR, "store.json"))
         assert index_name in self.index_map, f"No such index: '{index_name}' exists."
         storage_context = self.index_map[index_name].storage_context
         storage_context.persist(persist_dir=os.path.join(PERSIST_DIR, index_name))

From 2e1b573d1b073c5285a90b504a54a5a3f80d40e9 Mon Sep 17 00:00:00 2001
From: ishaansehgal99
Date: Thu, 24 Oct 2024 14:51:30 -0700
Subject: [PATCH 04/10] feat: Add persist

---
 ragengine/vector_store/base.py | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/ragengine/vector_store/base.py b/ragengine/vector_store/base.py
index ed7e69259..5b2f89044 100644
--- a/ragengine/vector_store/base.py
+++ b/ragengine/vector_store/base.py
@@ -143,8 +143,13 @@ def _persist_all(self):
 
     def _persist(self, index_name: str):
         """Common persistence logic for individual index."""
-        logger.info(f"Persisting index {index_name}.")
-        self.index_store.persist(os.path.join(PERSIST_DIR, "store.json"))
-        assert index_name in self.index_map, f"No such index: '{index_name}' exists."
-        storage_context = self.index_map[index_name].storage_context
-        storage_context.persist(persist_dir=os.path.join(PERSIST_DIR, index_name))
+        try:
+            logger.info(f"Persisting index {index_name}.")
+            self.index_store.persist(os.path.join(PERSIST_DIR, "store.json"))
+            assert index_name in self.index_map, f"No such index: '{index_name}' exists."
+            storage_context = self.index_map[index_name].storage_context
+            # Persist the specific index
+            storage_context.persist(persist_dir=os.path.join(PERSIST_DIR, index_name))
+            logger.info(f"Successfully persisted index {index_name}.")
+        except Exception as e:
+            logger.error(f"Failed to persist index {index_name}. Error: {str(e)}")

From e8e2ceabee844af8b7357a07835b75902135cc18 Mon Sep 17 00:00:00 2001
From: ishaansehgal99
Date: Fri, 1 Nov 2024 20:57:55 -0700
Subject: [PATCH 05/10] feat: new updates tests, dependencies, and
 azuremongodb implementation

---
 ragengine/requirements.txt                       |   1 +
 .../vector_store/test_azuremongodb_store.py      | 123 ++++++++++++++++++
 ragengine/tests/vector_store/test_faiss_store.py |   2 +-
 ragengine/vector_store/azuremongodb_store.py     |  76 ++++++++++-
 ragengine/vector_store/base.py                   |   4 +-
 5 files changed, 198 insertions(+), 8 deletions(-)
 create mode 100644 ragengine/tests/vector_store/test_azuremongodb_store.py

diff --git a/ragengine/requirements.txt b/ragengine/requirements.txt
index 4a324766c..c91a95430 100644
--- a/ragengine/requirements.txt
+++ b/ragengine/requirements.txt
@@ -10,6 +10,7 @@ llama-index-llms-huggingface-api
 fastapi
 faiss-cpu
 llama-index-vector-stores-faiss
+llama-index-vector-stores-azurecosmosmongo
 uvicorn
 # For UTs
 pytest
\ No newline at end of file
diff --git a/ragengine/tests/vector_store/test_azuremongodb_store.py b/ragengine/tests/vector_store/test_azuremongodb_store.py
new file mode 100644
index 000000000..85ac6eace
--- /dev/null
+++ b/ragengine/tests/vector_store/test_azuremongodb_store.py
@@ -0,0 +1,123 @@
+import os
+from tempfile import TemporaryDirectory
+from unittest.mock import patch
+
+import pytest
+
+from ragengine.vector_store.base import BaseVectorStore
+from ragengine.vector_store.azuremongodb_store import AzureCosmosDBMongoDBVectorStoreHandler
+from ragengine.models import Document
+from ragengine.embedding.huggingface_local import LocalHuggingFaceEmbedding
+from ragengine.config import MODEL_ID, INFERENCE_URL, INFERENCE_ACCESS_SECRET
+from ragengine.config import PERSIST_DIR
+
+@pytest.fixture(scope='session')
+def init_embed_manager():
+    return LocalHuggingFaceEmbedding(MODEL_ID)
+
+@pytest.fixture
+def vector_store_manager(init_embed_manager):
+    with TemporaryDirectory() as temp_dir:
+        print(f"Saving temporary test storage at: {temp_dir}")
+        # Mock the persistence directory
+        os.environ['AZURE_COSMOSDB_MONGODB_URI'] = ""
+        manager = AzureCosmosDBMongoDBVectorStoreHandler(init_embed_manager)
+        manager._clear_collection_and_indexes()
+        yield manager
+
+def test_index_documents(vector_store_manager):
+    first_doc_text, second_doc_text = "First document", "Second document"
+    documents = [
+        Document(text=first_doc_text, metadata={"type": "text"}),
+        Document(text=second_doc_text, metadata={"type": "text"})
+    ]
+
+    doc_ids = vector_store_manager.index_documents("test_index", documents)
+
+    assert len(doc_ids) == 2
+    assert set(doc_ids) == {BaseVectorStore.generate_doc_id(first_doc_text),
+                            BaseVectorStore.generate_doc_id(second_doc_text)}
+
+def test_index_documents_isolation(vector_store_manager):
+    documents1 = [
+        Document(text="First document in index1", metadata={"type": "text"}),
+    ]
+    documents2 = [
+        Document(text="First document in index2", metadata={"type": "text"}),
+    ]
+
+    # Index documents in separate indices
+    index_name_1, index_name_2 = "index1", "index2"
+    vector_store_manager.index_documents(index_name_1, documents1)
+    vector_store_manager.index_documents(index_name_2, documents2)
+
+    indexed_docs = vector_store_manager.list_all_indexed_documents()
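+    # Both indexes should be listed, each containing only its own document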
+    assert len(indexed_docs) == 2
+    assert list(indexed_docs[index_name_1].values())[0]["text"] == "First document in index1"
+    assert list(indexed_docs[index_name_1].values())[0]["content_vector"] == "Vector of dimension 384"
+    assert list(indexed_docs[index_name_2].values())[0]["text"] == "First document in index2"
+    assert list(indexed_docs[index_name_2].values())[0]["content_vector"] == "Vector of dimension 384"
+
+@patch('requests.post')
+def test_query_documents(mock_post, vector_store_manager):
+    # Define Mock Response for Custom Inference API
+    mock_response = {
+        "result": "This is the completion from the API"
+    }
+
+    mock_post.return_value.json.return_value = mock_response
+
+    # Add documents to index
+    documents = [
+        Document(text="First document", metadata={"type": "text"}),
+        Document(text="Second document", metadata={"type": "text"})
+    ]
+    vector_store_manager.index_documents("test_index", documents)
+
+    params = {"temperature": 0.7}
+    # Mock query and results
+    query_result = vector_store_manager.query("test_index", "First", top_k=1, llm_params=params)
+
+    assert query_result is not None
+    assert query_result["response"] == "{'result': 'This is the completion from the API'}"
+    assert query_result["source_nodes"][0]["text"] == "First document"
+    assert query_result["source_nodes"][0]["score"] == pytest.approx(0.7102378952219661, rel=1e-6)
+
+    mock_post.assert_called_once_with(
+        INFERENCE_URL,
+        # Auto-Generated by LlamaIndex
+        json={"prompt": "Context information is below.\n---------------------\ntype: text\n\nFirst document\n---------------------\nGiven the context information and not prior knowledge, answer the query.\nQuery: First\nAnswer: ", "formatted": True, 'temperature': 0.7},
+        headers={"Authorization": f"Bearer {INFERENCE_ACCESS_SECRET}"}
+    )
+
+def test_add_document(vector_store_manager):
+    documents = [Document(text="Third document", metadata={"type": "text"})]
+    vector_store_manager.index_documents("test_index", documents)
+
+    # Add a document to the existing index
+    new_document = [Document(text="Fourth document", metadata={"type": "text"})]
+    vector_store_manager.index_documents("test_index", new_document)
+
+    # Assert that the document exists
+    assert vector_store_manager.document_exists("test_index", new_document[0],
+                                                BaseVectorStore.generate_doc_id("Fourth document"))
+
+def test_persist_index_1(vector_store_manager):
+    """Test that the index store is persisted."""
+    # Add a document and persist the index
+    documents = [Document(text="Test document", metadata={"type": "text"})]
+    vector_store_manager.index_documents("test_index", documents)
+    vector_store_manager._persist("test_index")
+    assert os.path.exists(PERSIST_DIR)
+
+def test_persist_index_2(vector_store_manager):
+    """Test that an index store is persisted."""
+    # Add a document and persist the index
+    documents = [Document(text="Test document", metadata={"type": "text"})]
+    vector_store_manager.index_documents("test_index", documents)
+
+    documents = [Document(text="Another Test document", metadata={"type": "text"})]
+    vector_store_manager.index_documents("another_test_index", documents)
+
+    vector_store_manager._persist_all()
+    assert os.path.exists(PERSIST_DIR)
diff --git a/ragengine/tests/vector_store/test_faiss_store.py b/ragengine/tests/vector_store/test_faiss_store.py
index 0fc17a912..8575b08a0 100644
--- a/ragengine/tests/vector_store/test_faiss_store.py
+++ b/ragengine/tests/vector_store/test_faiss_store.py
@@ -99,7 +99,7 @@ def test_add_document(vector_store_manager):
     vector_store_manager.index_documents("test_index", new_document)
 
     # Assert that the document exists
-    assert vector_store_manager.document_exists("test_index",
+    assert vector_store_manager.document_exists("test_index", new_document[0],
                                                 BaseVectorStore.generate_doc_id("Fourth document"))
 
 def test_persist_index_1(vector_store_manager):
diff --git a/ragengine/vector_store/azuremongodb_store.py b/ragengine/vector_store/azuremongodb_store.py
index 3754ef2d3..1452bd317 100644
--- a/ragengine/vector_store/azuremongodb_store.py
+++ b/ragengine/vector_store/azuremongodb_store.py
@@ -1,24 +1,90 @@
-from typing import List
+import logging
+from typing import List, Dict
 import os
 from ragengine.models import Document
 
 import pymongo
+import json
 from llama_index.vector_stores.azurecosmosmongo import (
     AzureCosmosDBMongoDBVectorSearch,
 )
 
 from .base import BaseVectorStore
 
+# Configure logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
 class AzureCosmosDBMongoDBVectorStoreHandler(BaseVectorStore):
     def __init__(self, embedding_manager):
         super().__init__(embedding_manager)
-        self.connection_string = os.environ.get("AZURE_COSMOSDB_MONGODB_URI")
-        self.mongodb_client = pymongo.MongoClient(self.connection_string)
+        self.db_name = os.getenv("AZURE_COSMOSDB_MONGODB_DB_NAME", "vector-store")
+        self.collection_name = os.getenv("AZURE_COSMOSDB_MONGODB_COLLECTION_NAME", "vector-store")
+        self.connection_string = os.getenv("AZURE_COSMOSDB_MONGODB_URI", "mongodb+srv://myDatabaseUser:D1fficultP%40ssw0rd@cluster0.example.mongodb.net/?retryWrites=true&w=majority")
+        self.dimension = self.embedding_manager.get_embedding_dimension()
+        try:
+            self.mongodb_client = pymongo.MongoClient(self.connection_string)
+            self.mongodb_client.admin.command('ping')  # Test the connection
+        except pymongo.errors.ConnectionError as e:
+            raise Exception(f"Failed to connect to MongoDB: {e}")
+        # Ensure collection exists
+        try:
+            self.collection = self.mongodb_client[self.db_name][self.collection_name]
+        except Exception as e:
+            raise ValueError(f"Failed to access collection '{self.collection_name}' in database '{self.db_name}': {e}")
 
     def _create_new_index(self, index_name: str, documents: List[Document]) -> List[str]:
         vector_store = AzureCosmosDBMongoDBVectorSearch(
             mongodb_client=self.mongodb_client,
-            db_name="kaito_ragengine",
-            collection_name=index_name,
+            db_name=self.db_name,
+            collection_name=self.collection_name,
+            index_name=index_name,
+            embedding_key=f"{index_name}_embedding",  # Unique field for each index
+            cosmos_search_kwargs={
+                # TODO: "kind": "vector-hnsw", # or "vector-ivf", "vector-diskann" (search type)
+                "dimensions": self.dimension,
+            }
         )
         return self._create_index_common(index_name, documents, vector_store)
+
+    def list_all_indexed_documents(self) -> Dict[str, Dict[str, Dict[str, str]]]:
+        indexed_docs = {}  # Accumulate documents across all indexes
+        for index_name in self.index_map.keys():
+            embedding_key = f"{index_name}_embedding"
+            documents = self.collection.find({embedding_key: {"$exists": True}})
+            for doc in documents:
+                doc_id = doc.get("id")
+                if doc_id is None:
+                    continue  # Skip if no document ID is found
+                indexed_docs.setdefault(index_name, {})[doc_id] = {
+                    "text": doc.get("text", ""),
+                    "metadata": json.dumps(doc.get("metadata", {})),
+                    "content_vector": f"Vector of dimension {len(doc.get(embedding_key, []))}"
+                }
+        return indexed_docs
+
+    def document_exists(self, index_name: str, doc: Document, doc_id: str) -> bool:
"""AzureCosmosDBMongoDB for checking document existence.""" + if index_name not in self.index_map: + logger.warning(f"No such index: '{index_name}' exists in vector store.") + return False + return doc.text in [elm["text"] for elm in list(self.mongodb_client[self.db_name][self.collection_name].find({f"{index_name}_embedding": {"$exists": True}}))] + + def _clear_collection_and_indexes(self): + """Clears all documents and drops all indexes in the collection. + + This method is primarily intended for testing purposes to ensure + a clean state between tests, preventing index and document conflicts. + """ + try: + # Delete all documents in the collection + self.collection.delete_many({}) + print(f"All documents in collection '{self.collection_name}' have been deleted.") + + # Drop all indexes in the collection + self.collection.drop_indexes() + print(f"All indexes in collection '{self.collection_name}' have been dropped.") + + except Exception as e: + print(f"Failed to clear collection and indexes in '{self.collection_name}': {e}") diff --git a/ragengine/vector_store/base.py b/ragengine/vector_store/base.py index 5b2f89044..56871cbb4 100644 --- a/ragengine/vector_store/base.py +++ b/ragengine/vector_store/base.py @@ -44,7 +44,7 @@ def _append_documents_to_index(self, index_name: str, documents: List[Document]) for doc in documents: doc_id = self.generate_doc_id(doc.text) - if not self.document_exists(index_name, doc_id): + if not self.document_exists(index_name, doc, doc_id): self.add_document_to_index(index_name, doc, doc_id) indexed_doc_ids.add(doc_id) else: @@ -127,7 +127,7 @@ def list_all_indexed_documents(self) -> Dict[str, Dict[str, Dict[str, str]]]: for index_name, vector_store_index in self.index_map.items() } - def document_exists(self, index_name: str, doc_id: str) -> bool: + def document_exists(self, index_name: str, doc: Document, doc_id: str) -> bool: """Common logic for checking document existence.""" if index_name not in self.index_map: logger.warning(f"No such index: '{index_name}' exists in vector store.") From 20f06e0d2597a10d56d101568e8f1bf1f6f14e77 Mon Sep 17 00:00:00 2001 From: ishaansehgal99 Date: Fri, 1 Nov 2024 21:00:02 -0700 Subject: [PATCH 06/10] nit --- ragengine/vector_store/azuremongodb_store.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ragengine/vector_store/azuremongodb_store.py b/ragengine/vector_store/azuremongodb_store.py index 1452bd317..f2ff2c027 100644 --- a/ragengine/vector_store/azuremongodb_store.py +++ b/ragengine/vector_store/azuremongodb_store.py @@ -69,7 +69,7 @@ def document_exists(self, index_name: str, doc: Document, doc_id: str) -> bool: if index_name not in self.index_map: logger.warning(f"No such index: '{index_name}' exists in vector store.") return False - return doc.text in [elm["text"] for elm in list(self.mongodb_client[self.db_name][self.collection_name].find({f"{index_name}_embedding": {"$exists": True}}))] + return doc.text in [doc["text"] for doc in list(self.mongodb_client[self.db_name][self.collection_name].find({f"{index_name}_embedding": {"$exists": True}}))] def _clear_collection_and_indexes(self): """Clears all documents and drops all indexes in the collection. 
From 856be86f1fe60eb786df82e4e23f5f891963351c Mon Sep 17 00:00:00 2001
From: ishaansehgal99
Date: Fri, 1 Nov 2024 21:05:53 -0700
Subject: [PATCH 07/10] nit

---
 ragengine/vector_store/azuremongodb_store.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ragengine/vector_store/azuremongodb_store.py b/ragengine/vector_store/azuremongodb_store.py
index f2ff2c027..13389bbdc 100644
--- a/ragengine/vector_store/azuremongodb_store.py
+++ b/ragengine/vector_store/azuremongodb_store.py
@@ -69,7 +69,7 @@ def document_exists(self, index_name: str, doc: Document, doc_id: str) -> bool:
         if index_name not in self.index_map:
             logger.warning(f"No such index: '{index_name}' exists in vector store.")
             return False
-        return doc.text in [doc["text"] for doc in list(self.mongodb_client[self.db_name][self.collection_name].find({f"{index_name}_embedding": {"$exists": True}}))]
+        return doc.text in [doc["text"] for doc in list(self.collection.find({f"{index_name}_embedding": {"$exists": True}}))]
 
     def _clear_collection_and_indexes(self):
         """Clears all documents and drops all indexes in the collection.

From 81a95eb2cc9f0bec26376cb05cb2b0d9fa8420f2 Mon Sep 17 00:00:00 2001
From: ishaansehgal99
Date: Mon, 4 Nov 2024 10:50:38 -0800
Subject: [PATCH 08/10] fix: remove this file for future PR

---
 ragengine/vector_store/chromadb_store.py | 16 ----------------
 1 file changed, 16 deletions(-)
 delete mode 100644 ragengine/vector_store/chromadb_store.py

diff --git a/ragengine/vector_store/chromadb_store.py b/ragengine/vector_store/chromadb_store.py
deleted file mode 100644
index 745054c2e..000000000
--- a/ragengine/vector_store/chromadb_store.py
+++ /dev/null
@@ -1,16 +0,0 @@
-from typing import List
-from ragengine.models import Document
-
-import chromadb
-from llama_index.vector_stores.chroma import ChromaVectorStore
-from .base import BaseVectorStore
-
-class ChromaDBVectorStoreHandler(BaseVectorStore):
-    def __init__(self, embedding_manager):
-        super().__init__(embedding_manager)
-        self.chroma_client = chromadb.EphemeralClient()
-
-    def _create_new_index(self, index_name: str, documents: List[Document]) -> List[str]:
-        chroma_collection = self.chroma_client.create_collection(index_name)
-        vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
-        return self._create_index_common(index_name, documents, vector_store)

From b44a7d2ada4d68d13102edf1500e53bd2e135d73 Mon Sep 17 00:00:00 2001
From: ishaansehgal99
Date: Mon, 4 Nov 2024 11:00:25 -0800
Subject: [PATCH 09/10] fix: add license headers

---
 presets/tuning/text-generation/metrics/metrics_server.py      | 4 +++-
 presets/tuning/text-generation/metrics/test_metrics_server.py | 4 +++-
 ragengine/config.py                                           | 3 +++
 ragengine/embedding/base.py                                   | 3 +++
 ragengine/embedding/huggingface_local.py                      | 3 +++
 ragengine/embedding/huggingface_remote.py                     | 3 +++
 ragengine/inference/inference.py                              | 3 +++
 ragengine/main.py                                             | 3 +++
 ragengine/models.py                                           | 3 +++
 ragengine/tests/api/conftest.py                               | 3 +++
 ragengine/tests/api/test_main.py                              | 3 +++
 ragengine/tests/vector_store/conftest.py                      | 3 +++
 ragengine/tests/vector_store/test_azuremongodb_store.py       | 3 +++
 ragengine/tests/vector_store/test_faiss_store.py              | 3 +++
 ragengine/vector_store/azuremongodb_store.py                  | 3 +++
 ragengine/vector_store/base.py                                | 3 +++
 ragengine/vector_store/faiss_store.py                         | 3 +++
 ragengine/vector_store_manager/manager.py                     | 3 +++
 18 files changed, 54 insertions(+), 2 deletions(-)

diff --git a/presets/tuning/text-generation/metrics/metrics_server.py b/presets/tuning/text-generation/metrics/metrics_server.py
index 402701d85..53dd3b283 100644
--- a/presets/tuning/text-generation/metrics/metrics_server.py
+++ b/presets/tuning/text-generation/metrics/metrics_server.py
@@ -1,4 +1,6 @@
-# metrics_server.py
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
 import logging
 import os
 from typing import List, Optional
diff --git a/presets/tuning/text-generation/metrics/test_metrics_server.py b/presets/tuning/text-generation/metrics/test_metrics_server.py
index d51f2b5c4..500edca3a 100644
--- a/presets/tuning/text-generation/metrics/test_metrics_server.py
+++ b/presets/tuning/text-generation/metrics/test_metrics_server.py
@@ -1,4 +1,6 @@
-# test_metrics_server.py
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
 import os
 from unittest.mock import MagicMock, patch
 
diff --git a/ragengine/config.py b/ragengine/config.py
index bda4b46d3..0bcd87aa2 100644
--- a/ragengine/config.py
+++ b/ragengine/config.py
@@ -1,3 +1,6 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
 # config.py
 
 # Variables are set via environment variables from the RAGEngine CR
diff --git a/ragengine/embedding/base.py b/ragengine/embedding/base.py
index a1c371937..ee4d13d8c 100644
--- a/ragengine/embedding/base.py
+++ b/ragengine/embedding/base.py
@@ -1,3 +1,6 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
 from abc import ABC, abstractmethod
 
 
diff --git a/ragengine/embedding/huggingface_local.py b/ragengine/embedding/huggingface_local.py
index 3dab51e9a..1ae7a52ac 100644
--- a/ragengine/embedding/huggingface_local.py
+++ b/ragengine/embedding/huggingface_local.py
@@ -1,3 +1,6 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
 from llama_index.embeddings.huggingface import HuggingFaceEmbedding
 
 from .base import BaseEmbeddingModel
diff --git a/ragengine/embedding/huggingface_remote.py b/ragengine/embedding/huggingface_remote.py
index 0f8e79181..dd4b81006 100644
--- a/ragengine/embedding/huggingface_remote.py
+++ b/ragengine/embedding/huggingface_remote.py
@@ -1,3 +1,6 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
 from llama_index.embeddings.huggingface_api import \
     HuggingFaceInferenceAPIEmbedding
 
diff --git a/ragengine/inference/inference.py b/ragengine/inference/inference.py
index 801ebc1b2..6d4293ae7 100644
--- a/ragengine/inference/inference.py
+++ b/ragengine/inference/inference.py
@@ -1,3 +1,6 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
 from typing import Any
 from llama_index.core.llms import CustomLLM, CompletionResponse, LLMMetadata, CompletionResponseGen
 from llama_index.llms.openai import OpenAI
diff --git a/ragengine/main.py b/ragengine/main.py
index 53bdb1997..f289416d8 100644
--- a/ragengine/main.py
+++ b/ragengine/main.py
@@ -1,3 +1,6 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
 from typing import List
 from vector_store_manager.manager import VectorStoreManager
 from embedding.huggingface_local import LocalHuggingFaceEmbedding
diff --git a/ragengine/models.py b/ragengine/models.py
index 982c1b1b9..92e10bd2a 100644
--- a/ragengine/models.py
+++ b/ragengine/models.py
@@ -1,3 +1,6 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
 from typing import Dict, List, Optional
 
 from pydantic import BaseModel
diff --git a/ragengine/tests/api/conftest.py b/ragengine/tests/api/conftest.py
index 08ad12a74..1576962ec 100644
--- a/ragengine/tests/api/conftest.py
+++ b/ragengine/tests/api/conftest.py
@@ -1,3 +1,6 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
 import sys
 import os
 sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..')))
diff --git a/ragengine/tests/api/test_main.py b/ragengine/tests/api/test_main.py
index c0b6ef13b..311eecb60 100644
--- a/ragengine/tests/api/test_main.py
+++ b/ragengine/tests/api/test_main.py
@@ -1,3 +1,6 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
 from unittest.mock import patch
 
 from llama_index.core.storage.index_store import SimpleIndexStore
diff --git a/ragengine/tests/vector_store/conftest.py b/ragengine/tests/vector_store/conftest.py
index 08ad12a74..1576962ec 100644
--- a/ragengine/tests/vector_store/conftest.py
+++ b/ragengine/tests/vector_store/conftest.py
@@ -1,3 +1,6 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
 import sys
 import os
 sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..')))
diff --git a/ragengine/tests/vector_store/test_azuremongodb_store.py b/ragengine/tests/vector_store/test_azuremongodb_store.py
index 85ac6eace..9dba1c46e 100644
--- a/ragengine/tests/vector_store/test_azuremongodb_store.py
+++ b/ragengine/tests/vector_store/test_azuremongodb_store.py
@@ -1,3 +1,6 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
 import os
 from tempfile import TemporaryDirectory
 from unittest.mock import patch
diff --git a/ragengine/tests/vector_store/test_faiss_store.py b/ragengine/tests/vector_store/test_faiss_store.py
index 8575b08a0..ce58dc811 100644
--- a/ragengine/tests/vector_store/test_faiss_store.py
+++ b/ragengine/tests/vector_store/test_faiss_store.py
@@ -1,3 +1,6 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
 import os
 from tempfile import TemporaryDirectory
 from unittest.mock import patch
diff --git a/ragengine/vector_store/azuremongodb_store.py b/ragengine/vector_store/azuremongodb_store.py
index 13389bbdc..0eca90dac 100644
--- a/ragengine/vector_store/azuremongodb_store.py
+++ b/ragengine/vector_store/azuremongodb_store.py
@@ -1,3 +1,6 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
 import logging
 from typing import List, Dict
 import os
diff --git a/ragengine/vector_store/base.py b/ragengine/vector_store/base.py
index 56871cbb4..fe9de2c24 100644
--- a/ragengine/vector_store/base.py
+++ b/ragengine/vector_store/base.py
@@ -1,3 +1,6 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
 import logging
 from abc import ABC, abstractmethod
 from typing import Dict, List
diff --git a/ragengine/vector_store/faiss_store.py b/ragengine/vector_store/faiss_store.py
index 9c4485c44..054f0110f 100644
--- a/ragengine/vector_store/faiss_store.py
+++ b/ragengine/vector_store/faiss_store.py
@@ -1,3 +1,6 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
 from typing import List
 from ragengine.models import Document
 
diff --git a/ragengine/vector_store_manager/manager.py b/ragengine/vector_store_manager/manager.py
index d8871b93a..3aa2722c9 100644
--- a/ragengine/vector_store_manager/manager.py
+++ b/ragengine/vector_store_manager/manager.py
@@ -1,3 +1,6 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
 from typing import Dict, List
 
 from ragengine.models import Document

From bfbe639b782e4769ec10f3613312c1ce010e971a Mon Sep 17 00:00:00 2001
From: ishaansehgal99
Date: Mon, 4 Nov 2024 15:37:31 -0800
Subject: [PATCH 10/10] fix: exception

---
 ragengine/vector_store/azuremongodb_store.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ragengine/vector_store/azuremongodb_store.py b/ragengine/vector_store/azuremongodb_store.py
index 0eca90dac..c7559e327 100644
--- a/ragengine/vector_store/azuremongodb_store.py
+++ b/ragengine/vector_store/azuremongodb_store.py
@@ -28,7 +28,7 @@ def __init__(self, embedding_manager):
         try:
             self.mongodb_client = pymongo.MongoClient(self.connection_string)
             self.mongodb_client.admin.command('ping')  # Test the connection
-        except pymongo.errors.ConnectionError as e:
+        except Exception as e:
             raise Exception(f"Failed to connect to MongoDB: {e}")
         # Ensure collection exists
         try: