# Source code for sparknlp.annotator.vector_db.vector_db_connector

#  Copyright 2017-2024 John Snow Labs
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
"""Contains classes for VectorDBConnector."""
from sparknlp.common import *


class VectorDBConnector(AnnotatorModel):
    """Connector for storing and retrieving embeddings from vector databases.

    This annotator takes embeddings from previous annotators (like
    BertEmbeddings, SentenceEmbeddings, E5VEmbeddings, etc.) and stores them
    in a vector database for similarity search and retrieval.

    Currently supports Pinecone with more providers planned.

    Two modality modes are supported via ``setModalityMode``:

    * **text** (default) – expects ``DOCUMENT, SENTENCE_EMBEDDINGS`` input
      columns. Upserted metadata is augmented with ``modality=text``.
    * **image** – expects ``IMAGE, SENTENCE_EMBEDDINGS`` input columns (e.g.
      from ``ImageAssembler`` + ``E5VEmbeddings``). Upserted metadata is
      augmented with ``modality=image``, ``image_origin``, ``image_width``,
      ``image_height``, and ``image_nChannels``. Vector IDs are deterministic
      UUID-v3 values derived from the image file-path (origin), ensuring
      stable re-indexing.

    ========================================= ======================
    Input Annotation types                    Output Annotation type
    ========================================= ======================
    DOCUMENT, SENTENCE_EMBEDDINGS (text mode) DOCUMENT
    IMAGE, SENTENCE_EMBEDDINGS (image mode)   DOCUMENT
    ========================================= ======================

    Parameters
    ----------
    provider
        Vector database provider. Currently supported: 'pinecone'
    indexName
        Name of the index/collection in the vector database
    namespace
        Namespace/partition within the index (optional)
    idColumn
        Column name to use as vector ID (if not set, generates UUID; for
        image mode a stable UUID-v3 derived from the image origin path is
        used)
    metadataColumns
        Column names to include as metadata with vectors
    batchSize
        Number of vectors to upsert in a single batch
    modalityMode
        Modality mode: 'text' (default) or 'image'

    Examples
    --------
    **Text mode example:**

    >>> import sparknlp
    >>> from sparknlp.base import *
    >>> from sparknlp.annotator import *
    >>> from pyspark.ml import Pipeline
    >>> documentAssembler = DocumentAssembler() \\
    ...     .setInputCol("text") \\
    ...     .setOutputCol("document")
    >>> embeddings = BertSentenceEmbeddings.pretrained() \\
    ...     .setInputCols(["document"]) \\
    ...     .setOutputCol("sentence_embeddings")
    >>> vectorDB = VectorDBConnector() \\
    ...     .setInputCols(["document", "sentence_embeddings"]) \\
    ...     .setOutputCol("vectordb_result") \\
    ...     .setProvider("pinecone") \\
    ...     .setIndexName("my-index") \\
    ...     .setNamespace("production") \\
    ...     .setIdColumn("id") \\
    ...     .setMetadataColumns(["text", "category"]) \\
    ...     .setBatchSize(100)
    >>> pipeline = Pipeline().setStages([
    ...     documentAssembler,
    ...     embeddings,
    ...     vectorDB
    ... ])
    >>> data = spark.createDataFrame([
    ...     ("1", "Spark NLP is great", "tech"),
    ...     ("2", "Vector databases enable semantic search", "tech")
    ... ]).toDF("id", "text", "category")
    >>> result = pipeline.fit(data).transform(data)

    **Image mode example:**

    >>> imageAssembler = ImageAssembler() \\
    ...     .setInputCol("image") \\
    ...     .setOutputCol("image_assembler")
    >>> e5vEmbeddings = E5VEmbeddings.pretrained() \\
    ...     .setInputCols(["image_assembler"]) \\
    ...     .setOutputCol("image_embeddings")
    >>> vectorDB = VectorDBConnector() \\
    ...     .setInputCols(["image_assembler", "image_embeddings"]) \\
    ...     .setOutputCol("vectordb_result") \\
    ...     .setProvider("pinecone") \\
    ...     .setIndexName("my-multimodal-index") \\
    ...     .setModalityMode("image") \\
    ...     .setBatchSize(50)
    >>> pipeline = Pipeline().setStages([
    ...     imageAssembler,
    ...     e5vEmbeddings,
    ...     vectorDB
    ... ])
    """

    # Annotator name.
    name = "VectorDBConnector"

    # Declared input annotation types. Note that only the text-mode pair is
    # declared here, even though the class docstring describes an image mode
    # that takes IMAGE input instead of DOCUMENT.
    inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.SENTENCE_EMBEDDINGS]

    # Declared output annotation type (same for both modality modes).
    outputAnnotatorType = AnnotatorType.DOCUMENT

    # --- Param declarations (one per configurable setting) ---

    provider = Param(Params._dummy(),
                     "provider",
                     "Vector database provider. Currently supported: 'pinecone'",
                     typeConverter=TypeConverters.toString)

    indexName = Param(Params._dummy(),
                      "indexName",
                      "Name of the index/collection in the vector database",
                      typeConverter=TypeConverters.toString)

    namespace = Param(Params._dummy(),
                      "namespace",
                      "Namespace/partition within the index (optional)",
                      typeConverter=TypeConverters.toString)

    idColumn = Param(Params._dummy(),
                     "idColumn",
                     "Column name to use as vector ID (if not set, generates UUID)",
                     typeConverter=TypeConverters.toString)

    metadataColumns = Param(Params._dummy(),
                            "metadataColumns",
                            "Column names to include as metadata with vectors",
                            typeConverter=TypeConverters.toListString)

    batchSize = Param(Params._dummy(),
                      "batchSize",
                      "Number of vectors to upsert in a single batch",
                      typeConverter=TypeConverters.toInt)

    modalityMode = Param(Params._dummy(),
                         "modalityMode",
                         "Modality mode for indexing. Supported values: 'text' (default), 'image'.",
                         typeConverter=TypeConverters.toString)

    def setProvider(self, value):
        """Sets the vector database provider.

        Parameters
        ----------
        value : str
            Vector database provider. Currently supported: 'pinecone'
        """
        return self._set(provider=value)

    def setIndexName(self, value):
        """Sets the name of the index/collection in the vector database.

        Parameters
        ----------
        value : str
            Name of the index/collection
        """
        return self._set(indexName=value)

    def setNamespace(self, value):
        """Sets the namespace/partition within the index.

        Parameters
        ----------
        value : str
            Namespace/partition name (optional)
        """
        return self._set(namespace=value)

    def setIdColumn(self, value):
        """Sets the column name to use as vector ID.

        Parameters
        ----------
        value : str
            Column name for vector ID. If not set, UUIDs will be generated.
        """
        return self._set(idColumn=value)

    def setMetadataColumns(self, value):
        """Sets the column names to include as metadata with vectors.

        Parameters
        ----------
        value : list[str]
            List of column names to include as metadata
        """
        return self._set(metadataColumns=value)

    def setBatchSize(self, value):
        """Sets the number of vectors to upsert in a single batch.

        Parameters
        ----------
        value : int
            Batch size for upsert operations (max 1000). The limit is not
            validated by this setter; the value is stored as given.
        """
        return self._set(batchSize=value)

    def setModalityMode(self, value):
        """Sets the modality mode for indexing.

        Use 'text' (default) for DOCUMENT + SENTENCE_EMBEDDINGS pipelines and
        'image' for IMAGE + SENTENCE_EMBEDDINGS pipelines (e.g. ImageAssembler
        + E5VEmbeddings).

        In image mode vector IDs are stable UUID-v3 values derived from the
        image origin path, and upserted metadata automatically includes
        modality, image_origin, image_width, image_height, and
        image_nChannels fields.

        Parameters
        ----------
        value : str
            'text' or 'image'. The value is not validated by this setter.
        """
        return self._set(modalityMode=value)

    def getModalityMode(self):
        """Gets the current modality mode.

        Returns
        -------
        str
            'text' or 'image'
        """
        return self.getOrDefault(self.modalityMode)

    @keyword_only
    def __init__(self, classname="com.johnsnowlabs.ml.ai.VectorDBConnector", java_model=None):
        """Creates the connector and registers the default parameter values.

        Parameters
        ----------
        classname : str
            Class name forwarded to the parent constructor.
        java_model
            Optional model object forwarded to the parent constructor.
        """
        super(VectorDBConnector, self).__init__(
            classname=classname,
            java_model=java_model
        )
        # Defaults: Pinecone provider, batches of 100, empty namespace,
        # no extra metadata columns, text modality.
        self._setDefault(
            provider="pinecone",
            batchSize=100,
            namespace="",
            metadataColumns=[],
            modalityMode="text"
        )