Google Vertex AI 矢量搜索
此笔记本展示了如何使用与Google Cloud Vertex AI Vector Searchvector 数据库。
Google Vertex AI Vector Search,前身为 Vertex AI Matching Engine,提供业界领先的大规模低延迟矢量数据库。这些向量数据库通常称为向量相似性匹配或近似最近邻 (ANN) 服务。
注意:Langchain API 需要已创建的终端节点和已部署的索引。索引创建时间最长可能需要 1 小时。
要了解如何创建索引,请参阅 创建索引并将其部署到端点
如果您已部署索引 ,请跳至从文本创建 VectorStore
创建索引并将其部署到 Endpoint
- 本节介绍如何创建新索引并将其部署到终端节点
# TODO : Set values as per your requirements
# Project and Storage Constants
PROJECT_ID = "<my_project_id>"
REGION = "<my_region>"
BUCKET = "<my_gcs_bucket>"
BUCKET_URI = f"gs://{BUCKET}"
# The number of dimensions for the textembedding-gecko@003 is 768
# If other embedder is used, the dimensions would probably need to change.
DIMENSIONS = 768
# Index Constants
DISPLAY_NAME = "<my_matching_engine_index_id>"
DEPLOYED_INDEX_ID = "<my_matching_engine_endpoint_id>"
# Create a bucket.
! gsutil mb -l $REGION -p $PROJECT_ID $BUCKET_URI
使用 VertexAIEmbeddings 作为嵌入模型
from google.cloud import aiplatform
from langchain_google_vertexai import VertexAIEmbeddings
aiplatform.init(project=PROJECT_ID, location=REGION, staging_bucket=BUCKET_URI)
embedding_model = VertexAIEmbeddings(model_name="text-embedding-005")
创建空索引
注意:创建索引时,您应该从 “BATCH_UPDATE” 或 “STREAM_UPDATE” 中指定 “index_update_method”
批量索引适用于您希望批量更新索引的情况,其中包含已在一定时间内存储的数据,例如每周或每月处理一次的系统。流式索引是指您希望在将新数据添加到数据存储时更新索引数据,例如,如果您有一家书店,并且希望尽快在线显示新清单。选择哪种类型很重要,因为设置和要求不同。
有关配置索引的更多详细信息,请参阅官方文档
# NOTE : This operation can take upto 30 seconds
my_index = aiplatform.MatchingEngineIndex.create_tree_ah_index(
display_name=DISPLAY_NAME,
dimensions=DIMENSIONS,
approximate_neighbors_count=150,
distance_measure_type="DOT_PRODUCT_DISTANCE",
index_update_method="STREAM_UPDATE", # allowed values BATCH_UPDATE , STREAM_UPDATE
)
创建终端节点
# Create an endpoint
my_index_endpoint = aiplatform.MatchingEngineIndexEndpoint.create(
display_name=f"{DISPLAY_NAME}-endpoint", public_endpoint_enabled=True
)
将索引部署到终端节点
# NOTE : This operation can take upto 20 minutes
my_index_endpoint = my_index_endpoint.deploy_index(
index=my_index, deployed_index_id=DEPLOYED_INDEX_ID
)
my_index_endpoint.deployed_indexes
从文本创建 Vector Store
注意 : 如果您有现有的 Index 和 Endpoints,则可以使用以下代码加载它们
# TODO : replace 1234567890123456789 with your acutial index ID
my_index = aiplatform.MatchingEngineIndex("1234567890123456789")
# TODO : replace 1234567890123456789 with your acutial endpoint ID
my_index_endpoint = aiplatform.MatchingEngineIndexEndpoint("1234567890123456789")
from langchain_google_vertexai import (
VectorSearchVectorStore,
VectorSearchVectorStoreDatastore,
)
创建简单的 vectorstore ( 没有过滤器)
# Input texts
texts = [
"The cat sat on",
"the mat.",
"I like to",
"eat pizza for",
"dinner.",
"The sun sets",
"in the west.",
]
# Create a Vector Store
vector_store = VectorSearchVectorStore.from_components(
project_id=PROJECT_ID,
region=REGION,
gcs_bucket_name=BUCKET,
index_id=my_index.name,
endpoint_id=my_index_endpoint.name,
embedding=embedding_model,
stream_update=True,
)
# Add vectors and mapped text chunks to your vectore store
vector_store.add_texts(texts=texts)
可选 : 您还可以创建 vectore 并将数据块存储在 Datastore 中
# NOTE : This operation can take upto 20 mins
vector_store = VectorSearchVectorStoreDatastore.from_components(
project_id=PROJECT_ID,
region=REGION,
index_id=my_index.name,
endpoint_id=my_index_endpoint.name,
embedding=embedding_model,
stream_update=True,
)
vector_store.add_texts(texts=texts, is_complete_overwrite=True)
# Try running a simialarity search
vector_store.similarity_search("pizza")
使用元数据过滤器创建 vectorstore
# Input text with metadata
record_data = [
{
"description": "A versatile pair of dark-wash denim jeans."
"Made from durable cotton with a classic straight-leg cut, these jeans"
" transition easily from casual days to dressier occasions.",
"price": 65.00,
"color": "blue",
"season": ["fall", "winter", "spring"],
},
{
"description": "A lightweight linen button-down shirt in a crisp white."
" Perfect for keeping cool with breathable fabric and a relaxed fit.",
"price": 34.99,
"color": "white",
"season": ["summer", "spring"],
},
{
"description": "A soft, chunky knit sweater in a vibrant forest green. "
"The oversized fit and cozy wool blend make this ideal for staying warm "
"when the temperature drops.",
"price": 89.99,
"color": "green",
"season": ["fall", "winter"],
},
{
"description": "A classic crewneck t-shirt in a soft, heathered blue. "
"Made from comfortable cotton jersey, this t-shirt is a wardrobe essential "
"that works for every season.",
"price": 19.99,
"color": "blue",
"season": ["fall", "winter", "summer", "spring"],
},
{
"description": "A flowing midi-skirt in a delicate floral print. "
"Lightweight and airy, this skirt adds a touch of feminine style "
"to warmer days.",
"price": 45.00,
"color": "white",
"season": ["spring", "summer"],
},
]
# Parse and prepare input data
texts = []
metadatas = []
for record in record_data:
record = record.copy()
page_content = record.pop("description")
texts.append(page_content)
if isinstance(page_content, str):
metadata = {**record}
metadatas.append(metadata)
# Inspect metadatas
metadatas
# NOTE : This operation can take more than 20 mins
vector_store = VectorSearchVectorStore.from_components(
project_id=PROJECT_ID,
region=REGION,
gcs_bucket_name=BUCKET,
index_id=my_index.name,
endpoint_id=my_index_endpoint.name,
embedding=embedding_model,
)
vector_store.add_texts(texts=texts, metadatas=metadatas, is_complete_overwrite=True)
from google.cloud.aiplatform.matching_engine.matching_engine_index_endpoint import (
Namespace,
NumericNamespace,
)
# Try running a simple similarity search
# Below code should return 5 results
vector_store.similarity_search("shirt", k=5)
# Try running a similarity search with text filter
filters = [Namespace(name="season", allow_tokens=["spring"])]
# Below code should return 4 results now
vector_store.similarity_search("shirt", k=5, filter=filters)
# Try running a similarity search with combination of text and numeric filter
filters = [Namespace(name="season", allow_tokens=["spring"])]
numeric_filters = [NumericNamespace(name="price", value_float=40.0, op="LESS")]
# Below code should return 2 results now
vector_store.similarity_search(
"shirt", k=5, filter=filters, numeric_filter=numeric_filters
)
使用 Vector Store 作为检索器
# Initialize the vectore_store as retriever
retriever = vector_store.as_retriever()
# perform simple similarity search on retriever
retriever.invoke("What are my options in breathable fabric?")
# Try running a similarity search with text filter
filters = [Namespace(name="season", allow_tokens=["spring"])]
retriever.search_kwargs = {"filter": filters}
# perform similarity search with filters on retriever
retriever.invoke("What are my options in breathable fabric?")
# Try running a similarity search with combination of text and numeric filter
filters = [Namespace(name="season", allow_tokens=["spring"])]
numeric_filters = [NumericNamespace(name="price", value_float=40.0, op="LESS")]
retriever.search_kwargs = {"filter": filters, "numeric_filter": numeric_filters}
retriever.invoke("What are my options in breathable fabric?")
在问答链中将筛选器与 retriever 一起使用
from langchain_google_vertexai import VertexAI
llm = VertexAI(model_name="gemini-pro")
from langchain.chains import RetrievalQA
filters = [Namespace(name="season", allow_tokens=["spring"])]
numeric_filters = [NumericNamespace(name="price", value_float=40.0, op="LESS")]
retriever.search_kwargs = {"k": 2, "filter": filters, "numeric_filter": numeric_filters}
retrieval_qa = RetrievalQA.from_chain_type(
llm=llm,
chain_type="stuff",
retriever=retriever,
return_source_documents=True,
)
question = "What are my options in breathable fabric?"
response = retrieval_qa({"query": question})
print(f"{response['result']}")
print("REFERENCES")
print(f"{response['source_documents']}")
读取 、 分块 、 矢量化 和 索引 PDF
!pip install pypdf
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
loader = PyPDFLoader("https://arxiv.org/pdf/1706.03762.pdf")
pages = loader.load()
text_splitter = RecursiveCharacterTextSplitter(
# Set a really small chunk size, just to show.
chunk_size=1000,
chunk_overlap=20,
length_function=len,
is_separator_regex=False,
)
doc_splits = text_splitter.split_documents(pages)
texts = [doc.page_content for doc in doc_splits]
metadatas = [doc.metadata for doc in doc_splits]
texts[0]
# Inspect Metadata of 1st page
metadatas[0]
vector_store = VectorSearchVectorStore.from_components(
project_id=PROJECT_ID,
region=REGION,
gcs_bucket_name=BUCKET,
index_id=my_index.name,
endpoint_id=my_index_endpoint.name,
embedding=embedding_model,
)
vector_store.add_texts(texts=texts, metadatas=metadatas, is_complete_overwrite=True)
vector_store = VectorSearchVectorStore.from_components(
project_id=PROJECT_ID,
region=REGION,
gcs_bucket_name=BUCKET,
index_id=my_index.name,
endpoint_id=my_index_endpoint.name,
embedding=embedding_model,
)
混合搜索
矢量搜索支持混合搜索,这是信息检索 (IR) 中的一种常用架构模式,它结合了语义搜索和关键字搜索(也称为基于标记的搜索)。通过混合搜索,开发人员可以利用这两种方法中的最佳方法,从而有效地提供更高的搜索质量。 单击此处了解更多信息。
为了使用混合搜索,我们需要安装一个稀疏嵌入向量器,并在 Vector Search 集成之外处理嵌入。 稀疏嵌入向量器的一个例子是 sklearn TfidfVectorizer,但也可以使用其他技术,例如 BM25。
# Define some sample data
texts = [
"The cat sat on",
"the mat.",
"I like to",
"eat pizza for",
"dinner.",
"The sun sets",
"in the west.",
]
# optional IDs
ids = ["i_" + str(i + 1) for i in range(len(texts))]
# optional metadata
metadatas = [{"my_metadata": i} for i in range(len(texts))]
from sklearn.feature_extraction.text import TfidfVectorizer
# Fit the TFIDF Vectorizer (This is usually done on a very large corpus of data to make sure that word statistics generalize well on new data)
vectorizer = TfidfVectorizer()
vectorizer.fit(texts)
# Utility function to transform text into a TF-IDF Sparse Vector
def get_sparse_embedding(tfidf_vectorizer, text):
tfidf_vector = tfidf_vectorizer.transform([text])
values = []
dims = []
for i, tfidf_value in enumerate(tfidf_vector.data):
values.append(float(tfidf_value))
dims.append(int(tfidf_vector.indices[i]))
return {"values": values, "dimensions": dims}
# semantic (dense) embeddings
embeddings = embedding_model.embed_documents(texts)
# tfidf (sparse) embeddings
sparse_embeddings = [get_sparse_embedding(vectorizer, x) for x in texts]
sparse_embeddings[0]
# Add the dense and sparse embeddings in Vector Search
vector_store.add_texts_with_embeddings(
texts=texts,
embeddings=embeddings,
sparse_embeddings=sparse_embeddings,
ids=ids,
metadatas=metadatas,
)
# Run hybrid search
query = "the cat"
embedding = embedding_model.embed_query(query)
sparse_embedding = get_sparse_embedding(vectorizer, query)
vector_store.similarity_search_by_vector_with_score(
embedding=embedding,
sparse_embedding=sparse_embedding,
k=5,
rrf_ranking_alpha=0.7, # 0.7 weight to dense and 0.3 weight to sparse
)