
AI Milvus Vector Database

Milvus was created in 2019 with a singular goal: store, index, and manage massive embedding vectors generated by deep neural networks and other machine learning (ML) models.

setup
python3 -m venv env-vector-db
source env-vector-db/bin/activate

# `-I`  Ignore the installed packages, overwriting them.
# `-U`  Upgrade all specified packages to the newest available version.

pip3 install -U pymilvus==2.4.3 "pymilvus[model]" torch==2.3.1
# Or force a clean reinstall of the latest (unpinned) versions instead:
pip3 install --upgrade --force-reinstall pymilvus "pymilvus[model]" torch
pip3 show pymilvus torch
pip3 index versions pymilvus
pip3 index versions torch
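A quick sanity check that the installed packages import cleanly (a minimal helper of my own, not part of the Milvus quickstart):
# check_env.py (hypothetical filename) - confirm pymilvus and torch are importable and print their versions
import pymilvus
import torch

print("pymilvus:", pymilvus.__version__)
print("torch:", torch.__version__)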
hands-on local
# https://milvus.io/docs/quickstart.md
from pymilvus import MilvusClient

# SET UP VECTOR DATABASE
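# Passing a local file path runs Milvus Lite and stores the whole database in that single file;
# point the client at a server URI instead to use a full Milvus deployment (see the cloud example below).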
client = MilvusClient("milvus_demo.db")

# DROP COLLECTION
client.drop_collection(collection_name="demo_collection")

# CREATE A COLLECTION
client.create_collection(
  collection_name="demo_collection",
  dimension=768,  # The vectors we will use in this demo have 768 dimensions
)
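# With this quick setup the collection gets an auto-created primary key field "id" and a vector field
# "vector"; extra keys in the inserted rows (e.g. "text", "subject") are stored as dynamic fields.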

# REPRESENT TEXT WITH VECTORS
from pymilvus import model

# If the connection to https://huggingface.co/ fails, uncomment the following lines to use a mirror
# import os
# os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'

# This will download a small embedding model "paraphrase-albert-small-v2" (~50MB).
embedding_fn = model.DefaultEmbeddingFunction()
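# DefaultEmbeddingFunction is provided by the "pymilvus[model]" extra installed above and
# produces 768-dimensional float vectors.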
# Text strings to search from.
docs = [
  "Artificial intelligence was founded as an academic discipline in 1956.",
  "Alan Turing was the first person to conduct substantial research in AI.",
  "Born in Maida Vale, London, Turing was raised in southern England.",
]
vectors = embedding_fn.encode_documents(docs)
# The output vector has 768 dimensions, matching the collection that we just created.
print("Dim:", embedding_fn.dim, vectors[0].shape)  # Dim: 768 (768,)
# Each entity has id, vector representation, raw text, and a subject label that we use
# to demo metadata filtering later.
data = [
  {"id": i, "vector": vectors[i], "text": docs[i], "subject": "history"}
  for i in range(len(vectors))
]
print("Data has", len(data), "entities, each with fields: ", data[0].keys())
print("Vector dim:", len(data[0]["vector"]))

# INSERT DATA INTO THE COLLECTION
# Primary keys are taken from the "id" field of each row dict, so no separate ids argument is needed.
res = client.insert(collection_name="demo_collection", data=data)
print(f'''
  Inserted {res['insert_count']} entities into the collection.
  {res}
''')

# SEARCH FOR SIMILAR VECTORS
query_vectors = embedding_fn.encode_queries(["Who is Alan Turing?"])
# If you don't have the embedding function you can use a fake vector to finish the demo:
# query_vectors = [ [ random.uniform(-1, 1) for _ in range(768) ] ]
res = client.search(
  collection_name="demo_collection",  # target collection
  data=query_vectors,  # query vectors
  limit=2,  # number of returned entities
  output_fields=["text", "subject"],  # specifies fields to be returned
)
# print(res)
print(f'''
  Vector search result: {res}
''')
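# A sketch of how to walk the result structure: search() returns one hit list per query vector,
# and each hit carries the matched id, the distance, and the requested output fields under "entity".
for hits in res:
  for hit in hits:
    print(hit["id"], hit["distance"], hit["entity"]["text"])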

# VECTOR SEARCH WITH METADATA FILTERING
# Insert more docs in another subject.
docs = [
  "Machine learning has been used for drug design.",
  "Computational synthesis with AI algorithms predicts molecular properties.",
  "DDR1 is involved in cancers and fibrosis.",
]
vectors = embedding_fn.encode_documents(docs)
data = [
  {"id": 3 + i, "vector": vectors[i], "text": docs[i], "subject": "biology"}
  for i in range(len(vectors))
]
client.insert(collection_name="demo_collection", data=data)

# This will exclude any text in the "history" subject even if it is close to the query vector.
res = client.search(
  collection_name="demo_collection",
  data=embedding_fn.encode_queries(["tell me AI related information"]),
  filter="subject == 'biology'",
  limit=2,
  output_fields=["text", "subject"],
)
# print(res)
print(f'''
  Filtered vector search result: {res}
''')

# QUERY BY FILTER
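# Unlike search(), query() is a pure scalar lookup: it retrieves entities by a filter expression or by
# primary keys, without any vector similarity ranking.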
res = client.query(
  collection_name="demo_collection",
  filter="subject == 'history'",
  output_fields=["text", "subject"],
)
print(f'''
  Query by filter: {[str(r['id']) + ' ' + r['text'] for r in res]}
''')
# QUERY BY ID
res = client.query(
  collection_name="demo_collection",
  ids=[0, 2],
  output_fields=["vector", "text", "subject"],
)
print(f'''
  Query by id: {[str(r['id']) + ' ' + r['text'] for r in res]}
''')

# exit(0)
# DELETE ENTITIES
# Delete entities by primary key
res = client.delete(collection_name="demo_collection", ids=[0, 2])
print(f'''
  Deleted entities: {res}
''')
# Delete entities by a filter expression
res = client.delete(
  collection_name="demo_collection",
  filter="subject == 'biology'",
)
print(f'''
  Deleted entities: {res}
''')
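# Optional cleanup once you are done with the demo:
# client.drop_collection(collection_name="demo_collection")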
python3 milvus.py
hands-on cloud
# https://milvus.io/docs/quickstart.md
from pymilvus import MilvusClient

# SET UP VECTOR DATABASE
# Authentication enabled with a cluster user
client = MilvusClient(
  uri="https://inxx-xxxxxxxxxxxx.api.gcp-us-west1.zillizcloud.com:19530",
  token="user:password", # replace this with your token
)
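# The token can be a "user:password" pair for a cluster user or a Zilliz Cloud API key; the URI comes
# from the cluster's connection details.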

# DROP COLLECTION
client.drop_collection(collection_name="demo_collection")

# CREATE A COLLECTION
client.create_collection(
  collection_name="demo_collection",
  dimension=768,  # The vectors we will use in this demo have 768 dimensions
)

# REPRESENT TEXT WITH VECTORS
from pymilvus import model

# If the connection to https://huggingface.co/ fails, uncomment the following lines to use a mirror
# import os
# os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'

# This will download a small embedding model "paraphrase-albert-small-v2" (~50MB).
embedding_fn = model.DefaultEmbeddingFunction()
# Text strings to search from.
docs = [
  "Artificial intelligence was founded as an academic discipline in 1956.",
  "Alan Turing was the first person to conduct substantial research in AI.",
  "Born in Maida Vale, London, Turing was raised in southern England.",
]
vectors = embedding_fn.encode_documents(docs)
# The output vector has 768 dimensions, matching the collection that we just created.
print("Dim:", embedding_fn.dim, vectors[0].shape)  # Dim: 768 (768,)
# Each entity has id, vector representation, raw text, and a subject label that we use
# to demo metadata filtering later.
data = [
  {"id": i, "vector": vectors[i], "text": docs[i], "subject": "history"}
  for i in range(len(vectors))
]
print("Data has", len(data), "entities, each with fields: ", data[0].keys())
print("Vector dim:", len(data[0]["vector"]))

# INSERT DATA INTO THE COLLECTION
# Primary keys are taken from the "id" field of each row dict, so no separate ids argument is needed.
res = client.insert(collection_name="demo_collection", data=data)
print(f'''
  Inserted {res['insert_count']} entities into the collection.
  {res}
''')

# SEARCH FOR SIMILAR VECTORS
query_vectors = embedding_fn.encode_queries(["Who is Alan Turing?"])
# If you don't have the embedding function you can use a fake vector to finish the demo:
# query_vectors = [ [ random.uniform(-1, 1) for _ in range(768) ] ]
res = client.search(
  collection_name="demo_collection",  # target collection
  data=query_vectors,  # query vectors
  limit=2,  # number of returned entities
  output_fields=["text", "subject"],  # specifies fields to be returned
)
# print(res)
print(f'''
  Vector search result: {res}
''')

# VECTOR SEARCH WITH METADATA FILTERING
# Insert more docs in another subject.
docs = [
  "Machine learning has been used for drug design.",
  "Computational synthesis with AI algorithms predicts molecular properties.",
  "DDR1 is involved in cancers and fibrosis.",
]
vectors = embedding_fn.encode_documents(docs)
data = [
  {"id": 3 + i, "vector": vectors[i], "text": docs[i], "subject": "biology"}
  for i in range(len(vectors))
]
client.insert(collection_name="demo_collection", data=data)

# This will exclude any text in the "history" subject even if it is close to the query vector.
res = client.search(
  collection_name="demo_collection",
  data=embedding_fn.encode_queries(["tell me AI related information"]),
  filter="subject == 'biology'",
  limit=2,
  output_fields=["text", "subject"],
)
# print(res)
print(f'''
  Filtered vector search result: {res}
''')

# QUERY BY FILTER
res = client.query(
  collection_name="demo_collection",
  filter="subject == 'history'",
  output_fields=["text", "subject"],
)
print(f'''
  Query by filter: {[str(r['id']) + ' ' + r['text'] for r in res]}
''')
# QUERY BY ID
res = client.query(
  collection_name="demo_collection",
  ids=[0, 2],
  output_fields=["vector", "text", "subject"],
)
print(f'''
  Query by id: {[str(r['id']) + ' ' + r['text'] for r in res]}
''')

exit(0)  # stops here, so the delete demos below never run against the cloud collection
# DELETE ENTITIES
# Delete entities by primary key
res = client.delete(collection_name="demo_collection", ids=[0, 2])
print(f'''
  Deleted entities: {res}
''')
# Delete entities by a filter expression
res = client.delete(
  collection_name="demo_collection",
  filter="subject == 'biology'",
)
print(f'''
  Deleted entities: {res}
''')
python3 milvus-cloud.py

Other Vector Databases: