# [IVFPQ + HNSW for Billion-scale Similarity Search](https://medium.com/@peggy1502/ivfpq-hnsw-for-billion-scale-similarity-search-89ff2f89d90e)

## *To read the full article on **the best indexing approach for billion-sized vector datasets**, click the link on the title above.*

#### This notebook generates some of the plots shown in the article mentioned above.

- **Running this notebook may take several hours on a single GPU machine.**
- **You may get slightly different values running the same codes in this notebook given the stochastic nature of the algorithm.**
- **If you're not able to view the interactive plots in this notebook, copy the URL of this notebook and open it in [nbviewer](https://nbviewer.org/).**

In [1]:
import numpy as np 
import pandas as pd
import os
import time
import pprint

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [1]:
%%time
! conda install -c pytorch/label/nightly -y faiss-gpu

Collecting package metadata (current_repodata.json): - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / done
Solving environment: \ | 

In [2]:
import faiss
from faiss.contrib import datasets

print(f"Faiss version: {faiss.__version__}")
print(f"GPUs: {faiss.get_num_gpus()}")

Faiss version: 1.7.0
GPUs: 1


# Section 1: Exploring HNSW

In [3]:
d = 8              # Dimension (length) of vectors.
M = 4              # Number of connections that would be made for each new vertex during HNSW construction.
nvector = 10       # Number of database vectors.

ds = datasets.SyntheticDataset(d, 0, nvector, 0)
xb = ds.get_database()

index = faiss.IndexHNSWFlat(d, M)
index.add(xb)

print(f"Number of records added into index : {index.ntotal}")
print(f"Default value for efConstruction   : {index.hnsw.efConstruction}")
print(f"Default value for efSearch         : {index.hnsw.efSearch}")

Number of records added into index : 10
Default value for efConstruction   : 40
Default value for efSearch         : 16


In [4]:
faiss.write_index(index, "test.index")       # Write index to file    
file_size = os.path.getsize("test.index")    # Get file size    
os.remove("test.index")                      # Delete saved index
print(f"Index size: %.3f bytes. \n" % file_size)

Index size: 1142.000 bytes. 



In [5]:
# To retrieve the original vectors.

for i in range(nvector):
    print(f"\n=== Vector at index {i} ========================================")
    print(index.reconstruct(i))


[-0.77021146  0.6766764  -0.98828435 -0.5946992   0.59806705 -0.07889447
  0.30440184  0.8733962 ]

[-0.86879206 -0.99689794 -0.95869625 -0.4085238   0.03261118 -0.16727902
 -0.46708187  0.5575034 ]

[ 0.03351549 -0.9945     -0.16137859 -0.137286    0.5340788   0.00778999
  0.99198925 -0.3297846 ]

[ 0.7873657  -0.18297368  0.91163546 -0.97056043  0.220104   -0.23087963
  0.38412806  0.8158843 ]

[-0.14258705  0.5982093   0.6935446  -0.32969132 -0.5689549  -0.06604872
  0.9895372   0.94295764]

[ 0.12434206  0.8910519  -0.9787737  -0.73430943  0.1108873  -0.00713387
  0.0364638   0.08474799]

[-0.5736761   0.4103292   0.9911055   0.47089875 -0.92912364 -0.17750227
 -0.42922872  0.33162764]

[-0.04735673 -0.9384091  -0.16030562 -0.08618837 -0.83368516  0.09884957
  0.8705721   0.7892136 ]

[ 0.77821916  0.94104356 -0.9964823   0.29739013  0.62030303  0.34507278
  0.8658368   0.9786864 ]

[ 0.56988734 -0.50927377 -0.82829607 -0.38108215 -0.3271086   0.08033864
  0.87360823  0.42268318]


In [6]:
print(f"In this example, the HNSW entry point is vertex {index.hnsw.entry_point}")
print("The bottom level is : level 0")
print("The top level is    : level", index.hnsw.max_level)

In this example, the HNSW entry point is vertex 1
The bottom level is : level 0
The top level is    : level 1


In [7]:
levels = faiss.vector_to_array(index.hnsw.levels)  
print("The maximum layer (ℓ) assigned to each vertex:\n", levels.tolist())

The maximum layer (ℓ) assigned to each vertex:
 [2, 2, 1, 1, 1, 1, 1, 2, 1, 1]


In [8]:
print("The maximum layer (ℓ) in which a vertex is present, and the corresponding vertex count:")
unique, counts = np.unique(levels, return_counts=True)
pprint.pprint(np.column_stack((unique, counts)))

The maximum layer (ℓ) in which a vertex is present, and the corresponding vertex count:
array([[1, 7],
       [2, 3]])


In [9]:
for i in range(1, levels.max()+1):
    vertices = np.where(levels == i)    
    print(f"List of vertices with ℓ={i} : {vertices} \n")  

List of vertices with ℓ=1 : (array([2, 3, 4, 5, 6, 8, 9]),) 

List of vertices with ℓ=2 : (array([0, 1, 7]),) 



## Getting the link structure for a vertex

In [10]:
def vector_to_array(v): 
    """ make a vector visible as a numpy array (without copying data)"""
    return faiss.rev_swig_ptr(v.data(), v.size())

def get_hnsw_links(hnsw, vno): 
    """ get link structure for vertex vno """
    
    # make arrays visible from Python
    levels = vector_to_array(hnsw.levels)
    cum_nneighbor_per_level = vector_to_array(hnsw.cum_nneighbor_per_level)
    offsets = vector_to_array(hnsw.offsets)
    neighbors = vector_to_array(hnsw.neighbors)
    
    # all neighbors of vno
    neigh_vno = neighbors[offsets[vno] : offsets[vno + 1]]
    
    # break down per level 
    nlevel = levels[vno]
    return [
        neigh_vno[cum_nneighbor_per_level[l] : cum_nneighbor_per_level[l + 1]]
        for l in range(nlevel)
    ]                 

In [11]:
# neighbors[offsets[i]:offsets[i+1]] is the list of neighbors of vector i for all levels. 
# This is where all storage goes.
print("neighbors:")
faiss.vector_to_array(index.hnsw.neighbors)

neighbors:


array([ 1,  7,  2,  6,  5,  8,  3,  4,  1,  7, -1, -1,  0,  7,  2,  6,  5,
        8,  3, -1,  0,  7, -1, -1,  0,  1,  7,  6,  5,  8,  3,  9,  9,  4,
       -1, -1, -1, -1, -1, -1,  0,  6,  7,  3, -1, -1, -1, -1,  6,  7,  2,
        1,  0,  8,  3,  9,  2,  0,  1,  7,  5,  8,  3,  4,  1,  4,  9, -1,
       -1, -1, -1, -1,  0,  1, -1, -1,  6,  1,  7,  2,  0,  5,  3, -1,  3,
        5,  2,  7, -1, -1, -1, -1], dtype=int32)

In [12]:
# offsets[i] is the offset in the neighbors array where vector i is stored.
# Size of offsets is (ntotal + 1).
print("offsets:")
faiss.vector_to_array(index.hnsw.offsets)

offsets:


array([ 0, 12, 24, 32, 40, 48, 56, 64, 76, 84, 92], dtype=uint64)

In [13]:
# Number of neighbors stored per layer (cumulative), should not be changed after first add.
print("cum_nneighbor_per_level:")
faiss.vector_to_array(index.hnsw.cum_nneighbor_per_level) 

cum_nneighbor_per_level:


array([ 0,  8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64],
      dtype=int32)

In [14]:
for i in range(nvector):
    # Get connections for that vector at particular index
    links = get_hnsw_links(index.hnsw, i)
    print("\n=== Connections for vertex", i, "========================================")
    pprint.pprint(links)


[array([1, 7, 2, 6, 5, 8, 3, 4], dtype=int32),
 array([ 1,  7, -1, -1], dtype=int32)]

[array([ 0,  7,  2,  6,  5,  8,  3, -1], dtype=int32),
 array([ 0,  7, -1, -1], dtype=int32)]

[array([0, 1, 7, 6, 5, 8, 3, 9], dtype=int32)]

[array([ 9,  4, -1, -1, -1, -1, -1, -1], dtype=int32)]

[array([ 0,  6,  7,  3, -1, -1, -1, -1], dtype=int32)]

[array([6, 7, 2, 1, 0, 8, 3, 9], dtype=int32)]

[array([2, 0, 1, 7, 5, 8, 3, 4], dtype=int32)]

[array([ 1,  4,  9, -1, -1, -1, -1, -1], dtype=int32),
 array([ 0,  1, -1, -1], dtype=int32)]

[array([ 6,  1,  7,  2,  0,  5,  3, -1], dtype=int32)]

[array([ 3,  5,  2,  7, -1, -1, -1, -1], dtype=int32)]


In [15]:
del index

# Section 2: Distribution of ℓ from a random dataset of 500,000 vectors

- ℓ is the maximum layer in which a vertex is present.
- It has an exponentially decaying probability distribution.

In [16]:
d = 128              # Dimension (length) of vectors.
M = 32               # Number of connections that would be made for each new vertex during HNSW construction.

efConstruction = 32  # The depth of exploration at add time (number of candidate neighbors to explore during construction).
efSearch = 32        # The depth of exploration during vector search.

np.random.seed(1234)             
xb = np.random.random((500000, d)).astype('float32')

index = faiss.IndexHNSWFlat(d, M)
index.hnsw.efConstruction = efConstruction 
index.hnsw.efSearch = efSearch

In [17]:
%%time
index.add(xb)

CPU times: user 23min 55s, sys: 1.4 s, total: 23min 56s
Wall time: 12min 29s


In [18]:
levels = faiss.vector_to_array(index.hnsw.levels)
np.bincount(levels)   # Ignore the first zero on the bincount array.

array([     0, 484359,  15162,    461,     17,      1])

In [19]:
print("The maximum layer (ℓ) in which a vertex is present, and the corresponding vertex count:")
unique, counts = np.unique(levels, return_counts=True)
np.column_stack((unique, counts))

The maximum layer (ℓ) in which a vertex is present, and the corresponding vertex count:


array([[     1, 484359],
       [     2,  15162],
       [     3,    461],
       [     4,     17],
       [     5,      1]])

In [20]:
fig = px.bar(x=unique, y=counts, height=480, width=500, text_auto=',d', template="simple_white") #, text_auto=True, 
fig.update_traces(textfont_size=14, textposition="outside", textfont_color="blue", width=1)
fig.update_layout(yaxis_title_text="vertex count", yaxis_range=[0, 530000], xaxis_tickprefix="ℓ=", xaxis_title_text="") # xaxis_title_text="ℓ", 
fig.update_xaxes(ticks="")
fig.show()
fig.write_html("levels.html")

In [21]:
del index

# Section 3: Helper Functions

In [22]:
def train_index(fac_string, index, gpu=True):       

    if gpu:
        res = faiss.StandardGpuResources()           # use a single GPU        
        index = faiss.index_cpu_to_gpu(res, 0, index)

    t0 = time.time()
    index.train(xt)
    train_time = time.time() - t0
    print(fac_string, "=> Training done in %.3f s. \n" % train_time)
    return index, gpu, train_time

In [23]:
def add_index(fac_string, index, efConstruction=0):
        
    if ("HNSW" in fac_string):
        if efConstruction > 0:
            print(f"{fac_string} (efConstruction={efConstruction})")
        
        if ("IVF" in fac_string):
            if efConstruction > 0:
                faiss.GpuParameterSpace().set_index_parameter(index, "efConstruction", efConstruction)                
            else:               
                efConstruction = 40            # Default value from faiss
        else:
            if efConstruction > 0:
                index.hnsw.efConstruction = efConstruction
            else:
                efConstruction = index.hnsw.efConstruction
        
    i = 0
    start = 0
    end = start + batch_size
    t0 = time.time()
    
    while start < ds.nb:   
        if end > ds.nb:
            end = ds.nb
        index.add(xb[start:end])
        if (i % batch_print_interval == 0):
            print(f"    Adding records [{start} : {end}] => ntotal: {index.ntotal}")
        start, end = end, end + batch_size
        i += 1

    index_time = time.time() - t0    
    print(fac_string, "=> Indexing done in %.3f s" % index_time, " (ntotal: ", index.ntotal, ") \n")
    return index_time

In [24]:
def save_n_get_filesize(fac_string, index, gpu=True):  
    index_name = fac_string + '.index'         # Set index name
    if gpu: 
        index = faiss.index_gpu_to_cpu(index)  
    faiss.write_index(index, index_name)       # Write index to file    
    file_size = os.path.getsize(index_name)    # Get file size    
#     os.remove(index_name)                    # Delete saved index
    file_size = file_size * 1e-6               # Convert bytes to MB
    print(f"{fac_string} => Index size: %.3f MB. \n" % file_size)
    return file_size 

In [25]:
def search_index(fac_string, index, nprobe=0, efSearch=0):
    if "IVF" in fac_string:          
        index.nprobe = nprobe                   
                  
    if "HNSW" in fac_string:
        if "IVF" in fac_string:
            if efSearch > 0:
                faiss.GpuParameterSpace().set_index_parameter(index, "efSearch", efSearch)                
            else:                
                efSearch = 16     # Default value from faiss
        else:
            if efSearch > 0:
                index.hnsw.efSearch = efSearch
            else:
                efSearch = index.hnsw.efSearch
        
    t0 = time.time()
    D, I = index.search(xq, k)

    rank = 1
    recall_1 = (I[:, :rank] == gt[:, :1]).sum() / ds.nq
#     print(f"Recall@{rank} = {recall_1} (nprobe = {nprobe})")
    
    rank = 3
    recall_3 = (I[:, :rank] == gt[:, :1]).sum() / ds.nq    
    
    rank = 5
    recall_5 = (I[:, :rank] == gt[:, :1]).sum() / ds.nq     
    
    print(f"(nprobe={nprobe}) (efSearch={efSearch}) Recall@1={recall_1}, Recall@3={recall_3}, Recall@5={recall_5}")

    search_time = (time.time() - t0) * 1000 / ds.nq     # ms per query  
    search_time_all = (time.time() - t0) * 1000    
    print(fac_string, "=> Searching done in %.3f ms." % search_time_all, "\n")
    
    return search_time, recall_1, recall_3, recall_5

In [26]:
def append_results(df, fac_string, nprobe, nsegment, recall_1, recall_3, recall_5, train_time, index_time, search_time, index_size):
    
    nprobe2 = 0 if "IVF" not in fac_string else nprobe
    nsegment2 = 0 if "PQ" not in fac_string else nsegment
    
    hnsw_loc = fac_string.find('HNSW')             # Find start location of HNSW.  
    if hnsw_loc >= 0:
        last_loc = fac_string.rfind(',')           # Find last location of comma.
        if last_loc < 0:
            last_loc = fac_string.rfind('_')       # Find last location of underscore.        
        M = fac_string[hnsw_loc+4 : last_loc]
    else:
        M = 0
    
    df = df.append({'index': fac_string,
                    'M': M,
                    'nprobe': nprobe2,
                    'nsegment': nsegment2,
                    'recall@1': recall_1,
                    'recall@3': recall_3,
                    'recall@5': recall_5,
                    'train_time':train_time,
                    'index_time': index_time,
                    'search_time': search_time,
                    'index_size': index_size
                    }, ignore_index=True)  
    
    df.to_csv('df.csv', sep='|', index=False)
    return df

In [27]:
def append_results_HNSW(dfHNSW, fac_string, efConstruction, efSearch, recall_1, recall_3, recall_5, train_time, index_time, search_time, index_size):
    
    hnsw_loc = fac_string.find('HNSW')             # Find start location of HNSW.  
    if hnsw_loc >= 0:
        last_loc = fac_string.rfind(',')           # Find last location of comma.
        if last_loc < 0:
            last_loc = fac_string.rfind('_')       # Find last location of underscore.        
        M = fac_string[hnsw_loc+4 : last_loc]
    else:
        M = 0
    
    dfHNSW = dfHNSW.append({'index': fac_string,
                    'M': M,
                    'efConstruction': efConstruction,
                    'efSearch': efSearch,
                    'recall@1': recall_1,
                    'recall@3': recall_3,
                    'recall@5': recall_5,
                    'train_time':train_time,
                    'index_time': index_time,
                    'search_time': search_time,
                    'index_size': index_size
                    }, ignore_index=True)  
    
    dfHNSW.to_csv('dfHNSW.csv', sep='|', index=False)
    return dfHNSW

# Section 3a: Getting Data for HNSW (for different `M`, `efConstruction`, `efSearch`)

In [28]:
dfHNSW = pd.DataFrame({'index': [],
                   'M': [],
                   'efConstruction': [],
                   'efSearch': [],
                   'recall@1': [],
                   'recall@3': [],
                   'recall@5': [],
                   'train_time':[],
                   'index_time': [],
                   'search_time': [],
                   'index_size': []
})

In [29]:
k = 5                        # Number of nearest neighbor records to return during search.
d = 128                      # Dimension (length) of vectors.
nbase = 3 * 1000000          # Number of database vectors.
nquery = 1000                # Number of query vectors.
batch_size = 32768           # Batch size used to add index.
batch_print_interval = 25    # When adding records to index, print status only after every x batches.  

In [30]:
%%time

ds = datasets.SyntheticDataset(d, 0, nbase, nquery)
xb = ds.get_database()
xq = ds.get_queries()
gt = ds.get_groundtruth(k) 

gpu = False            # Training is not required.
train_time = None

for M in [4,16,32,64]:  
    fac_string = "HNSW" + str(M) + ",Flat"    

    for efConstruction in [4,16,40,76]:
        
        t00 = time.time()
        
        index = faiss.index_factory(d, fac_string)   
        index_time = add_index(fac_string, index, efConstruction)
        index_size = save_n_get_filesize(fac_string, index, gpu)        

        for efSearch in [4,16,40,76]:
            search_time, recall_1, recall_3, recall_5 = search_index(fac_string, index, 0, efSearch)
            dfHNSW = append_results_HNSW(dfHNSW, fac_string, efConstruction, efSearch, recall_1, recall_3, recall_5, 
                                         train_time, index_time, search_time, index_size)         

        del index
        efCont_time = (time.time() - t00) / 60        
        print(f"{fac_string} (efConstruction={efConstruction})", "=> Total processing time: %.1f min." % efCont_time) 
        print("===================================================================\n") 

HNSW4,Flat (efConstruction=4)
    Adding records [0 : 32768] => ntotal: 32768
    Adding records [819200 : 851968] => ntotal: 851968
    Adding records [1638400 : 1671168] => ntotal: 1671168
    Adding records [2457600 : 2490368] => ntotal: 2490368
HNSW4,Flat => Indexing done in 49.407 s  (ntotal:  3000000 ) 

HNSW4,Flat => Index size: 1684.009 MB. 

(nprobe=0) (efSearch=4) Recall@1=0.021, Recall@3=0.021, Recall@5=0.021
HNSW4,Flat => Searching done in 111.859 ms. 

(nprobe=0) (efSearch=16) Recall@1=0.036, Recall@3=0.036, Recall@5=0.036
HNSW4,Flat => Searching done in 77.904 ms. 

(nprobe=0) (efSearch=40) Recall@1=0.043, Recall@3=0.043, Recall@5=0.043
HNSW4,Flat => Searching done in 121.467 ms. 

(nprobe=0) (efSearch=76) Recall@1=0.051, Recall@3=0.051, Recall@5=0.051
HNSW4,Flat => Searching done in 164.162 ms. 

HNSW4,Flat (efConstruction=4) => Total processing time: 0.9 min.

HNSW4,Flat (efConstruction=16)
    Adding records [0 : 32768] => ntotal: 32768
    Adding records [819200 : 851

# Section 3b: Getting data for the variations of HNSW indexes (with/without IVF and/or PQ)

In [31]:
df = pd.DataFrame({'index': [],
                   'M': [],
                   'nprobe': [],
                   'nsegment': [],
                   'recall@1': [],
                   'recall@3': [],
                   'recall@5': [],
                   'train_time':[],
                   'index_time': [],
                   'search_time': [],
                   'index_size': []
})

In [32]:
k = 5                        # Number of nearest neighbor records to return during search.
d = 128                      # Dimension (length) of vectors.
nlist = 65536                # Number of inverted lists (or number of centroids) for IVF indexes.
ntraining = 39 * nlist       # Number of training vectors (x*nlist, where x is between 30 and 256).
nbase = 3 * 1000000          # Number of database vectors.
nquery = 1000                # Number of query vectors.

batch_size = 32768           # Batch size used to add index.
batch_print_interval = 25    # When adding records to index, print status only after every x batches.  

M = 32                       # Number of connections that would be made for each new vertex during HNSW construction.
nsegment = 32                # Number of segments for product quantization.

# Indexes to contruct using the index_factory (e.g. "IVF65536_HNSW32,PQ32")
str1 = "IVF" + str(nlist) + "_HNSW" + str(M) + ",PQ" + str(nsegment)
str2 = "IVF" + str(nlist) + "_HNSW" + str(M) + ",Flat"
str3 = "HNSW" + str(M) + ",Flat"
str4 = "HNSW" + str(M) + "_PQ" + str(nsegment)

fac_strings = [str1, str2, str3, str4]          # To construct the indexes in this list using index_factory.
nprobes = [1, 8, 16, 64, 128, 256, 512, 1024]   # The list of nprobe values to use.

In [33]:
%%time

ds = datasets.SyntheticDataset(d, ntraining, 0, 0)
xt = ds.get_train()

ds = datasets.SyntheticDataset(d, 0, nbase, nquery)
xb = ds.get_database()
xq = ds.get_queries()
gt = ds.get_groundtruth(k) 

for fac_string in fac_strings:
    t00 = time.time()  
    
    index = faiss.index_factory(d, fac_string)
    
    # If need to get efConstruction & efSearch, downcast first before sending to GPU.    
    #--------------------------------------------------
    if ("IVF" in fac_string) and ("HNSW" in fac_string):        
        quantizer = faiss.downcast_index(index.quantizer)  # The quantizer is IndexHNSWFlat
        efConstruction = quantizer.hnsw.efConstruction
        efSearch = quantizer.hnsw.efSearch
    #--------------------------------------------------
    
    if ("IVF" in fac_string):   
        index, gpu, train_time = train_index(fac_string, index, gpu=True)   # Training is required, use GPU.
    elif ("PQ" in fac_string): 
        index, gpu, train_time = train_index(fac_string, index, gpu=False)  # Training is required, but GPU is not supported.
    else:                                                                   
        gpu = False                                                         # Training is not required.
        train_time = None
    
    index_time = add_index(fac_string, index)
    index_size = save_n_get_filesize(fac_string, index, gpu)
            
    if "IVF" in fac_string:
        for nprobe in nprobes:
            search_time, recall_1, recall_3, recall_5 = search_index(fac_string, index, nprobe)
            df = append_results(df, fac_string, nprobe, nsegment, recall_1, recall_3, recall_5, train_time, index_time, search_time, index_size)
    else:
        search_time, recall_1, recall_3, recall_5 = search_index(fac_string, index, 0)
        df = append_results(df, fac_string, 0, nsegment, recall_1, recall_3, recall_5, train_time, index_time, search_time, index_size)    
    
    del index
    total_time = (time.time() - t00) / 60
    print(fac_string, "=> Total processing time %.1f min." % total_time) 
    print("===================================================================\n") 

IVF65536_HNSW32,PQ32 => Training done in 144.404 s. 

    Adding records [0 : 32768] => ntotal: 32768
    Adding records [819200 : 851968] => ntotal: 851968
    Adding records [1638400 : 1671168] => ntotal: 1671168
    Adding records [2457600 : 2490368] => ntotal: 2490368
IVF65536_HNSW32,PQ32 => Indexing done in 23.839 s  (ntotal:  3000000 ) 

IVF65536_HNSW32,PQ32 => Index size: 154.210 MB. 

(nprobe=1) (efSearch=16) Recall@1=0.153, Recall@3=0.177, Recall@5=0.179
IVF65536_HNSW32,PQ32 => Searching done in 6.782 ms. 

(nprobe=8) (efSearch=16) Recall@1=0.349, Recall@3=0.437, Recall@5=0.45
IVF65536_HNSW32,PQ32 => Searching done in 6.422 ms. 

(nprobe=16) (efSearch=16) Recall@1=0.417, Recall@3=0.536, Recall@5=0.555
IVF65536_HNSW32,PQ32 => Searching done in 8.229 ms. 

(nprobe=64) (efSearch=16) Recall@1=0.521, Recall@3=0.712, Recall@5=0.746
IVF65536_HNSW32,PQ32 => Searching done in 18.435 ms. 

(nprobe=128) (efSearch=16) Recall@1=0.547, Recall@3=0.765, Recall@5=0.806
IVF65536_HNSW32,PQ32 => 

# Section 4a: Plots to view the effect of `M`, `efConstruction`, `efSearch`

In [34]:
# try:
#     print(dfHNSW.head(1))
# except NameError:
#     dfHNSW = pd.read_csv("../input/dfhnsw/dfHNSW-25.csv", delimiter="|")

In [35]:
dfHNSW

Unnamed: 0,index,M,efConstruction,efSearch,recall@1,recall@3,recall@5,train_time,index_time,search_time,index_size
0,"HNSW4,Flat",4,4.0,4.0,0.021,0.021,0.021,,49.407325,0.111857,1684.00915
1,"HNSW4,Flat",4,4.0,16.0,0.036,0.036,0.036,,49.407325,0.077902,1684.00915
2,"HNSW4,Flat",4,4.0,40.0,0.043,0.043,0.043,,49.407325,0.121465,1684.00915
3,"HNSW4,Flat",4,4.0,76.0,0.051,0.051,0.051,,49.407325,0.164160,1684.00915
4,"HNSW4,Flat",4,16.0,4.0,0.059,0.059,0.059,,164.926371,0.055861,1684.00915
...,...,...,...,...,...,...,...,...,...,...,...
59,"HNSW64,Flat",64,40.0,76.0,0.979,0.979,0.979,,1113.725895,0.561940,3120.19375
60,"HNSW64,Flat",64,76.0,4.0,0.738,0.738,0.738,,2090.143053,0.125361,3120.19375
61,"HNSW64,Flat",64,76.0,16.0,0.918,0.918,0.918,,2090.143053,0.189668,3120.19375
62,"HNSW64,Flat",64,76.0,40.0,0.970,0.970,0.970,,2090.143053,0.379517,3120.19375


In [36]:
dfHNSW.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64 entries, 0 to 63
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   index           64 non-null     object 
 1   M               64 non-null     object 
 2   efConstruction  64 non-null     float64
 3   efSearch        64 non-null     float64
 4   recall@1        64 non-null     float64
 5   recall@3        64 non-null     float64
 6   recall@5        64 non-null     float64
 7   train_time      0 non-null      object 
 8   index_time      64 non-null     float64
 9   search_time     64 non-null     float64
 10  index_size      64 non-null     float64
dtypes: float64(8), object(3)
memory usage: 5.6+ KB


In [37]:
dfHNSW["M"] = dfHNSW["M"].fillna(0.0).astype(int)
dfHNSW["efConstruction"] = dfHNSW["efConstruction"].fillna(0.0).astype(int)
dfHNSW["efSearch"] = dfHNSW["efSearch"].fillna(0.0).astype(int)

In [38]:
fig = make_subplots(rows=1, cols=2, horizontal_spacing = 0.2)
           
fig.add_trace(go.Line(x=dfHNSW['M'], y=dfHNSW['index_size'], line=dict(color='#DC3912', width=3)),
              row=1, col=1
             )         
fig.add_trace(go.Line(x=dfHNSW[dfHNSW["efConstruction"]==40]["M"].values, 
                      y=dfHNSW[dfHNSW["efConstruction"]==40]["index_time"].values,                       
                      line=dict(color='rgb(82, 188, 163)', width=3)),
              row=1, col=2
             )   
fig.update_xaxes(title_text="M", row=1, col=1, ticks="", linecolor='darkgrey')  # range=[0, 65], tick0=0.0, dtick=10, 
fig.update_xaxes(title_text="M", row=1, col=2, ticks="", linecolor='darkgrey')
fig.update_yaxes(title_text="index size (MB)", row=1, col=1, ticks="", linecolor='darkgrey')
fig.update_yaxes(title_text="construction time (s)", row=1, col=2, ticks="", linecolor='darkgrey')
fig.update_layout(height=350, width=700, template="simple_white", showlegend=False, title_text="")#, title_x=0.5)
# fig.update_traces(mode='lines+markers', )
fig.show()
fig.write_html("Hflat_M_indexsize_indextime.html")


plotly.graph_objs.Line is deprecated.
Please replace it with one of the following more specific types
  - plotly.graph_objs.scatter.Line
  - plotly.graph_objs.layout.shape.Line
  - etc.




In [39]:
fig = px.line(dfHNSW, x="M", y="index_size", markers=True, height=350, width=600, template="simple_white", color="efConstruction",
              color_discrete_sequence=px.colors.qualitative.Vivid,  
              labels=dict(index_size="index size (MB)",))
fig.update_xaxes(linecolor='darkgrey')
fig.update_yaxes(linecolor='darkgrey')
fig.show()
fig.write_html("Hflat_M_indexsize.html")

In [40]:
fig = px.line(dfHNSW, x="M", y="index_time", markers=True, height=350, width=600, color="efConstruction", template="simple_white",
              color_discrete_sequence=px.colors.qualitative.Vivid,  
              labels=dict(index_time="construction time (s)",))
fig.update_xaxes(linecolor='darkgrey')
fig.update_yaxes(linecolor='darkgrey')
fig.show()
fig.write_html("Hflat_M_indextime.html")

In [41]:
fig = px.line(dfHNSW.query("efConstruction==76"), x="M", y="search_time", markers=True, height=350, width=600, color="efSearch", template="simple_white",
              color_discrete_sequence=px.colors.qualitative.Vivid,  
              labels=dict(search_time="search time (ms/query)",))
fig.update_xaxes(linecolor='darkgrey')
fig.update_yaxes(linecolor='darkgrey')
fig.show()
fig.write_html("Hflat_M_searchtime.html")

In [42]:
# colors_custom = ['#FECB52','#FF9900','#DC3912','#990099'] # yellow, orange, red, purple
colors_custom = ['#EECA3B','#FF9900','#DC3912','#990099'] # yellow, orange, red, purple
colors_blue = ['#90AD1C','#0099C6','#316395','#1616A7']   # Shades of blue

In [43]:
fig = px.line(dfHNSW, x="efSearch", y="search_time", color="M", height=390, width=750, markers=True,
              template="simple_white", color_discrete_sequence=colors_custom, #color_discrete_sequence=px.colors.qualitative.D3,  
              facet_col="efConstruction", facet_col_spacing=0.01,
              labels=dict(search_time="search time (ms/query)",)
             )
fig.update_layout(legend=dict(yanchor="top", y=0.99, xanchor="left", x=0.03, borderwidth=1, traceorder="reversed")) # orientation="h"
fig.update_xaxes(linecolor='darkgrey')
fig.update_yaxes(linecolor='darkgrey', showgrid=True, gridwidth=1, gridcolor='LightGrey', griddash="dot", ticks="")
fig.show()
fig.write_html("Hflat_search_time_efSearch_efConst.html")

In [44]:
fig = px.line(dfHNSW, x="efConstruction", y="search_time", color="M", height=390, width=750, markers=True,
              template="simple_white", color_discrete_sequence=colors_custom, #color_discrete_sequence=px.colors.qualitative.D3,  
              facet_col="efSearch", facet_col_spacing=0.01,
              labels=dict(search_time="search time (ms/query)",)
             )
fig.update_layout(legend=dict(yanchor="top", y=0.99, xanchor="left", x=0.03, borderwidth=1, traceorder="reversed")) # orientation="h"
fig.update_xaxes(linecolor='darkgrey')
fig.update_yaxes(linecolor='darkgrey', showgrid=True, gridwidth=1, gridcolor='LightGrey', griddash="dot", ticks="")
fig.show()
fig.write_html("Hflat_search_time_efConst_efSearch.html")

In [45]:
fig = px.line(dfHNSW, x="efConstruction", y="recall@3", color="M", height=380, width=750, markers=True,
              template="simple_white", color_discrete_sequence=colors_custom, #color_discrete_sequence=px.colors.qualitative.D3,  
              facet_col="efSearch", facet_col_spacing=0.01,  
              )
fig.update_layout(legend=dict(yanchor="top", y=0.48, xanchor="right", x=0.99, borderwidth=1, traceorder="reversed")) # orientation="h"
fig.update_xaxes(linecolor='darkgrey')
fig.update_yaxes(linecolor='darkgrey', showgrid=True, gridwidth=1, gridcolor='LightGrey', griddash="dot", ticks="")
fig.update_yaxes(row=1, col=1, title_text="1-recall@3")
fig.show()
fig.write_html("Hflat_recall_efConst_efSearch.html")

In [46]:
fig = px.line(dfHNSW, x="efSearch", y="recall@3", color="M", height=380, width=750, markers=True, 
              template="simple_white", color_discrete_sequence=colors_custom, #color_discrete_sequence=px.colors.qualitative.D3,  
              facet_col="efConstruction", facet_col_spacing=0.01,                
              )
fig.update_layout(legend=dict(yanchor="top", y=0.48, xanchor="right", x=0.99, borderwidth=1, traceorder="reversed")) # orientation="h"
fig.update_xaxes(linecolor='darkgrey')
fig.update_yaxes(linecolor='darkgrey', showgrid=True, gridwidth=1, gridcolor='LightGrey', griddash="dot", ticks="")
fig.update_yaxes(row=1, col=1, title_text="1-recall@3")
fig.show()
fig.write_html("Hflat_recall_efSearch_efConst.html")

In [47]:
fig = px.line(dfHNSW, x="efConstruction", y="index_time", color="M", height=350, width=600, markers=True,
              template="simple_white", color_discrete_sequence=colors_custom, #color_discrete_sequence=px.colors.qualitative.D3,  
              labels=dict(index_time="construction time (s)",)
              )
fig.update_layout(legend=dict(yanchor="top", y=0.98, xanchor="left", x=0.05, borderwidth=1, traceorder="reversed")) # orientation="h"
fig.update_xaxes(linecolor='darkgrey')
fig.update_yaxes(linecolor='darkgrey', showgrid=True, gridwidth=1, gridcolor='LightGrey', griddash="dot", ticks="")
fig.show()
fig.write_html("Hflat_indextime_efConst.html")

In [48]:
fig = px.line(dfHNSW.query("efConstruction==40"), x="efSearch", y="search_time", color="M", height=350, width=600, markers=True,
              template="simple_white", color_discrete_sequence=colors_custom, #color_discrete_sequence=px.colors.qualitative.D3,  
              labels=dict(search_time="search time (ms/query)",), title="efConstruction = 40"
              )
fig.update_layout(legend=dict(yanchor="top", y=1.05, xanchor="left", x=0.05, borderwidth=1, traceorder="reversed")) # orientation="h"
fig.update_xaxes(linecolor='darkgrey')
fig.update_yaxes(linecolor='darkgrey', showgrid=True, gridwidth=1, gridcolor='LightGrey', griddash="dot", ticks="")
fig.show()
fig.write_html("Hflat_searchtime_efSearch_efConst40.html")

In [49]:
dfHNSW["M"] = dfHNSW["M"].fillna(0.0).astype(str)

In [50]:
fig = px.scatter(dfHNSW, x="search_time", y="recall@3", color="M", height=700, width=700, size="index_size", size_max=15,
                 template="simple_white", color_discrete_sequence=colors_custom, #color_discrete_sequence=px.colors.qualitative.D3,  
                 facet_row="efSearch", facet_row_spacing=0.02,  
                 facet_col="efConstruction", facet_col_spacing=0.02#, title="Bubble Size corresponds to Index Size"    
                )
fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='#F0F0F0', linecolor='darkgrey', ticks="")
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='#F0F0F0', linecolor='darkgrey', ticks="", range=[0, 1.1], tick0=0.2, dtick=0.2)
fig.update_xaxes(title_text="")
fig.update_yaxes(title_text="")
fig.update_xaxes(row=1, col=2, title_standoff=10, title_text="<B>search time (ms/query)")                # title_font_family="Arial"
fig.update_yaxes(col=1, row=1, title_standoff=5, title_text="<B>                             1-recall@3")
fig.update_layout(legend=dict(borderwidth=1, traceorder="reversed", yanchor="top", y=0.76, xanchor="left", x=1.03, ))
fig.update_layout(title=dict(text="<B>Bubble Size corresponds to Index Size (MB)", yanchor="top", y=0.98, xanchor="center", x=0.5, 
                             font_family="Arial", font_color="RebeccaPurple"))
fig.show()
fig.write_html("Hflat_recall_4x4_indexsize.html")

In [51]:
fig = px.scatter(dfHNSW, x="search_time", y="recall@3", color="M", height=700, width=700, size="index_time", size_max=15, 
                 template="simple_white", color_discrete_sequence=colors_custom, #color_discrete_sequence=px.colors.qualitative.D3,  
                 facet_row="efSearch", facet_row_spacing=0.02,  
                 facet_col="efConstruction", facet_col_spacing=0.02     
                )
fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='#F0F0F0', linecolor='darkgrey', ticks="")
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='#F0F0F0', linecolor='darkgrey', ticks="", range=[0, 1.1], tick0=0.2, dtick=0.2)
fig.update_xaxes(title_text="")
fig.update_yaxes(title_text="")
fig.update_xaxes(row=1, col=2, title_standoff=10, title_text="<B>search time (ms/query)")                # title_font_family="Arial"
fig.update_yaxes(col=1, row=1, title_standoff=5, title_text="<B>                             1-recall@3")
fig.update_layout(legend=dict(borderwidth=1, traceorder="reversed", yanchor="top", y=0.76, xanchor="left", x=1.03, ))
fig.update_layout(title=dict(text="<B>Bubble Size corresponds to Construction Time (s)", yanchor="top", y=0.98, xanchor="center", x=0.5, 
                             font_family="Arial", font_color="RebeccaPurple"))
fig.show()
fig.write_html("Hflat_recall_4x4_indextime.html")

# Section 4b: Plots comparing the various HNSW indexes (with/without IVF and/or PQ)

In [3]:
# try:
#     print(df.head(1))
# except NameError:
#     df = pd.read_csv("df-45.csv", delimiter="|")

In [4]:
df

Unnamed: 0,index,M,nprobe,nsegment,recall@1,recall@3,recall@5,train_time,index_time,search_time,index_size
0,"IVF65536_HNSW32,PQ32",32,1.0,32.0,0.153,0.177,0.179,144.403778,23.839422,0.00678,154.209972
1,"IVF65536_HNSW32,PQ32",32,8.0,32.0,0.349,0.437,0.45,144.403778,23.839422,0.006421,154.209972
2,"IVF65536_HNSW32,PQ32",32,16.0,32.0,0.417,0.536,0.555,144.403778,23.839422,0.008228,154.209972
3,"IVF65536_HNSW32,PQ32",32,64.0,32.0,0.521,0.712,0.746,144.403778,23.839422,0.018434,154.209972
4,"IVF65536_HNSW32,PQ32",32,128.0,32.0,0.547,0.765,0.806,144.403778,23.839422,0.033185,154.209972
5,"IVF65536_HNSW32,PQ32",32,256.0,32.0,0.563,0.804,0.856,144.403778,23.839422,0.074435,154.209972
6,"IVF65536_HNSW32,PQ32",32,512.0,32.0,0.578,0.831,0.891,144.403778,23.839422,0.195568,154.209972
7,"IVF65536_HNSW32,PQ32",32,1024.0,32.0,0.588,0.848,0.913,144.403778,23.839422,0.502939,154.209972
8,"IVF65536_HNSW32,Flat",32,1.0,0.0,0.181,0.181,0.181,84.892702,18.270943,0.009719,1594.078859
9,"IVF65536_HNSW32,Flat",32,8.0,0.0,0.455,0.455,0.455,84.892702,18.270943,0.014852,1594.078859


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18 entries, 0 to 17
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   index        18 non-null     object 
 1   M            18 non-null     int64  
 2   nprobe       18 non-null     float64
 3   nsegment     18 non-null     float64
 4   recall@1     18 non-null     float64
 5   recall@3     18 non-null     float64
 6   recall@5     18 non-null     float64
 7   train_time   17 non-null     float64
 8   index_time   18 non-null     float64
 9   search_time  18 non-null     float64
 10  index_size   18 non-null     float64
dtypes: float64(9), int64(1), object(1)
memory usage: 1.7+ KB


In [6]:
df["M"] = df["M"].fillna(0.0).astype(int)
df["nprobe"] = df["nprobe"].fillna(0.0).astype(int)
df["nsegment"] = df["nsegment"].fillna(0.0).astype(int)

In [7]:
colors_gorb = ['#2CA02C', '#FF7F0E', '#D62728', '#1F77B4']   # green, orange, red, blue

In [8]:
fig = px.line(df.query("nprobe>0"), x="nprobe", y="search_time", markers=True, color="index", height=350, width=550,
             labels=dict(search_time="search time (ms/query)",), template="simple_white")
fig.update_layout(legend_title="", legend=dict(yanchor="top", y=1, xanchor="left", x=0.3, borderwidth=1)) # traceorder="reversed", 

# Set unique marker style for different lines (https://plotly.com/python/marker-style/)
line_styles = ["solid", "solid"]            # ['solid', 'dot', 'dash', 'longdash', 'dashdot', 'longdashdot']
symbols = ["circle", "circle"]  
sizes = [8, 8]

for i, fdata in enumerate(fig.data):
    fdata.marker.symbol = symbols[i]
    fdata.marker.size = sizes[i]
    fdata.marker.color = colors_gorb[i]
    fdata.line.color = colors_gorb[i]
    fdata.line.dash = line_styles[i]    

# Getting the search time for HNSW Flat and HNSW PQ to draw the horizontal lines for reference.
hnsw_flat = df[df["index"]=="HNSW32,Flat"]["search_time"].values[0]
hnsw_pq = df[df["index"]=="HNSW32_PQ32"]["search_time"].values[0]

diff = abs(hnsw_flat - hnsw_pq)
if diff < 0.023:
    if hnsw_flat > hnsw_pq:
        hnsw_flat_position = "top right"
        hnsw_pq_position = "bottom right"
    else:
        hnsw_flat_position = "bottom right"
        hnsw_pq_position = "top right"
else:
    hnsw_flat_position = "right"
    hnsw_pq_position = "right"

fig.add_hline(y=hnsw_flat, line_width=1, line_dash="dot", line_color="#1F77B4", opacity=1,
              annotation_text="<I>HNSW32,Flat</I>", annotation_position=hnsw_flat_position, annotation_font=dict(color="#1F77B4"))

fig.add_hline(y=hnsw_pq, line_width=1, line_dash="dot", line_color="#D62728", opacity=1,
              annotation_text="<I>HNSW32_PQ32</I>", annotation_position=hnsw_pq_position, annotation_font=dict(color="#D62728"))

fig.add_vline(x=128, line_width=3, line_dash="dot", line_color="grey", 
              annotation_text="nprobe=128", annotation_position="top", annotation_font=dict(color="darkgrey"))
fig.update_xaxes(linecolor='darkgrey')
fig.update_yaxes(linecolor='darkgrey')
fig.show()
fig.write_html("nprobe_search.html")

In [9]:
fig = px.line(df.query("nprobe>0"), x="nprobe", y="recall@3", color="index", markers=True, height=350, width=550, template="simple_white")
fig.update_layout(yaxis_title_text="1-recall@3",
                  legend_title="", legend=dict(yanchor="bottom", y=0.1, xanchor="right", x=0.9, borderwidth=1, traceorder="reversed")) # traceorder="reversed", 

# Set unique marker style for different lines (https://plotly.com/python/marker-style/)
line_styles = ["solid", "solid"]            # ['solid', 'dot', 'dash', 'longdash', 'dashdot', 'longdashdot']
symbols = ["circle", "circle"]  
sizes = [8, 8]

for i, fdata in enumerate(fig.data):
    fdata.marker.symbol = symbols[i]
    fdata.marker.size = sizes[i]
    fdata.marker.color = colors_gorb[i]
    fdata.line.color = colors_gorb[i]
    fdata.line.dash = line_styles[i]
        
fig.add_vline(x=128, line_width=3, line_dash="dot", line_color="grey", 
              annotation_text="nprobe=128", annotation_position="top", annotation_font=dict(color="darkgrey"))

fig.add_hline(y=df[df["index"]=="HNSW32,Flat"]["recall@3"].values[0], line_width=1, line_dash="dot", line_color="#1F77B4", opacity=1,
              annotation_text="<I>HNSW32,Flat</I>", annotation_position="right", annotation_font=dict(color="#1F77B4"))

fig.add_hline(y=df[df["index"]=="HNSW32_PQ32"]["recall@3"].values[0], line_width=1, line_dash="dot", line_color="#D62728", opacity=1,
              annotation_text="<I>HNSW32_PQ32</I>", annotation_position="right", annotation_font=dict(color="#D62728"))
fig.update_xaxes(linecolor='darkgrey')
fig.update_yaxes(linecolor='darkgrey')
fig.show()
fig.write_html("nprobe_recall_3.html")

In [10]:
df128 = df[(df["nprobe"]==128) | (df["index"]=="HNSW32,Flat") | (df["index"]=="HNSW32_PQ32")].copy()
df128

Unnamed: 0,index,M,nprobe,nsegment,recall@1,recall@3,recall@5,train_time,index_time,search_time,index_size
4,"IVF65536_HNSW32,PQ32",32,128,32,0.547,0.765,0.806,144.403778,23.839422,0.033185,154.209972
12,"IVF65536_HNSW32,Flat",32,128,0,0.84,0.84,0.84,84.892702,18.270943,0.055386,1594.078859
16,"HNSW32,Flat",32,0,0,0.85,0.85,0.85,,828.196047,0.132884,2352.424546
17,HNSW32_PQ32,32,0,32,0.629,0.789,0.81,62.063184,618.958291,0.115766,912.555659


In [11]:
fig = make_subplots(rows=1, cols=4, horizontal_spacing = 0.15)
colors = px.colors.qualitative.D3  

fig.add_trace(go.Bar(x=df128.loc[:,'index'], y=df128.loc[:,'search_time'], marker_color=colors_gorb, name="search time"),
              row=1, col=1
             )              
fig.add_trace(go.Bar(x=df128.loc[:,'index'], y=df128.loc[:,'recall@1'], marker_color=colors_gorb, name="recall@1"),
              row=1, col=2
             )  
fig.add_trace(go.Bar(x=df128.loc[:,'index'], y=df128.loc[:,'recall@3'], marker_color=colors_gorb, name="recall@3"),
              row=1, col=3
             ) 
fig.add_trace(go.Bar(x=df128.loc[:,'index'], y=df128.loc[:,'recall@5'], marker_color=colors_gorb, name="recall@5"),
              row=1, col=4
             ) 
fig.update_yaxes(title_text="search time (ms/query)", row=1, col=1)
fig.update_yaxes(title_text="1-recall@1", row=1, col=2)
fig.update_yaxes(title_text="1-recall@3", row=1, col=3)
fig.update_yaxes(title_text="1-recall@5", row=1, col=4)
fig.update_xaxes(ticks="", tickangle=30, linecolor='darkgrey')
fig.update_yaxes(linecolor='darkgrey')
fig.update_layout(height=400, width=900, template="simple_white", showlegend=False, title_text="For IVF-based indexes, nprobe=128")
fig.show()
fig.write_html("nprobe128_search_recalls.html")

In [12]:
fig = make_subplots(rows=1, cols=2, horizontal_spacing = 0.4)
colors = px.colors.qualitative.D3  

fig.add_trace(go.Bar(x=df128.loc[:,'index'], y=df128.loc[:,'search_time'], marker_color=colors_gorb, marker_opacity = 0.8),
              row=1, col=1
             )              
fig.add_trace(go.Bar(x=df128.loc[:,'index'], y=df128.loc[:,'recall@3'], marker_color=colors_gorb, marker_opacity = 0.8),
              row=1, col=2
             ) 
fig.update_yaxes(title_text="search time (ms/query)", row=1, col=1)
fig.update_yaxes(title_text="1-recall@3", row=1, col=2)
fig.update_xaxes(ticks="", tickangle=30, linecolor='darkgrey')
fig.update_yaxes(linecolor='darkgrey')

fig.update_layout(height=400, width=500, template="simple_white", showlegend=False, title_text="")
fig.update_layout(title=dict(text="<B><I>( For IVF-based indexes, nprobe=128 )", yanchor="top", y=0.85, xanchor="left", x=0.1, 
                             font_family="Arial", font_color="RebeccaPurple", font_size=13))
fig.show()
fig.write_html("nprobe128_search_recall3.html")

In [13]:
# Create figure with secondary y-axis.
fig = make_subplots(specs=[[{"secondary_y": True}]])

fig.add_trace(go.Scatter(x=df128.loc[:,'index'], y=df128.loc[:,'search_time'], name="search time (ms/query)"),    
              secondary_y=False
             )
fig.add_trace(go.Scatter(x=df128.loc[:,'index'], y=df128.loc[:,"recall@3"], name="1-recall@3"),    
              secondary_y=True
             )
fig.update_layout(height=400, width=700, template="simple_white", title_text="Search time vs Recall",                   
                  legend=dict(orientation="h", yanchor="bottom", y=1.0, xanchor="right", x=0.9, borderwidth=1)
                 )

# Set unique marker style for different lines (https://plotly.com/python/marker-style/)
colors = ['#E15F99', '#2E91E5']
symbols = ["diamond", "hexagram"]
sizes = [12, 12]

for i, fdata in enumerate(fig.data):
    fdata.marker.size = sizes[i]
    fdata.marker.color = colors[i]
    fdata.line.color = colors[i]  
    fdata.marker.symbol = symbols[i]
    
fig.update_xaxes(title_text="", tickangle=8, linecolor='darkgrey')
fig.update_yaxes(range=[0.00, 0.30], secondary_y=False, title_text="<b>search time (ms/query)</b>", color="#E15F99", linecolor='darkgrey')
fig.update_yaxes(range=[0, 1], secondary_y=True, title_text="<b>1-recall@3</b>", color="#2E91E5", linecolor='darkgrey')
fig.show()
fig.write_html("nprobe128_search_recall_dual_y.html")

In [31]:
fig = px.scatter(df128, x="search_time", y="recall@3", color="index", height=400, width=650,
                 template="simple_white", text="index", size="index_size", size_max=50)     

hnsw_flat = df128[df128["index"]=="HNSW32,Flat"]["search_time"].values[0]
hnsw_pq = df128[df128["index"]=="HNSW32_PQ32"]["search_time"].values[0]

if hnsw_flat <= hnsw_pq:
    x_limit = hnsw_pq + 0.045
else:
    x_limit = hnsw_flat + 0.045

positions = ["bottom right", "top center", "top center", "bottom center"]

for i, fdata in enumerate(fig.data):   
    fdata.textposition = positions[i]
    fdata.marker.opacity = 0.8
    fdata.marker.color = colors_gorb[i]    
    fdata.textfont.color = colors_gorb[i]   
    fdata.textfont.size = 13    
    fdata.textfont.family="Arial"
    
fig.update_traces(mode='markers+text')#, textposition="middle right") 
fig.update_traces(texttemplate="%{text}<br>(%{marker.size:,d} MB)")
fig.update_layout(xaxis_title_text="search time (ms/query)", yaxis_title_text="1-recall@3", yaxis_range=[0.5, 1], xaxis_range=[0, x_limit], showlegend=False)
fig.update_layout(title=dict(text="<B><I>( For IVF-based indexes, nprobe=128 )", yanchor="top", y=0.3, xanchor="left", x=0.15, 
                             font_family="Arial", font_color="RebeccaPurple", font_size=13))
fig.update_xaxes(linecolor='darkgrey')
fig.update_yaxes(linecolor='darkgrey')
fig.show()
fig.write_html("nprobe128_search_recall_bubble0.html")

In [32]:
fig = px.scatter(df128, x="search_time", y="recall@3", color="index", height=400, width=650,
                 template="simple_white", text="index", size="index_size", size_max=50)     

hnsw_flat = df128[df128["index"]=="HNSW32,Flat"]["search_time"].values[0]
hnsw_pq = df128[df128["index"]=="HNSW32_PQ32"]["search_time"].values[0]

if hnsw_flat <= hnsw_pq:
    x_limit = hnsw_pq + 0.045
else:
    x_limit = hnsw_flat + 0.045

positions = ["bottom right", "top center", "top center", "bottom right"]

for i, fdata in enumerate(fig.data):   
    fdata.textposition = positions[i]
    fdata.marker.opacity = 0.8
    fdata.marker.color = colors_gorb[i]    
    fdata.textfont.color = colors_gorb[i]   
    fdata.textfont.size = 13 
    fdata.textfont.family="Arial"
    
fig.update_traces(mode='markers+text')#, textposition="middle right") 
fig.update_traces(texttemplate="%{text}<br>(%{marker.size:,d} MB)")
fig.update_layout(xaxis_title_text="search time (ms/query)", yaxis_title_text="1-recall@3", yaxis_range=[0.5, 1], xaxis_range=[0, x_limit], showlegend=False)
fig.update_layout(title=dict(text="<B><I>( For IVF-based indexes, nprobe=128 )", yanchor="top", y=0.3, xanchor="left", x=0.15, 
                             font_family="Arial", font_color="RebeccaPurple", font_size=13))
fig.update_xaxes(linecolor='darkgrey')
fig.update_yaxes(linecolor='darkgrey')
fig.show()
fig.write_html("nprobe128_search_recall_bubble.html")

In [33]:
df256 = df[(df["nprobe"]==256) | (df["index"]=="HNSW32,Flat") | (df["index"]=="HNSW32_PQ32")].copy()
df256

Unnamed: 0,index,M,nprobe,nsegment,recall@1,recall@3,recall@5,train_time,index_time,search_time,index_size
5,"IVF65536_HNSW32,PQ32",32,256,32,0.563,0.804,0.856,144.403778,23.839422,0.074435,154.209972
13,"IVF65536_HNSW32,Flat",32,256,0,0.904,0.904,0.904,84.892702,18.270943,0.08898,1594.078859
16,"HNSW32,Flat",32,0,0,0.85,0.85,0.85,,828.196047,0.132884,2352.424546
17,HNSW32_PQ32,32,0,32,0.629,0.789,0.81,62.063184,618.958291,0.115766,912.555659


In [34]:
fig = px.scatter(df256, x="search_time", y="recall@3", color="index", height=400, width=650,
                 template="simple_white", text="index", size="index_size", size_max=50)     

positions = ["bottom center", "top center", "top center", "bottom right"]

for i, fdata in enumerate(fig.data):   
    fdata.textposition = positions[i]
    fdata.marker.opacity = 0.8
    fdata.marker.color = colors_gorb[i]    
    fdata.textfont.color = colors_gorb[i]   
    fdata.textfont.size = 13  
    fdata.textfont.family="Arial"
    
fig.update_traces(mode='markers+text')#, textposition="middle right") 
fig.update_traces(texttemplate="%{text}<br>(%{marker.size:,d} MB)")
fig.update_layout(xaxis_title_text="search time (ms/query)", yaxis_title_text="1-recall@3", yaxis_range=[0.5, 1.06], xaxis_range=[0, x_limit], showlegend=False)
fig.update_layout(title=dict(text="<B><I>( For IVF-based indexes, nprobe=256 )", yanchor="top", y=0.3, xanchor="left", x=0.15, 
                             font_family="Arial", font_color="RebeccaPurple", font_size=13))
fig.update_xaxes(linecolor='darkgrey')
fig.update_yaxes(linecolor='darkgrey')
fig.show()
fig.write_html("nprobe256_search_recall_bubble.html")

In [35]:
df512 = df[(df["nprobe"]==512) | (df["index"]=="HNSW32,Flat") | (df["index"]=="HNSW32_PQ32")].copy()
df512

Unnamed: 0,index,M,nprobe,nsegment,recall@1,recall@3,recall@5,train_time,index_time,search_time,index_size
6,"IVF65536_HNSW32,PQ32",32,512,32,0.578,0.831,0.891,144.403778,23.839422,0.195568,154.209972
14,"IVF65536_HNSW32,Flat",32,512,0,0.944,0.944,0.944,84.892702,18.270943,0.14525,1594.078859
16,"HNSW32,Flat",32,0,0,0.85,0.85,0.85,,828.196047,0.132884,2352.424546
17,HNSW32_PQ32,32,0,32,0.629,0.789,0.81,62.063184,618.958291,0.115766,912.555659


In [36]:
df128["nprobe"] = 128
df256["nprobe"] = 256
df512["nprobe"] = 512

df3 = pd.concat([df128, df256, df512], ignore_index=True)
df3

Unnamed: 0,index,M,nprobe,nsegment,recall@1,recall@3,recall@5,train_time,index_time,search_time,index_size
0,"IVF65536_HNSW32,PQ32",32,128,32,0.547,0.765,0.806,144.403778,23.839422,0.033185,154.209972
1,"IVF65536_HNSW32,Flat",32,128,0,0.84,0.84,0.84,84.892702,18.270943,0.055386,1594.078859
2,"HNSW32,Flat",32,128,0,0.85,0.85,0.85,,828.196047,0.132884,2352.424546
3,HNSW32_PQ32,32,128,32,0.629,0.789,0.81,62.063184,618.958291,0.115766,912.555659
4,"IVF65536_HNSW32,PQ32",32,256,32,0.563,0.804,0.856,144.403778,23.839422,0.074435,154.209972
5,"IVF65536_HNSW32,Flat",32,256,0,0.904,0.904,0.904,84.892702,18.270943,0.08898,1594.078859
6,"HNSW32,Flat",32,256,0,0.85,0.85,0.85,,828.196047,0.132884,2352.424546
7,HNSW32_PQ32,32,256,32,0.629,0.789,0.81,62.063184,618.958291,0.115766,912.555659
8,"IVF65536_HNSW32,PQ32",32,512,32,0.578,0.831,0.891,144.403778,23.839422,0.195568,154.209972
9,"IVF65536_HNSW32,Flat",32,512,0,0.944,0.944,0.944,84.892702,18.270943,0.14525,1594.078859


In [37]:
fig = px.scatter(df3, x="search_time", y="recall@3", color="index", height=380, width=720, size="index_size", size_max=30, 
                 template="simple_white", color_discrete_sequence=colors_gorb, #color_discrete_sequence=px.colors.qualitative.D3,  
                 facet_col="nprobe", facet_col_spacing=0.03     
                )
fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='#F0F0F0', linecolor='darkgrey', ticks="")
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='#F0F0F0', linecolor='darkgrey', ticks="", range=[0.5, 1.02])
fig.update_xaxes(title_text="")
fig.update_xaxes(row=1, col=2, title_standoff=10, title_text="<B>search time (ms/query)")             # title_font_family="Arial"
fig.update_yaxes(col=1, row=1, title_standoff=5, title_text="<B>1-recall@3")
fig.update_layout(legend=dict(borderwidth=1, yanchor="top", y=0.39, xanchor="left", x=0.7, title="")) # traceorder="reversed", 
fig.update_layout(title=dict(text="<B>Bubble Size corresponds to Index Size (MB)", yanchor="top", y=0.98, xanchor="center", x=0.5, 
                             font_family="Arial", font_color="RebeccaPurple"))
fig.show()
fig.write_html("nprobe_x3_indexsize.html")

In [38]:
fig = px.bar(df128, x="index", y="index_size", color="index", height=350, width=450, #text_auto=',d',
             template="simple_white", 
#             color_discrete_sequence=px.colors.qualitative.Set1,  # To override the colors from template
            )
for i, fdata in enumerate(fig.data):    
    fdata.marker.color = colors_gorb[i]   
    fdata.marker.opacity = 0.8

fig.update_traces(textfont_size=12, textposition="outside", textfont_color="blue", texttemplate="%{value:,d} MB"  )
fig.update_layout(xaxis_title_text="", yaxis_title_text="", yaxis_range=[0, 2600], showlegend=False) # yaxis_title_text="index size (MB)", 
fig.update_xaxes(ticks="", tickangle=15, linecolor='darkgrey',)
fig.update_yaxes(ticks="", showticklabels=False, showline=False)
fig.show()
fig.write_html("nprobe128_index_memory.html")