API Tutorial

In this tutorial, we will learn how to use the API to interact with the CHEESE platform. We will cover the following topics: - How to retrieve molecules from available databases - Search and batch search - Computing CHEESE Embeddings - Computation of similarities

Prerequisites

Python
NumPy (optional): pip install numpy
RDKit (optional): pip install rdkit
Rich (optional): pip install rich

import requests
import json
from rdkit import Chem
from rdkit.Chem import Draw
import numpy as np
from rich.pretty import pprint

np.set_printoptions(precision=3)

Sanity Check

MY_URL = "http://cheese-database.ch.themama.ai:9002" # URL of the API, change it if you are using a different one
headers = {'accept': 'application/json'} # headers for the request, we want a JSON response

url = MY_URL + '/test'
response = requests.get(url, headers=headers)
pprint(response.json())

{'message': 'Health check successful !!'}

Random molecules

This endpoint returns a specified number of random molecules in SMILES format from a random selection of databases.

# Define the URL and headers
url = MY_URL + '/random_molecule'
params = {'n_mols': 5}

# Make the GET request
response = requests.get(url, headers=headers, params=params)
pprint(response.json())

[
│   'Cn1nnc(C[NH2+]C[C@H]2CCN(C(=O)[C@]34C[C@H]3COC4)C2)n1',
│   'Cc1ccc(NC(=O)c2ccc(N(Cc3ccccc3Cl)S(C)(=O)=O)cc2)cc1C',
│   'CN(C[C@@H]1CCC[N@H+](C)C1)S(=O)(=O)N1CCO[C@H](C#N)C1',
│   'C#CCCN1CC2(C1)CN(C(=O)C1=NC(=O)N(C)C1)CCO2',
│   'Cc1nccc(CNC(=O)C(=O)N[C@H](C)[C@@H]2C[N@@H+](C)CCO2)n1'
]

Draw.MolsToGridImage([Chem.MolFromSmiles(mol) for mol in response.json()], molsPerRow=5)

png

Available Databases

# Define the URL and headers
url = MY_URL + '/available_dbs'
headers = {'accept': 'application/json'}

# Make the GET request
response = requests.get(url, headers=headers)
pprint(response.json())

{'available_dbs': ['ENAMINE-REAL', 'ZINC15', 'CUSTOM_CLUSTERED', 'CUSTOM_IN_MEM']}

Search

Simple Search

params = {
    "search_input": "CC(=O)Oc1ccccc1C(=O)O", # SMILES of the query molecule
    "search_type": 'espsim_shape', # CHEESE Search type
    "n_neighbors": 5,
    "search_quality": "fast",
    "db_names": ["ENAMINE-REAL"],
}

response = requests.get(MY_URL + "/molsearch_simple", params=params)
pprint(response.json())

{
│   'smiles': [
│   │   'CC(=O)NC1=NC=CC=C1C(=O)O',
│   │   'CC(C)NC1=NC=CC=C1C(=O)O',
│   │   'COC(=O)OC1=CC=CC=C1C(=O)O',
│   │   'CC(C)OC1=CC=CC=C1C(=O)NO',
│   │   'CC(=O)NC1=CC=CC=C1C(C)=O'
│   ],
│   'id': ['Z2904267089', 'Z71176798', 'Z192948624', 'Z1776115990', 'PV-002547950599']
}

# Parse the JSON body once and reuse it; each response.json() call would
# otherwise decode the same payload again.
simple_result = response.json()
smiles = simple_result["smiles"]
ids = simple_result["id"]
# Draw the hits in a single row, labelling each molecule with its database ID.
Draw.MolsToGridImage([Chem.MolFromSmiles(mol) for mol in smiles], molsPerRow=5, legends=ids)

png

Advanced Search

import requests

params = {
    "search_input": "CC(=O)Oc1ccccc1C(=O)O",
    "search_type": 'espsim_shape',
    "n_neighbors": 5,
    "search_quality": "fast",
    "db_names": ["ZINC15"],
    "descriptors": True,
    "properties": True,
    "filter_molecules": False,
    "order_molecules": False
}

response = requests.get(MY_URL + "/molsearch", params=params)
pprint(response.json().keys())

dict_keys(['remarks', 'canonicalized_query', 'neighbors', 'query_properties', 'search_info'])

pprint(response.json(), max_depth=3)

{
│   'remarks': '',
│   'canonicalized_query': 'CC(=O)Oc1ccccc1C(=O)O',
│   'neighbors': [
│   │   {
│   │   │   'smiles': 'CC(=O)Oc1ccccc1C(=O)[O-]',
│   │   │   'zinc_id': 'ZINC15 : 53',
│   │   │   'embedding_similarity': 0.7071046233177185,
│   │   │   'properties': {...}
│   │   },
│   │   {
│   │   │   'smiles': 'CC(=O)Oc1ccccc1C(N)=O',
│   │   │   'zinc_id': 'ZINC15 : 404835',
│   │   │   'embedding_similarity': 0.6960273385047913,
│   │   │   'properties': {...}
│   │   },
│   │   {
│   │   │   'smiles': 'CC(=O)Oc1ccccc1C(N)=S',
│   │   │   'zinc_id': 'ZINC15 : 72284420',
│   │   │   'embedding_similarity': 0.6937514841556549,
│   │   │   'properties': {...}
│   │   },
│   │   {
│   │   │   'smiles': 'CC(=O)Nc1ccncc1C(=O)[O-]',
│   │   │   'zinc_id': 'ZINC15 : 3163078',
│   │   │   'embedding_similarity': 0.6923034489154816,
│   │   │   'properties': {...}
│   │   },
│   │   {
│   │   │   'smiles': 'CC(=O)Oc1ccccc1C(C)=O',
│   │   │   'zinc_id': 'ZINC15 : 137705',
│   │   │   'embedding_similarity': 0.6920740604400635,
│   │   │   'properties': {...}
│   │   }
│   ],
│   'query_properties': {
│   │   'smiles': 'CC(=O)Oc1ccccc1C(=O)O',
│   │   'zinc_id': '',
│   │   'properties': {
│   │   │   'absorption': {...},
│   │   │   'excretion': {...},
│   │   │   'toxicity': {...},
│   │   │   'distribution': {...},
│   │   │   'metabolism': {...},
│   │   │   'basics': {...}
│   │   }
│   },
│   'search_info': {
│   │   'query_embedding_time': 0.2988882064819336,
│   │   'search_time': 0.3333892822265625,
│   │   'filter_time': 0.003624439239501953,
│   │   'sorting_time': 0.0015327930450439453,
│   │   'property_prediction_time': 0.0522768497467041,
│   │   'total_time': 0.6897115707397461
│   }
}

A single result molecule, returned as JSON with its properties and descriptors, looks like this:

pprint(response.json()["neighbors"][0])

{
│   'smiles': 'CC(=O)Oc1ccccc1C(=O)[O-]',
│   'zinc_id': 'ZINC15 : 53',
│   'embedding_similarity': 0.7071046233177185,
│   'properties': {
│   │   'absorption': {
│   │   │   'caco2_wang': -4.235,
│   │   │   'lipophilicity_astrazeneca': -0.414,
│   │   │   'solubility_aqsoldb': -2.315,
│   │   │   'bioavailability_ma': 0.908,
│   │   │   'hia_hou': 0.989,
│   │   │   'pgp_broccatelli': 0.0,
│   │   │   'clogp': -0.0246
│   │   },
│   │   'excretion': {
│   │   │   'clearance_hepatocyte_az': 86.346,
│   │   │   'clearance_microsome_az': 55.043,
│   │   │   'half_life_obach': 0.43
│   │   },
│   │   'toxicity': {'ld50_zhu': 1.81, 'ames': 0.003, 'dili': 0.021, 'herg': 0.0},
│   │   'distribution': {'ppbr_az': 45.823, 'vdss_lombardo': 0.155, 'bbb_martins': 0.985},
│   │   'metabolism': {'cyp2c9_veith': 0.046, 'cyp2d6_veith': 0.0, 'cyp3a4_veith': 0.0},
│   │   'basics': {
│   │   │   'molecular_weight': 179.03498,
│   │   │   'formal_charge': -1.0,
│   │   │   'heavy_atoms': 13.0,
│   │   │   'h_bond_acceptors': 4.0,
│   │   │   'h_bond_donor': 0.0,
│   │   │   'rotatable_bonds': 2.0,
│   │   │   'num_of_rings': 1.0,
│   │   │   'molar_refractivity': 42.0815,
│   │   │   'number_of_atoms': 13.0,
│   │   │   'topological_surface_area_mapping': 66.43
│   │   }
│   }
}

Advanced Search with filtering and ordering

import requests

params = {
    "search_input": "CC(=O)Oc1ccccc1C(=O)O",
    "search_type": 'espsim_shape',
    "n_neighbors": 10,
    "search_quality": "fast",
    "db_names": ["ZINC15"],
    "descriptors": True,
    "properties": True,
    "filter_molecules": True,
    "order_molecules": True,
    "filtering": ["PAINS", "Murcko scaffold hop"], # Using filtering to filter out PAINS and to get only scaffold hops
    "ordering": ["Morgan Tanimoto"] # Sorting results based on Morgan Fingerprints after doing cheese search 
}

def just_smiles(advanced_response):
    """Collect the ``"smiles"`` value of every entry under ``"neighbors"``.

    ``advanced_response`` is the parsed JSON of an advanced search; the
    returned list keeps the neighbors in the order they appear.
    """
    collected = []
    for neighbor in advanced_response["neighbors"]:
        collected.append(neighbor["smiles"])
    return collected

response = requests.get(MY_URL + "/molsearch", params=params)
pprint(just_smiles(response.json())) # note: we searched for 10 molecules but got only 6 back because of the filtering (it's better to increase the number of neighbors)

[
│   'CC(=O)Oc1ccccc1C(C)=O',
│   'CC(=O)Oc1ccccc1C(N)=O',
│   'CC(=O)Oc1ccccc1C(=O)[O-]',
│   'CC(=O)Oc1ccccc1C(N)=S',
│   'CC(=O)Nc1ccccc1C(=O)[O-]',
│   'NC(=O)Nc1ccccc1C(=O)[O-]'
]

smiles = just_smiles(response.json())
Draw.MolsToGridImage([Chem.MolFromSmiles(mol) for mol in smiles], molsPerRow=3)

png

Search Array

Searching a list of molecules

import requests

params = {
    "search_input": ["CC1=CN(C)N=C1", "CNC1=CC=CC=C1", "CCN1C=CN=C1"],
    "search_type": 'espsim_shape',
    "n_neighbors": 5,
    "search_quality": "fast",
    "db_names": "ZINC15",
    "descriptors": False,
    "properties": False,
    "filter_molecules": False
}

response = requests.get(MY_URL + "/molsearch_array", params=params, headers=headers)
pprint(response.json(), max_depth=3)

{
│   'CC1=CN(C)N=C1': {
│   │   'CC1=CN(C)N=C1': {'remarks': '', 'canonicalized_query': 'Cc1cnn(C)c1', 'neighbors': [...]},
│   │   'search_info': {
│   │   │   'query_embedding_time': 0.2705063819885254,
│   │   │   'search_time': 0.09556221961975098,
│   │   │   'filter_time': 0.0029592514038085938,
│   │   │   'sorting_time': 0.010590314865112305,
│   │   │   'property_prediction_time': 0,
│   │   │   'total_time': 0.37961816787719727
│   │   }
│   },
│   'CNC1=CC=CC=C1': {
│   │   'CNC1=CC=CC=C1': {'remarks': '', 'canonicalized_query': 'CNc1ccccc1', 'neighbors': [...]},
│   │   'search_info': {
│   │   │   'query_embedding_time': 0.31961727142333984,
│   │   │   'search_time': 0.0561220645904541,
│   │   │   'filter_time': 0.002988100051879883,
│   │   │   'sorting_time': 0.008991718292236328,
│   │   │   'property_prediction_time': 0,
│   │   │   'total_time': 0.38771915435791016
│   │   }
│   },
│   'CCN1C=CN=C1': {
│   │   'CCN1C=CN=C1': {'remarks': '', 'canonicalized_query': 'CCn1ccnc1', 'neighbors': [...]},
│   │   'search_info': {
│   │   │   'query_embedding_time': 0.38388919830322266,
│   │   │   'search_time': 0.04931282997131348,
│   │   │   'filter_time': 0.002831697463989258,
│   │   │   'sorting_time': 0.008421182632446289,
│   │   │   'property_prediction_time': 0,
│   │   │   'total_time': 0.4444549083709717
│   │   }
│   }
}

Batch Search

import requests

params = {
    "search_input": [
        "CC(=O)NC1=NC=CC=C1C(=O)O",
        "CC(C)NC1=NC=CC=C1C(=O)O",
        "COC(=O)OC1=CC=CC=C1C(=O)O",
        "CC(C)OC1=CC=CC=C1C(=O)NO",
        "CC(=O)NC1=CC=CC=C1C(C)=O",
    ],
    "search_type": "espsim_shape",
    "n_neighbors": 10,
    "search_mode": "centroid",  # or "batch"
}

response = requests.get(MY_URL + "/batch_search", params=params, headers=headers)
# Parse the JSON body once and reuse it; each response.json() call would
# otherwise decode the same payload again.
batch_result = response.json()
smiles = batch_result["smiles"]
ids = batch_result["id"]
# Draw the hits five per row, labelling each molecule with its database ID.
Draw.MolsToGridImage(
    [Chem.MolFromSmiles(mol) for mol in smiles], molsPerRow=5, legends=ids
)

png

Embeddings

params = {
    "search_input": ["Fc1ccccc1", "Clc1ccccc1", "Brc1ccccc1"],
    "save_embs": False,
}

response = requests.get(MY_URL + "/embeddings", params=params, headers=headers)
pprint(response.json().keys())
pprint(np.array(response.json()["espsim_shape"]).shape)
pprint(np.array(response.json()["espsim_shape"][0])) # embedding of the first molecule

dict_keys(['morgan', 'espsim_electrostatic', 'espsim_shape', 'active_pairs'])

(3, 256)

array([-0.271,  0.028, -0.28 ,  0.089, -0.067, -0.005, -0.045, -0.024,
│   │   0.126, -0.162, -0.027, -0.163,  0.142,  0.006, -0.033, -0.069,
│      -0.111,  0.045,  0.01 ,  0.206,  0.006,  0.137,  0.056,  0.012,
│      -0.033, -0.123,  0.013, -0.008,  0.076,  0.116, -0.274, -0.067,
│   │   0.042, -0.113,  0.017, -0.023,  0.085, -0.029, -0.059,  0.078,
│   │   0.091, -0.045, -0.034,  0.053,  0.05 ,  0.119, -0.018,  0.108,
│   │   0.013, -0.015, -0.021,  0.02 , -0.086,  0.16 ,  0.149, -0.174,
│      -0.047,  0.241, -0.133,  0.02 , -0.265, -0.009, -0.043, -0.118,
│   │   0.112,  0.043,  0.049, -0.001,  0.042,  0.007, -0.031,  0.145,
│      -0.07 ,  0.048,  0.01 , -0.016,  0.039, -0.027, -0.202,  0.064,
│   │   0.044, -0.077,  0.005, -0.07 , -0.09 ,  0.276, -0.047,  0.189,
│   │   0.08 , -0.094,  0.075, -0.047,  0.142, -0.242, -0.117, -0.04 ,
│   │   0.177,  0.111,  0.156, -0.015, -0.032,  0.197, -0.003, -0.006,
│      -0.064, -0.045, -0.051,  0.071,  0.043,  0.048, -0.103,  0.036,
│      -0.078,  0.025,  0.139, -0.055, -0.014,  0.064, -0.125,  0.052,
│      -0.001,  0.046,  0.02 ,  0.041,  0.009,  0.025,  0.142, -0.322,
│   │   0.001,  0.046, -0.113,  0.143,  0.126,  0.045,  0.124, -0.016,
│      -0.098, -0.13 ,  0.23 ,  0.013, -0.228, -0.069, -0.088, -0.022,
│   │   0.025,  0.011, -0.131,  0.222, -0.007, -0.092,  0.023,  0.069,
│      -0.011, -0.042,  0.101, -0.056, -0.079,  0.152, -0.027, -0.012,
│   │   0.03 ,  0.147,  0.006, -0.117,  0.081, -0.115,  0.101,  0.121,
│   │   0.101, -0.072,  0.011, -0.002,  0.003, -0.192, -0.024,  0.135,
│      -0.027,  0.01 ,  0.047, -0.116, -0.058,  0.107,  0.119,  0.002,
│      -0.18 , -0.054,  0.003, -0.021,  0.031,  0.036, -0.038, -0.121,
│   │   0.145, -0.038, -0.015, -0.015,  0.028, -0.05 , -0.104,  0.146,
│      -0.093, -0.056, -0.149, -0.074,  0.126, -0.07 , -0.112,  0.001,
│      -0.098, -0.026, -0.007, -0.125, -0.003,  0.066, -0.081, -0.035,
│      -0.09 , -0.166, -0.026, -0.114, -0.19 , -0.07 ,  0.032,  0.05 ,
│      -0.015, -0.022, -0.128,  0.041, -0.179,  0.038, -0.161, -0.05 ,
│      -0.104,  0.176, -0.028, -0.117,  0.111, -0.145,  0.166,  0.226,
│      -0.062, -0.019,  0.039,  0.006,  0.056,  0.138, -0.072, -0.022,
│   │   0.052, -0.122,  0.157, -0.012, -0.048,  0.122, -0.02 , -0.022])

Embeddings of lots of molecules

# Read one SMILES string per line. The context manager closes the file handle
# as soon as the contents are loaded instead of leaving it open.
with open("chembl.smi", "r") as smiles_file:  # full chembl
    my_smiles = smiles_file.read().splitlines()
pprint(my_smiles[0:10])

[
│   'N#CCCN1N=C(c2ccc(OCc3ccccc3)cc2)OCC1=O',
│   'CC(C)C[C@H](NC(=O)c1cn(Cc2ccccc2)nn1)B(O)O',
│   'Cl.NCC(=O)CCC(=O)OCc1ccccc1',
│   'CC(C)[C@H](NC(=O)[C@H](C)N)C(=O)N1CCC[C@H]1C(=O)N(C1CCCCC1)[C@@H](C)C(=O)N[C@@H](Cc1ccc(O)cc1)C(=O)O',
│   'Nc1cccc(CP(=O)(O)CC(CCC(=O)O)C(=O)O)c1',
│   'O=C(Cc1ccc(OCc2ccccc2)cc1)N[C@@H](CCS)Cc1c[nH]c2ccccc12',
│   'COc1ccc2c(c1)C(=O)CC(CCN1CCC3(CC1)NCNC3=O)C2',
│   'COC(=O)c1cc(C(O)CN2CCN(c3ccccc3OC)CC2)ccc1OC',
│   'O=C(O)CCCOc1ccccc1-c1cc2cc(C(=O)NC(c3ccccc3)c3ccccc3)ccc2o1',
│   'N/C(=C\\C(=O)c1ccc(Cl)cc1)C(=O)O'
]

params = {
    "search_input": my_smiles,
    "save_embs": True,
    "search_type": "all",
    "dest": "/data/my_embeddings" # save into a directory (faster than sending embeddings in JSON); it is created if it doesn't exist
}

response = requests.get(MY_URL + "/embeddings", params=params, headers=headers)
pprint(response.json(), max_length=5)

{'message': 'Success ! You can find computed embeddings in : /data/my_embeddings'}

Centroid Embeddings

This API call retrieves embeddings of database cluster centroids.

import requests

params = {
    "db_name": "ZINC15",
    "search_type": 'espsim_shape',
    "centroid_mols": False, # whether to output representative molecules for the clusters
    "save_embs": True, 
    "dest": "/data/my_centroid_embeddings" # save into a directory (faster than sending embeddings in JSON); it is created if it doesn't exist
}

response = requests.get(MY_URL + "/centroid_embeddings", params=params, headers=headers)

import numpy as np
np.array(response.json()['centroid_embeddings_db_ZINC15_st_espsim_shape']).shape

(26403, 256)

Similarity

Pairwise similarity

params = {
    "smiles1": "Fc1ccccc1",
    "smiles2": "Clc1ccccc1",
    "similarity_metric": "all",
    "distance_type": "euclidean"
}

response = requests.get(MY_URL + "/similarity", params=params, headers=headers)
pprint(response.json())

{
│   'morgan': 1.3493338152007586,
│   'espsim_electrostatic': 0.7924910655564171,
│   'espsim_shape': 0.3191955858476541,
│   'active_pairs': 0.7519632147430073
}

params = {
    "smiles1": "Fc1ccccc1",
    "smiles2": "Clc1ccccc1",
    "similarity_metric": "all",
    "distance_type": "cosine" # cosine is from 0 to 1
}

response = requests.get(MY_URL + "/similarity", params=params, headers=headers)
pprint(response.json())

{
│   'morgan': 0.443562888451121,
│   'espsim_electrostatic': 0.21610830917440194,
│   'espsim_shape': 0.019051098635778363,
│   'active_pairs': 0.060692261872934083
}

Similarity Matrix

params = {
    "smiles": [
        "CC(=O)NC1=NC=CC=C1C(=O)O",
        "CC(C)NC1=NC=CC=C1C(=O)O",
        "COC(=O)OC1=CC=CC=C1C(=O)O",
        "CC(C)OC1=CC=CC=C1C(=O)NO",
        "CC(=O)NC1=CC=CC=C1C(C)=O",
    ],
    "similarity_metric": "espsim_shape",
    "distance_type": "cosine",
}

response = requests.get(MY_URL + "/similarity_matrix", params=params, headers=headers)
pprint(np.array(response.json()[params["similarity_metric"]]))

array([[1.110e-16, 4.405e-02, 7.192e-02, 1.027e-01, 3.978e-02],
│      [4.405e-02, 1.110e-16, 9.728e-02, 6.456e-02, 7.657e-02],
│      [7.192e-02, 9.728e-02, 0.000e+00, 9.438e-02, 9.444e-02],
│      [1.027e-01, 6.456e-02, 9.438e-02, 0.000e+00, 9.449e-02],
│      [3.978e-02, 7.657e-02, 9.444e-02, 9.449e-02, 0.000e+00]])

Visualisation

The visualisation command works the same way as the embeddings command, but returns 2D coordinates intended for visualisation.

params = {
    "search_input": ["Fc1ccccc1", "Clc1ccccc1", "Brc1ccccc1"],
    "search_type": "espsim_electrostatic",
    "visualisation_method": "umap"
}

response = requests.get(MY_URL + "/visualise", params=params, headers=headers)
pprint(np.array(response.json()["espsim_electrostatic"]))

array([[8.325, 9.227],
│      [8.393, 9.108],
│      [8.269, 9.257]])

# Read one SMILES string per line. The context manager closes the file handle
# as soon as the contents are loaded instead of leaving it open.
with open("my_dataset.smi", "r") as smiles_file:  # mix of databases
    my_smiles = smiles_file.read().splitlines()
params = {
    "search_input": my_smiles,
    "search_type": "espsim_electrostatic",
    "visualisation_method": "umap", # UMAP or PCA
    "save_coordinates": True,
    "dest": "/data/my_umap_coordinates"
}

response = requests.get(MY_URL + "/visualise", params=params, headers=headers)
pprint(response.json())

After loading UMAP embeddings, it looks like this.