Skip to content

API Tutorial

In this tutorial, we will learn how to use the API to interact with the CHEESE platform. We will cover the following topics: - How to retrieve molecules from available databases - Search and batch search - Computing CHEESE Embeddings - Computation of similarities

Prerequisites

  • Python
  • NumPy (optional): pip install numpy
  • RDKit (optional): pip install rdkit
  • Rich (optional): pip install rich
import requests
import json
from rdkit import Chem
from rdkit.Chem import Draw
import numpy as np
from rich.pretty import pprint

np.set_printoptions(precision=3)

Sanity Check

MY_URL = "http://cheese-database.ch.themama.ai:9002" # URL of the API, change it if you are using a different one
headers = {'accept': 'application/json'} # headers for the request, we want a JSON response
url = MY_URL + '/test'
response = requests.get(url, headers=headers)
pprint(response.json())
{'message': 'Health check successful !!'}

Random molecules

This endpoint returns a specified number of random molecules in SMILES format from a random selection of databases.

# Define the URL and headers
url = MY_URL + '/random_molecule'
params = {'n_mols': 5}

# Make the GET request
response = requests.get(url, headers=headers, params=params)
pprint(response.json())
[
'Cn1nnc(C[NH2+]C[C@H]2CCN(C(=O)[C@]34C[C@H]3COC4)C2)n1',
'Cc1ccc(NC(=O)c2ccc(N(Cc3ccccc3Cl)S(C)(=O)=O)cc2)cc1C',
'CN(C[C@@H]1CCC[N@H+](C)C1)S(=O)(=O)N1CCO[C@H](C#N)C1',
'C#CCCN1CC2(C1)CN(C(=O)C1=NC(=O)N(C)C1)CCO2',
'Cc1nccc(CNC(=O)C(=O)N[C@H](C)[C@@H]2C[N@@H+](C)CCO2)n1'
]
Draw.MolsToGridImage([Chem.MolFromSmiles(mol) for mol in response.json()], molsPerRow=5)

png

Available Databases

# Define the URL and headers
url = MY_URL + '/available_dbs'
headers = {'accept': 'application/json'}

# Make the GET request
response = requests.get(url, headers=headers)
pprint(response.json())
{'available_dbs': ['ENAMINE-REAL', 'ZINC15', 'CUSTOM_CLUSTERED', 'CUSTOM_IN_MEM']}
params = {
    "search_input": "CC(=O)Oc1ccccc1C(=O)O", # SMILES of the query molecule
    "search_type": 'espsim_shape', # CHEESE Search type
    "n_neighbors": 5,
    "search_quality": "fast",
    "db_names": ["ENAMINE-REAL"],
}

response = requests.get(MY_URL + "/molsearch_simple", params=params)
pprint(response.json())
{
'smiles': [
│   │   'CC(=O)NC1=NC=CC=C1C(=O)O',
│   │   'CC(C)NC1=NC=CC=C1C(=O)O',
│   │   'COC(=O)OC1=CC=CC=C1C(=O)O',
│   │   'CC(C)OC1=CC=CC=C1C(=O)NO',
│   │   'CC(=O)NC1=CC=CC=C1C(C)=O'
],
'id': ['Z2904267089', 'Z71176798', 'Z192948624', 'Z1776115990', 'PV-002547950599']
}
smiles = response.json()["smiles"]
ids = response.json()["id"]
Draw.MolsToGridImage([Chem.MolFromSmiles(mol) for mol in smiles], molsPerRow=5, legends=ids)

png

import requests

params = {
    "search_input": "CC(=O)Oc1ccccc1C(=O)O",
    "search_type": 'espsim_shape',
    "n_neighbors": 5,
    "search_quality": "fast",
    "db_names": ["ZINC15"],
    "descriptors": True,
    "properties": True,
    "filter_molecules": False,
    "order_molecules": False
}

response = requests.get(MY_URL + "/molsearch", params=params)
pprint(response.json().keys())
dict_keys(['remarks', 'canonicalized_query', 'neighbors', 'query_properties', 'search_info'])
pprint(response.json(), max_depth=3)
{
'remarks': '',
'canonicalized_query': 'CC(=O)Oc1ccccc1C(=O)O',
'neighbors': [
│   │   {
│   │   │   'smiles': 'CC(=O)Oc1ccccc1C(=O)[O-]',
│   │   │   'zinc_id': 'ZINC15 : 53',
│   │   │   'embedding_similarity': 0.7071046233177185,
│   │   │   'properties': {...}
│   │   },
│   │   {
│   │   │   'smiles': 'CC(=O)Oc1ccccc1C(N)=O',
│   │   │   'zinc_id': 'ZINC15 : 404835',
│   │   │   'embedding_similarity': 0.6960273385047913,
│   │   │   'properties': {...}
│   │   },
│   │   {
│   │   │   'smiles': 'CC(=O)Oc1ccccc1C(N)=S',
│   │   │   'zinc_id': 'ZINC15 : 72284420',
│   │   │   'embedding_similarity': 0.6937514841556549,
│   │   │   'properties': {...}
│   │   },
│   │   {
│   │   │   'smiles': 'CC(=O)Nc1ccncc1C(=O)[O-]',
│   │   │   'zinc_id': 'ZINC15 : 3163078',
│   │   │   'embedding_similarity': 0.6923034489154816,
│   │   │   'properties': {...}
│   │   },
│   │   {
│   │   │   'smiles': 'CC(=O)Oc1ccccc1C(C)=O',
│   │   │   'zinc_id': 'ZINC15 : 137705',
│   │   │   'embedding_similarity': 0.6920740604400635,
│   │   │   'properties': {...}
│   │   }
],
'query_properties': {
│   │   'smiles': 'CC(=O)Oc1ccccc1C(=O)O',
│   │   'zinc_id': '',
│   │   'properties': {
│   │   │   'absorption': {...},
│   │   │   'excretion': {...},
│   │   │   'toxicity': {...},
│   │   │   'distribution': {...},
│   │   │   'metabolism': {...},
│   │   │   'basics': {...}
│   │   }
},
'search_info': {
│   │   'query_embedding_time': 0.2988882064819336,
│   │   'search_time': 0.3333892822265625,
│   │   'filter_time': 0.003624439239501953,
│   │   'sorting_time': 0.0015327930450439453,
│   │   'property_prediction_time': 0.0522768497467041,
│   │   'total_time': 0.6897115707397461
}
}

The JSON for a single result molecule, including its properties and descriptors, looks like this:

pprint(response.json()["neighbors"][0])
{
'smiles': 'CC(=O)Oc1ccccc1C(=O)[O-]',
'zinc_id': 'ZINC15 : 53',
'embedding_similarity': 0.7071046233177185,
'properties': {
│   │   'absorption': {
│   │   │   'caco2_wang': -4.235,
│   │   │   'lipophilicity_astrazeneca': -0.414,
│   │   │   'solubility_aqsoldb': -2.315,
│   │   │   'bioavailability_ma': 0.908,
│   │   │   'hia_hou': 0.989,
│   │   │   'pgp_broccatelli': 0.0,
│   │   │   'clogp': -0.0246
│   │   },
│   │   'excretion': {
│   │   │   'clearance_hepatocyte_az': 86.346,
│   │   │   'clearance_microsome_az': 55.043,
│   │   │   'half_life_obach': 0.43
│   │   },
│   │   'toxicity': {'ld50_zhu': 1.81, 'ames': 0.003, 'dili': 0.021, 'herg': 0.0},
│   │   'distribution': {'ppbr_az': 45.823, 'vdss_lombardo': 0.155, 'bbb_martins': 0.985},
│   │   'metabolism': {'cyp2c9_veith': 0.046, 'cyp2d6_veith': 0.0, 'cyp3a4_veith': 0.0},
│   │   'basics': {
│   │   │   'molecular_weight': 179.03498,
│   │   │   'formal_charge': -1.0,
│   │   │   'heavy_atoms': 13.0,
│   │   │   'h_bond_acceptors': 4.0,
│   │   │   'h_bond_donor': 0.0,
│   │   │   'rotatable_bonds': 2.0,
│   │   │   'num_of_rings': 1.0,
│   │   │   'molar_refractivity': 42.0815,
│   │   │   'number_of_atoms': 13.0,
│   │   │   'topological_surface_area_mapping': 66.43
│   │   }
}
}

Advanced Search with filtering and ordering

import requests

params = {
    "search_input": "CC(=O)Oc1ccccc1C(=O)O",
    "search_type": 'espsim_shape',
    "n_neighbors": 10,
    "search_quality": "fast",
    "db_names": ["ZINC15"],
    "descriptors": True,
    "properties": True,
    "filter_molecules": True,
    "order_molecules": True,
    "filtering": ["PAINS", "Murcko scaffold hop"], # Using filtering to filter out PAINS and to get only scaffold hops
    "ordering": ["Morgan Tanimoto"] # Sorting results based on Morgan Fingerprints after doing cheese search 
}

def just_smiles(advanced_response):
    """Collect the SMILES string of every neighbor in a search response.

    `advanced_response` is the decoded JSON of a search call; it must have a
    "neighbors" entry holding an iterable of mappings, each with a "smiles"
    key. The strings are returned as a list, in the order the neighbors
    appear in the response.
    """
    collected = []
    for neighbor in advanced_response["neighbors"]:
        collected.append(neighbor["smiles"])
    return collected

response = requests.get(MY_URL + "/molsearch", params=params)
pprint(just_smiles(response.json())) # note: we searched for 10 molecules but got 6 because of the filtering (it's better to increase the number of neighbors)
[
'CC(=O)Oc1ccccc1C(C)=O',
'CC(=O)Oc1ccccc1C(N)=O',
'CC(=O)Oc1ccccc1C(=O)[O-]',
'CC(=O)Oc1ccccc1C(N)=S',
'CC(=O)Nc1ccccc1C(=O)[O-]',
'NC(=O)Nc1ccccc1C(=O)[O-]'
]
smiles = just_smiles(response.json())
Draw.MolsToGridImage([Chem.MolFromSmiles(mol) for mol in smiles], molsPerRow=3)

png

Search Array

Searching a list of molecules

import requests

params = {
    "search_input": ["CC1=CN(C)N=C1", "CNC1=CC=CC=C1", "CCN1C=CN=C1"],
    "search_type": 'espsim_shape',
    "n_neighbors": 5,
    "search_quality": "fast",
    "db_names": "ZINC15",
    "descriptors": False,
    "properties": False,
    "filter_molecules": False
}

response = requests.get(MY_URL + "/molsearch_array", params=params, headers=headers)
pprint(response.json(), max_depth=3)
{
'CC1=CN(C)N=C1': {
│   │   'CC1=CN(C)N=C1': {'remarks': '', 'canonicalized_query': 'Cc1cnn(C)c1', 'neighbors': [...]},
│   │   'search_info': {
│   │   │   'query_embedding_time': 0.2705063819885254,
│   │   │   'search_time': 0.09556221961975098,
│   │   │   'filter_time': 0.0029592514038085938,
│   │   │   'sorting_time': 0.010590314865112305,
│   │   │   'property_prediction_time': 0,
│   │   │   'total_time': 0.37961816787719727
│   │   }
},
'CNC1=CC=CC=C1': {
│   │   'CNC1=CC=CC=C1': {'remarks': '', 'canonicalized_query': 'CNc1ccccc1', 'neighbors': [...]},
│   │   'search_info': {
│   │   │   'query_embedding_time': 0.31961727142333984,
│   │   │   'search_time': 0.0561220645904541,
│   │   │   'filter_time': 0.002988100051879883,
│   │   │   'sorting_time': 0.008991718292236328,
│   │   │   'property_prediction_time': 0,
│   │   │   'total_time': 0.38771915435791016
│   │   }
},
'CCN1C=CN=C1': {
│   │   'CCN1C=CN=C1': {'remarks': '', 'canonicalized_query': 'CCn1ccnc1', 'neighbors': [...]},
│   │   'search_info': {
│   │   │   'query_embedding_time': 0.38388919830322266,
│   │   │   'search_time': 0.04931282997131348,
│   │   │   'filter_time': 0.002831697463989258,
│   │   │   'sorting_time': 0.008421182632446289,
│   │   │   'property_prediction_time': 0,
│   │   │   'total_time': 0.4444549083709717
│   │   }
}
}
import requests

params = {
    "search_input": [
        "CC(=O)NC1=NC=CC=C1C(=O)O",
        "CC(C)NC1=NC=CC=C1C(=O)O",
        "COC(=O)OC1=CC=CC=C1C(=O)O",
        "CC(C)OC1=CC=CC=C1C(=O)NO",
        "CC(=O)NC1=CC=CC=C1C(C)=O",
    ],
    "search_type": "espsim_shape",
    "n_neighbors": 10,
    "search_mode": "centroid",  # or "batch"
}

response = requests.get(MY_URL + "/batch_search", params=params, headers=headers)
smiles = response.json()["smiles"]
ids = response.json()["id"]
Draw.MolsToGridImage(
    [Chem.MolFromSmiles(mol) for mol in smiles], molsPerRow=5, legends=ids
)

png

Embeddings

params = {
    "search_input": ["Fc1ccccc1", "Clc1ccccc1", "Brc1ccccc1"],
    "save_embs": False,
}

response = requests.get(MY_URL + "/embeddings", params=params, headers=headers)
pprint(response.json().keys())
pprint(np.array(response.json()["espsim_shape"]).shape)
pprint(np.array(response.json()["espsim_shape"][0])) # embedding of the first molecule
dict_keys(['morgan', 'espsim_electrostatic', 'espsim_shape', 'active_pairs'])
(3, 256)
array([-0.271,  0.028, -0.28 ,  0.089, -0.067, -0.005, -0.045, -0.024,
│   │   0.126, -0.162, -0.027, -0.163,  0.142,  0.006, -0.033, -0.069,
-0.111,  0.045,  0.01 ,  0.206,  0.006,  0.137,  0.056,  0.012,
-0.033, -0.123,  0.013, -0.008,  0.076,  0.116, -0.274, -0.067,
│   │   0.042, -0.113,  0.017, -0.023,  0.085, -0.029, -0.059,  0.078,
│   │   0.091, -0.045, -0.034,  0.053,  0.05 ,  0.119, -0.018,  0.108,
│   │   0.013, -0.015, -0.021,  0.02 , -0.086,  0.16 ,  0.149, -0.174,
-0.047,  0.241, -0.133,  0.02 , -0.265, -0.009, -0.043, -0.118,
│   │   0.112,  0.043,  0.049, -0.001,  0.042,  0.007, -0.031,  0.145,
-0.07 ,  0.048,  0.01 , -0.016,  0.039, -0.027, -0.202,  0.064,
│   │   0.044, -0.077,  0.005, -0.07 , -0.09 ,  0.276, -0.047,  0.189,
│   │   0.08 , -0.094,  0.075, -0.047,  0.142, -0.242, -0.117, -0.04 ,
│   │   0.177,  0.111,  0.156, -0.015, -0.032,  0.197, -0.003, -0.006,
-0.064, -0.045, -0.051,  0.071,  0.043,  0.048, -0.103,  0.036,
-0.078,  0.025,  0.139, -0.055, -0.014,  0.064, -0.125,  0.052,
-0.001,  0.046,  0.02 ,  0.041,  0.009,  0.025,  0.142, -0.322,
│   │   0.001,  0.046, -0.113,  0.143,  0.126,  0.045,  0.124, -0.016,
-0.098, -0.13 ,  0.23 ,  0.013, -0.228, -0.069, -0.088, -0.022,
│   │   0.025,  0.011, -0.131,  0.222, -0.007, -0.092,  0.023,  0.069,
-0.011, -0.042,  0.101, -0.056, -0.079,  0.152, -0.027, -0.012,
│   │   0.03 ,  0.147,  0.006, -0.117,  0.081, -0.115,  0.101,  0.121,
│   │   0.101, -0.072,  0.011, -0.002,  0.003, -0.192, -0.024,  0.135,
-0.027,  0.01 ,  0.047, -0.116, -0.058,  0.107,  0.119,  0.002,
-0.18 , -0.054,  0.003, -0.021,  0.031,  0.036, -0.038, -0.121,
│   │   0.145, -0.038, -0.015, -0.015,  0.028, -0.05 , -0.104,  0.146,
-0.093, -0.056, -0.149, -0.074,  0.126, -0.07 , -0.112,  0.001,
-0.098, -0.026, -0.007, -0.125, -0.003,  0.066, -0.081, -0.035,
-0.09 , -0.166, -0.026, -0.114, -0.19 , -0.07 ,  0.032,  0.05 ,
-0.015, -0.022, -0.128,  0.041, -0.179,  0.038, -0.161, -0.05 ,
-0.104,  0.176, -0.028, -0.117,  0.111, -0.145,  0.166,  0.226,
-0.062, -0.019,  0.039,  0.006,  0.056,  0.138, -0.072, -0.022,
│   │   0.052, -0.122,  0.157, -0.012, -0.048,  0.122, -0.02 , -0.022])

Embeddings of lots of molecules

# Read one SMILES string per line; the context manager closes the file
# handle as soon as the contents are loaded instead of leaving it open.
with open("chembl.smi", "r") as smiles_file:
    my_smiles = smiles_file.read().splitlines() # full chembl
pprint(my_smiles[0:10])
[
'N#CCCN1N=C(c2ccc(OCc3ccccc3)cc2)OCC1=O',
'CC(C)C[C@H](NC(=O)c1cn(Cc2ccccc2)nn1)B(O)O',
'Cl.NCC(=O)CCC(=O)OCc1ccccc1',
'CC(C)[C@H](NC(=O)[C@H](C)N)C(=O)N1CCC[C@H]1C(=O)N(C1CCCCC1)[C@@H](C)C(=O)N[C@@H](Cc1ccc(O)cc1)C(=O)O',
'Nc1cccc(CP(=O)(O)CC(CCC(=O)O)C(=O)O)c1',
'O=C(Cc1ccc(OCc2ccccc2)cc1)N[C@@H](CCS)Cc1c[nH]c2ccccc12',
'COc1ccc2c(c1)C(=O)CC(CCN1CCC3(CC1)NCNC3=O)C2',
'COC(=O)c1cc(C(O)CN2CCN(c3ccccc3OC)CC2)ccc1OC',
'O=C(O)CCCOc1ccccc1-c1cc2cc(C(=O)NC(c3ccccc3)c3ccccc3)ccc2o1',
'N/C(=C\\C(=O)c1ccc(Cl)cc1)C(=O)O'
]
params = {
    "search_input": my_smiles,
    "save_embs": True,
    "search_type": "all",
    "dest": "/data/my_embeddings" # save into a directory (faster than sending embeddings in JSON); it is created if it doesn't exist
}

response = requests.get(MY_URL + "/embeddings", params=params, headers=headers)
pprint(response.json(), max_length=5)
{'message': 'Success ! You can find computed embeddings in : /data/my_embeddings'}

Centroid Embeddings

This API call retrieves embeddings of database cluster centroids.

import requests

params = {
    "db_name": "ZINC15",
    "search_type": 'espsim_shape',
    "centroid_mols": False, # whether to output representative molecules for the clusters
    "save_embs": True, 
    "dest": "/data/my_centroid_embeddings" # save into a directory (faster than sending embeddings in JSON); it is created if it doesn't exist
}

response = requests.get(MY_URL + "/centroid_embeddings", params=params, headers=headers)
import numpy as np
np.array(response.json()['centroid_embeddings_db_ZINC15_st_espsim_shape']).shape
(26403, 256)

Similarity

Pairwise similarity

params = {
    "smiles1": "Fc1ccccc1",
    "smiles2": "Clc1ccccc1",
    "similarity_metric": "all",
    "distance_type": "euclidean"
}

response = requests.get(MY_URL + "/similarity", params=params, headers=headers)
pprint(response.json())
{
'morgan': 1.3493338152007586,
'espsim_electrostatic': 0.7924910655564171,
'espsim_shape': 0.3191955858476541,
'active_pairs': 0.7519632147430073
}
params = {
    "smiles1": "Fc1ccccc1",
    "smiles2": "Clc1ccccc1",
    "similarity_metric": "all",
    "distance_type": "cosine" # cosine is from 0 to 1
}

response = requests.get(MY_URL + "/similarity", params=params, headers=headers)
pprint(response.json())
{
'morgan': 0.443562888451121,
'espsim_electrostatic': 0.21610830917440194,
'espsim_shape': 0.019051098635778363,
'active_pairs': 0.060692261872934083
}

Similarity Matrix

params = {
    "smiles": [
        "CC(=O)NC1=NC=CC=C1C(=O)O",
        "CC(C)NC1=NC=CC=C1C(=O)O",
        "COC(=O)OC1=CC=CC=C1C(=O)O",
        "CC(C)OC1=CC=CC=C1C(=O)NO",
        "CC(=O)NC1=CC=CC=C1C(C)=O",
    ],
    "similarity_metric": "espsim_shape",
    "distance_type": "cosine",
}

response = requests.get(MY_URL + "/similarity_matrix", params=params, headers=headers)
pprint(np.array(response.json()[params["similarity_metric"]]))
array([[1.110e-16, 4.405e-02, 7.192e-02, 1.027e-01, 3.978e-02],
[4.405e-02, 1.110e-16, 9.728e-02, 6.456e-02, 7.657e-02],
[7.192e-02, 9.728e-02, 0.000e+00, 9.438e-02, 9.444e-02],
[1.027e-01, 6.456e-02, 9.438e-02, 0.000e+00, 9.449e-02],
[3.978e-02, 7.657e-02, 9.444e-02, 9.449e-02, 0.000e+00]])

Visualisation

The visualisation command works the same way as the embeddings command, but returns 2D coordinates intended for visualisation.

params = {
    "search_input": ["Fc1ccccc1", "Clc1ccccc1", "Brc1ccccc1"],
    "search_type": "espsim_electrostatic",
    "visualisation_method": "umap"
}

response = requests.get(MY_URL + "/visualise", params=params, headers=headers)
pprint(np.array(response.json()["espsim_electrostatic"]))
array([[8.325, 9.227],
[8.393, 9.108],
[8.269, 9.257]])
# Read one SMILES string per line; the context manager closes the file
# handle as soon as the contents are loaded instead of leaving it open.
with open("my_dataset.smi", "r") as dataset_file:
    my_smiles = dataset_file.read().splitlines() # mix of databases
params = {
    "search_input": my_smiles,
    "search_type": "espsim_electrostatic",
    "visualisation_method": "umap", # UMAP or PCA
    "save_coordinates": True,
    "dest": "/data/my_umap_coordinates"
}

response = requests.get(MY_URL + "/visualise", params=params, headers=headers)
pprint(response.json())

After loading UMAP embeddings, it looks like this.