Skip to content

API Tutorial

In this tutorial, we will learn how to use the API to interact with the CHEESE platform. We will cover the following topics: - How to retrieve molecules from available databases - Search and batch search - Computing CHEESE Embeddings - Computation of similarities

Prerequisites

  • Python
  • NumPy (optional): pip install numpy
  • RDKit (optional): pip install rdkit
  • Rich (optional): pip install rich
import requests
import json
from rdkit import Chem
from rdkit.Chem import Draw
import numpy as np
from rich.pretty import pprint

np.set_printoptions(precision=3)

Sanity Check

Public API

MY_URL = "https://api.cheese.themama.ai"
API_KEY = "eyJhbGciOi..." # Obtained by signing up at https://cheese.deepmedchem.com/ and clicking on "Generate API Key"
headers = {"X-API-Key": API_KEY, "accept": "application/json" }

On-Prem API

MY_URL = "http://cheese-database.ch.themama.ai:9002" # URL of the API, change it if you are using a different one
headers = {'accept': 'application/json'} # headers for the request, we want a JSON response

Test Request (Health Check)

url = MY_URL + '/test'
response = requests.get(url, headers=headers)
pprint(response.json())
{'message': 'Health check successful !!'}

Random molecules

This endpoint returns a specified number of random molecules in SMILES format from a random selection of databases.

# Define the URL and headers
url = MY_URL + '/random_molecule'
params = {'n_mols': 5}

# Make the GET request
response = requests.get(url, headers=headers, params=params)
pprint(response.json())
[
'O=C(NCc1ccccc1)c1c(Cl)cc2c(c1Cl)CCN(Cc1ccccc1OC(F)(F)F)C2',
'CCOc1ccccc1CC(=O)NCC1CCN(Cc2cnn(-c3ccccc3)c2)C1',
'CC(=O)NC(Cc1ccccc1F)C(=O)N1CCN(Cc2ccccc2)CC1',
'COC(=O)[C@H]1CN(C(=O)c2ccc(C#N)c(F)c2F)CCN1C(=O)Cc1ccc2c(c1)/C(=C\\CCN(C)C)c1ccccc1CO2',
'C[C@]1(C(=O)NC2CCS(=O)(=O)C2)CCN(CC[C@@H](O)[C@@H](O)CO)C1'
]
Draw.MolsToGridImage([Chem.MolFromSmiles(mol) for mol in response.json()], molsPerRow=5)

png

Available Databases

# Define the URL and headers
url = MY_URL + '/available_dbs'

# Make the GET request
response = requests.get(url, headers=headers)
pprint(response.json())
{
'available_dbs': [
│   │   'MCULE-FULL',
│   │   'ZINC15',
│   │   'SYNPLE',
│   │   'ENAMINE-CARBOXYLIC',
│   │   'EXPLORE-ENUMERATED',
│   │   'EXPLORE-DIVERSE',
│   │   'ENAMINE-REAL',
│   │   'CHEMRIYA',
│   │   'MCULE-IN-STOCK'
]
}
params = {
    "search_input": "CC(=O)Oc1ccccc1C(=O)O", # SMILES of the query molecule
    "search_type": 'espsim_shape', # CHEESE Search type
    "n_neighbors": 5,
    "search_quality": "fast",
    "db_names": ["ENAMINE-REAL"],
}

response = requests.get(MY_URL + "/molsearch_simple", params=params, headers=headers)
pprint(response.json())
{
'smiles': [
│   │   'CC(=O)NC1=NC=CC=C1C(=O)O',
│   │   'CC(C)NC1=NC=CC=C1C(=O)O',
│   │   'COC(=O)OC1=CC=CC=C1C(=O)O',
│   │   'CC(C)OC1=CC=CC=C1C(=O)NO',
│   │   'CC(=O)NC1=CC=CC=C1C(C)=O'
],
'id': ['Z2904267089', 'Z71176798', 'Z192948624', 'Z1776115990', 'PV-002547950599']
}
# Parse the JSON body once and reuse it for both fields instead of
# decoding the same response twice.
result = response.json()
smiles = result["smiles"]
ids = result["id"]
# Draw the hits in one row, labelling each molecule with its database id.
Draw.MolsToGridImage([Chem.MolFromSmiles(mol) for mol in smiles], molsPerRow=5, legends=ids)

png

import requests

params = {
    "search_input": "CC(=O)Oc1ccccc1C(=O)O",
    "search_type": 'espsim_shape',
    "n_neighbors": 5,
    "search_quality": "fast",
    "db_names": ["ZINC15"],
    "descriptors": True,
    "properties": True,
    "filter_molecules": False,
    "order_molecules": False
}

response = requests.get(MY_URL + "/molsearch", params=params, headers=headers)
pprint(response.json().keys())
dict_keys(['remarks', 'canonicalized_query', 'neighbors', 'query_properties', 'search_info'])
pprint(response.json(), max_depth=3)
{
'remarks': '',
'canonicalized_query': 'CC(=O)Oc1ccccc1C(=O)O',
'neighbors': [
│   │   {
│   │   │   'smiles': 'CC(=O)Oc1ccccc1C(=O)[O-]',
│   │   │   'zinc_id': 'ZINC15 : 53',
│   │   │   'embedding_distance': 0.2928953766822815,
│   │   │   'properties': {...}
│   │   },
│   │   {
│   │   │   'smiles': 'CC(=O)Oc1ccccc1C(N)=O',
│   │   │   'zinc_id': 'ZINC15 : 404835',
│   │   │   'embedding_distance': 0.30397266149520874,
│   │   │   'properties': {...}
│   │   },
│   │   {
│   │   │   'smiles': 'CC(=O)Oc1ccccc1C(N)=S',
│   │   │   'zinc_id': 'ZINC15 : 72284420',
│   │   │   'embedding_distance': 0.3062485158443451,
│   │   │   'properties': {...}
│   │   },
│   │   {
│   │   │   'smiles': 'CC(=O)Nc1ccncc1C(=O)[O-]',
│   │   │   'zinc_id': 'ZINC15 : 3163078',
│   │   │   'embedding_distance': 0.30769655108451843,
│   │   │   'properties': {...}
│   │   },
│   │   {
│   │   │   'smiles': 'CC(=O)Oc1ccccc1C(C)=O',
│   │   │   'zinc_id': 'ZINC15 : 137705',
│   │   │   'embedding_distance': 0.3079259395599365,
│   │   │   'properties': {...}
│   │   }
],
'query_properties': {
│   │   'smiles': 'CC(=O)Oc1ccccc1C(=O)O',
│   │   'zinc_id': '',
│   │   'properties': {
│   │   │   'absorption': {...},
│   │   │   'excretion': {...},
│   │   │   'toxicity': {...},
│   │   │   'distribution': {...},
│   │   │   'metabolism': {...},
│   │   │   'basics': {...}
│   │   }
},
'search_info': {
│   │   'query_embedding_time': 0.37433314323425293,
│   │   'search_time': 0.45039892196655273,
│   │   'filter_time': 5.1021575927734375e-05,
│   │   'sorting_time': 1.2636184692382812e-05,
│   │   'property_prediction_time': 0.06553077697753906,
│   │   'total_time': 0.8903264999389648
}
}

The JSON for a single result molecule, with properties and descriptors, looks like this:

pprint(response.json()["neighbors"][0])
{
'smiles': 'CC(=O)Oc1ccccc1C(=O)[O-]',
'zinc_id': 'ZINC15 : 53',
'embedding_distance': 0.2928953766822815,
'properties': {
│   │   'absorption': {
│   │   │   'caco2_wang': -4.235,
│   │   │   'lipophilicity_astrazeneca': -0.414,
│   │   │   'solubility_aqsoldb': -2.315,
│   │   │   'bioavailability_ma': 0.908,
│   │   │   'hia_hou': 0.989,
│   │   │   'pgp_broccatelli': 0.0,
│   │   │   'clogp': -0.0246
│   │   },
│   │   'excretion': {
│   │   │   'clearance_hepatocyte_az': 86.346,
│   │   │   'clearance_microsome_az': 55.043,
│   │   │   'half_life_obach': 0.43
│   │   },
│   │   'toxicity': {'ld50_zhu': 1.81, 'ames': 0.003, 'dili': 0.021, 'herg': 0.0},
│   │   'distribution': {'ppbr_az': 45.823, 'vdss_lombardo': 0.155, 'bbb_martins': 0.985},
│   │   'metabolism': {'cyp2c9_veith': 0.046, 'cyp2d6_veith': 0.0, 'cyp3a4_veith': 0.0},
│   │   'basics': {
│   │   │   'molecular_weight': 179.03498,
│   │   │   'formal_charge': -1.0,
│   │   │   'heavy_atoms': 13.0,
│   │   │   'h_bond_acceptors': 4.0,
│   │   │   'h_bond_donor': 0.0,
│   │   │   'rotatable_bonds': 2.0,
│   │   │   'num_of_rings': 1.0,
│   │   │   'molar_refractivity': 42.0815,
│   │   │   'number_of_atoms': 13.0,
│   │   │   'topological_surface_area_mapping': 66.43
│   │   }
}
}

Advanced Search with filtering and ordering

import requests

params = {
    "search_input": "CC(=O)Oc1ccccc1C(=O)O",
    "search_type": 'espsim_shape',
    "n_neighbors": 10,
    "search_quality": "fast",
    "db_names": ["ZINC15"],
    "descriptors": True,
    "properties": True,
    "filter_molecules": True,
    "order_molecules": True,
    "filtering": ["PAINS", "Murcko scaffold hop"], # Using filtering to filter out PAINS and to get only scaffold hops
    "ordering": ["Morgan Tanimoto"] # Sorting results based on Morgan Fingerprints after doing cheese search 
}

def just_smiles(advanced_response):
    """Collect the SMILES string of every neighbor in a search response.

    Args:
        advanced_response: Decoded JSON mapping that has a "neighbors"
            entry; each neighbor is a mapping with a "smiles" entry.

    Returns:
        A list of the neighbors' SMILES strings, in response order.
    """
    collected = []
    for neighbor in advanced_response["neighbors"]:
        collected.append(neighbor["smiles"])
    return collected

response = requests.get(MY_URL + "/molsearch", params=params, headers=headers)
pprint(just_smiles(response.json())) # note: we requested 10 neighbors but only 6 remain after the filtering (it's better to increase the number of neighbors)
[
'CC(=O)Oc1ccccc1C(C)=O',
'CC(=O)Oc1ccccc1C(N)=O',
'CC(=O)Oc1ccccc1C(=O)[O-]',
'CC(=O)Oc1ccccc1C(N)=S',
'CC(=O)Nc1ccccc1C(=O)[O-]',
'NC(=O)Nc1ccccc1C(=O)[O-]'
]
smiles = just_smiles(response.json())
Draw.MolsToGridImage([Chem.MolFromSmiles(mol) for mol in smiles], molsPerRow=3)

png

Search Array

Searching a list of molecules

import requests

# Query parameters for /molsearch_array: one search is run per input SMILES.
params = {
    "search_input": ["CC1=CN(C)N=C1", "CNC1=CC=CC=C1", "CCN1C=CN=C1"],
    "search_type": 'espsim_shape',
    "n_neighbors": 5,
    "search_quality": "fast",
    # Passed as a list, like the other search examples; requests encodes a
    # one-element list to the same query string as the bare string.
    "db_names": ["ZINC15"],
    "descriptors": False,
    "properties": False,
    "filter_molecules": False
}

response = requests.get(MY_URL + "/molsearch_array", params=params, headers=headers)
pprint(response.json(), max_depth=3)
{
'CC1=CN(C)N=C1': {
│   │   'CC1=CN(C)N=C1': {'remarks': '', 'canonicalized_query': 'Cc1cnn(C)c1', 'neighbors': [...]},
│   │   'search_info': {
│   │   │   'query_embedding_time': 0.43999290466308594,
│   │   │   'search_time': 0.17154765129089355,
│   │   │   'filter_time': 4.3392181396484375e-05,
│   │   │   'sorting_time': 0.009363174438476562,
│   │   │   'property_prediction_time': 0,
│   │   │   'total_time': 0.6209471225738525
│   │   }
},
'CNC1=CC=CC=C1': {
│   │   'CNC1=CC=CC=C1': {'remarks': '', 'canonicalized_query': 'CNc1ccccc1', 'neighbors': [...]},
│   │   'search_info': {
│   │   │   'query_embedding_time': 0.3788025379180908,
│   │   │   'search_time': 0.08669018745422363,
│   │   │   'filter_time': 4.506111145019531e-05,
│   │   │   'sorting_time': 0.010756254196166992,
│   │   │   'property_prediction_time': 0,
│   │   │   'total_time': 0.47629404067993164
│   │   }
},
'CCN1C=CN=C1': {
│   │   'CCN1C=CN=C1': {'remarks': '', 'canonicalized_query': 'CCn1ccnc1', 'neighbors': [...]},
│   │   'search_info': {
│   │   │   'query_embedding_time': 0.3099024295806885,
│   │   │   'search_time': 0.06690430641174316,
│   │   │   'filter_time': 4.2438507080078125e-05,
│   │   │   'sorting_time': 0.008794546127319336,
│   │   │   'property_prediction_time': 0,
│   │   │   'total_time': 0.38564372062683105
│   │   }
}
}
import requests

params = {
    "search_input": [
        "CC(=O)NC1=NC=CC=C1C(=O)O",
        "CC(C)NC1=NC=CC=C1C(=O)O",
        "COC(=O)OC1=CC=CC=C1C(=O)O",
        "CC(C)OC1=CC=CC=C1C(=O)NO",
        "CC(=O)NC1=CC=CC=C1C(C)=O",
    ],
    "search_type": "espsim_shape",
    "n_neighbors": 10,
    "search_mode": "batch",  # or "centroid"
}

response = requests.get(MY_URL + "/batch_search", params=params, headers=headers)
# Parse the JSON body once and reuse it for both fields instead of
# decoding the same response twice.
result = response.json()
smiles = result["smiles"]
ids = result["id"]
# Draw the hits five per row, labelling each molecule with its database id.
Draw.MolsToGridImage(
    [Chem.MolFromSmiles(mol) for mol in smiles], molsPerRow=5, legends=ids
)

png

Embeddings (On-Prem users only)

You need an on-prem installation of CHEESE to use this endpoint. In the public API, this endpoint will return an error.

params = {
    "search_input": ["Fc1ccccc1", "Clc1ccccc1", "Brc1ccccc1"],
    "save_embs": False,
}

response = requests.get(MY_URL + "/embeddings", params=params, headers=headers)
# Decode the JSON body once; the embedding payload can be large, so avoid
# re-parsing it for every print.
payload = response.json()
pprint(payload.keys())
pprint(np.array(payload["espsim_shape"]).shape)
pprint(np.array(payload["espsim_shape"][0])) # embedding of the first molecule
dict_keys(['morgan', 'espsim_electrostatic', 'espsim_shape', 'active_pairs'])
(3, 256)
array([-0.271,  0.028, -0.28 ,  0.089, -0.067, -0.005, -0.045, -0.024,
│   │   0.126, -0.162, -0.027, -0.163,  0.142,  0.006, -0.033, -0.069,
-0.111,  0.045,  0.01 ,  0.206,  0.006,  0.137,  0.056,  0.012,
-0.033, -0.123,  0.013, -0.008,  0.076,  0.116, -0.274, -0.067,
│   │   0.042, -0.113,  0.017, -0.023,  0.085, -0.029, -0.059,  0.078,
│   │   0.091, -0.045, -0.034,  0.053,  0.05 ,  0.119, -0.018,  0.108,
│   │   0.013, -0.015, -0.021,  0.02 , -0.086,  0.16 ,  0.149, -0.174,
-0.047,  0.241, -0.133,  0.02 , -0.265, -0.009, -0.043, -0.118,
│   │   0.112,  0.043,  0.049, -0.001,  0.042,  0.007, -0.031,  0.145,
-0.07 ,  0.048,  0.01 , -0.016,  0.039, -0.027, -0.202,  0.064,
│   │   0.044, -0.077,  0.005, -0.07 , -0.09 ,  0.276, -0.047,  0.189,
│   │   0.08 , -0.094,  0.075, -0.047,  0.142, -0.242, -0.117, -0.04 ,
│   │   0.177,  0.111,  0.156, -0.015, -0.032,  0.197, -0.003, -0.006,
-0.064, -0.045, -0.051,  0.071,  0.043,  0.048, -0.103,  0.036,
-0.078,  0.025,  0.139, -0.055, -0.014,  0.064, -0.125,  0.052,
-0.001,  0.046,  0.02 ,  0.041,  0.009,  0.025,  0.142, -0.322,
│   │   0.001,  0.046, -0.113,  0.143,  0.126,  0.045,  0.124, -0.016,
-0.098, -0.13 ,  0.23 ,  0.013, -0.228, -0.069, -0.088, -0.022,
│   │   0.025,  0.011, -0.131,  0.222, -0.007, -0.092,  0.023,  0.069,
-0.011, -0.042,  0.101, -0.056, -0.079,  0.152, -0.027, -0.012,
│   │   0.03 ,  0.147,  0.006, -0.117,  0.081, -0.115,  0.101,  0.121,
│   │   0.101, -0.072,  0.011, -0.002,  0.003, -0.192, -0.024,  0.135,
-0.027,  0.01 ,  0.047, -0.116, -0.058,  0.107,  0.119,  0.002,
-0.18 , -0.054,  0.003, -0.021,  0.031,  0.036, -0.038, -0.121,
│   │   0.145, -0.038, -0.015, -0.015,  0.028, -0.05 , -0.104,  0.146,
-0.093, -0.056, -0.149, -0.074,  0.126, -0.07 , -0.112,  0.001,
-0.098, -0.026, -0.007, -0.125, -0.003,  0.066, -0.081, -0.035,
-0.09 , -0.166, -0.026, -0.114, -0.19 , -0.07 ,  0.032,  0.05 ,
-0.015, -0.022, -0.128,  0.041, -0.179,  0.038, -0.161, -0.05 ,
-0.104,  0.176, -0.028, -0.117,  0.111, -0.145,  0.166,  0.226,
-0.062, -0.019,  0.039,  0.006,  0.056,  0.138, -0.072, -0.022,
│   │   0.052, -0.122,  0.157, -0.012, -0.048,  0.122, -0.02 , -0.022])

Embeddings of lots of molecules

# Read one SMILES string per line; the context manager closes the file
# handle as soon as the contents have been read.
with open("chembl.smi", "r") as smiles_file:
    my_smiles = smiles_file.read().splitlines() # full chembl
pprint(my_smiles[0:10])
[
'N#CCCN1N=C(c2ccc(OCc3ccccc3)cc2)OCC1=O',
'CC(C)C[C@H](NC(=O)c1cn(Cc2ccccc2)nn1)B(O)O',
'Cl.NCC(=O)CCC(=O)OCc1ccccc1',
'CC(C)[C@H](NC(=O)[C@H](C)N)C(=O)N1CCC[C@H]1C(=O)N(C1CCCCC1)[C@@H](C)C(=O)N[C@@H](Cc1ccc(O)cc1)C(=O)O',
'Nc1cccc(CP(=O)(O)CC(CCC(=O)O)C(=O)O)c1',
'O=C(Cc1ccc(OCc2ccccc2)cc1)N[C@@H](CCS)Cc1c[nH]c2ccccc12',
'COc1ccc2c(c1)C(=O)CC(CCN1CCC3(CC1)NCNC3=O)C2',
'COC(=O)c1cc(C(O)CN2CCN(c3ccccc3OC)CC2)ccc1OC',
'O=C(O)CCCOc1ccccc1-c1cc2cc(C(=O)NC(c3ccccc3)c3ccccc3)ccc2o1',
'N/C(=C\\C(=O)c1ccc(Cl)cc1)C(=O)O'
]
# Query parameters for /embeddings when the results are written to disk
# instead of being returned in the response body.
params = {
    "search_input": my_smiles,
    "save_embs": True,
    "search_type": "all",
    "dest": "/data/my_embeddings" # save into a directory (faster than sending embeddings in JSON); it is created if it doesn't exist
}

response = requests.get(MY_URL + "/embeddings", params=params, headers=headers)
pprint(response.json(), max_length=5)
{'message': 'Success ! You can find computed embeddings in : /data/my_embeddings'}

Centroid Embeddings (On-Prem users only)

You need an on-prem installation of CHEESE to use this endpoint. In the public API, this endpoint will return an error.

This API call retrieves embeddings of database cluster centroids.

import requests

# Query parameters for /centroid_embeddings.
params = {
    "db_name": "ZINC15",
    "search_type": 'espsim_shape',
    "centroid_mols": False, # whether to output representative molecules for the clusters
    "save_embs": True, 
    "dest": "/data/my_centroid_embeddings" # save into a directory (faster than sending embeddings in JSON); it is created if it doesn't exist
}

response = requests.get(MY_URL + "/centroid_embeddings", params=params, headers=headers)
import numpy as np
np.array(response.json()['centroid_embeddings_db_ZINC15_st_espsim_shape']).shape
(26403, 256)

Similarity (On-Prem users only)

You need an on-prem installation of CHEESE to use this endpoint. In the public API, this endpoint will return an error.

Pairwise similarity

params = {
    "smiles1": "Fc1ccccc1",
    "smiles2": "Clc1ccccc1",
    "similarity_metric": "all",
    "distance_type": "euclidean"
}

response = requests.get(MY_URL + "/similarity", params=params, headers=headers)
pprint(response.json())
{
'morgan': 1.3493338152007586,
'espsim_electrostatic': 0.7924910655564171,
'espsim_shape': 0.3191955858476541,
'active_pairs': 0.7519632147430073
}
params = {
    "smiles1": "Fc1ccccc1",
    "smiles2": "Clc1ccccc1",
    "similarity_metric": "all",
    "distance_type": "cosine" # cosine is from 0 to 1
}

response = requests.get(MY_URL + "/similarity", params=params, headers=headers)
pprint(response.json())
{
'morgan': 0.443562888451121,
'espsim_electrostatic': 0.21610830917440194,
'espsim_shape': 0.019051098635778363,
'active_pairs': 0.060692261872934083
}

Similarity Matrix

params = {
    "smiles": [
        "CC(=O)NC1=NC=CC=C1C(=O)O",
        "CC(C)NC1=NC=CC=C1C(=O)O",
        "COC(=O)OC1=CC=CC=C1C(=O)O",
        "CC(C)OC1=CC=CC=C1C(=O)NO",
        "CC(=O)NC1=CC=CC=C1C(C)=O",
    ],
    "similarity_metric": "espsim_shape",
    "distance_type": "cosine",
}

response = requests.get(MY_URL + "/similarity_matrix", params=params, headers=headers)
pprint(np.array(response.json()[params["similarity_metric"]]))
array([[1.110e-16, 4.405e-02, 7.192e-02, 1.027e-01, 3.978e-02],
[4.405e-02, 1.110e-16, 9.728e-02, 6.456e-02, 7.657e-02],
[7.192e-02, 9.728e-02, 0.000e+00, 9.438e-02, 9.444e-02],
[1.027e-01, 6.456e-02, 9.438e-02, 0.000e+00, 9.449e-02],
[3.978e-02, 7.657e-02, 9.444e-02, 9.449e-02, 0.000e+00]])

Visualisation (On-Prem users only)

You need an on-prem installation of CHEESE to use this endpoint. In the public API, this endpoint will return an error.

The visualisation command works the same way as the embeddings command, but returns 2D coordinates intended for visualisation.

params = {
    "search_input": ["Fc1ccccc1", "Clc1ccccc1", "Brc1ccccc1"],
    "search_type": "espsim_electrostatic",
    "visualisation_method": "umap"
}

response = requests.get(MY_URL + "/visualise", params=params, headers=headers)
pprint(np.array(response.json()["espsim_electrostatic"]))
array([[8.325, 9.227],
[8.393, 9.108],
[8.269, 9.257]])
# Read one SMILES string per line; the context manager closes the file
# handle as soon as the contents have been read.
with open("my_dataset.smi", "r") as smiles_file:
    my_smiles = smiles_file.read().splitlines() # mix of databases
params = {
    "search_input": my_smiles,
    "search_type": "espsim_electrostatic",
    "visualisation_method": "umap", # UMAP or PCA
    "save_coordinates": True,
    "dest": "/data/my_umap_coordinates"
}

response = requests.get(MY_URL + "/visualise", params=params, headers=headers)
pprint(response.json())

After loading the UMAP embeddings, the result looks like this.