API Tutorial
In this tutorial, we will learn how to use the API to interact with CHEESE platform. We will cover the following topics: - How to retrieve molecules from available databases - Search and batch search - Comnputing CHEESE Embeddings - Computation of similarities
Prerequisities
- Python
- NumPy (optional):
pip install numpy
- RdKit (optional):
pip install rdkit
- Rich (optional):
pip install rich
import requests
import json
from rdkit import Chem
from rdkit.Chem import Draw
import numpy as np
from rich.pretty import pprint
np.set_printoptions(precision=3)
Sanity Check
MY_URL = "http://cheese-database.ch.themama.ai:9002" # URL of the API, change it if you are using a different one
headers = {'accept': 'application/json'} # headers for the request, we want a JSON response
url = MY_URL + '/test'
response = requests.get(url, headers=headers)
pprint(response.json())
{'message': 'Health check successful !!'}
Random molecules
This endpoint returns a specified number of random molecules in SMILES format from a random selection of databases.
# Define the URL and headers
url = MY_URL + '/random_molecule'
params = {'n_mols': 5}
# Make the GET request
response = requests.get(url, headers=headers, params=params)
pprint(response.json())
[ │ 'Cn1nnc(C[NH2+]C[C@H]2CCN(C(=O)[C@]34C[C@H]3COC4)C2)n1', │ 'Cc1ccc(NC(=O)c2ccc(N(Cc3ccccc3Cl)S(C)(=O)=O)cc2)cc1C', │ 'CN(C[C@@H]1CCC[N@H+](C)C1)S(=O)(=O)N1CCO[C@H](C#N)C1', │ 'C#CCCN1CC2(C1)CN(C(=O)C1=NC(=O)N(C)C1)CCO2', │ 'Cc1nccc(CNC(=O)C(=O)N[C@H](C)[C@@H]2C[N@@H+](C)CCO2)n1' ]
Draw.MolsToGridImage([Chem.MolFromSmiles(mol) for mol in response.json()], molsPerRow=5)
Available Databases
# Define the URL and headers
url = MY_URL + '/available_dbs'
headers = {'accept': 'application/json'}
# Make the GET request
response = requests.get(url, headers=headers)
pprint(response.json())
{'available_dbs': ['ENAMINE-REAL', 'ZINC15', 'CUSTOM_CLUSTERED', 'CUSTOM_IN_MEM']}
Search
Simple Search
params = {
"search_input": "CC(=O)Oc1ccccc1C(=O)O", # SMILES of the query molecule
"search_type": 'espsim_shape', # CHEESE Search type
"n_neighbors": 5,
"search_quality": "fast",
"db_names": ["ENAMINE-REAL"],
}
response = requests.get(MY_URL + "/molsearch_simple", params=params)
pprint(response.json())
{ │ 'smiles': [ │ │ 'CC(=O)NC1=NC=CC=C1C(=O)O', │ │ 'CC(C)NC1=NC=CC=C1C(=O)O', │ │ 'COC(=O)OC1=CC=CC=C1C(=O)O', │ │ 'CC(C)OC1=CC=CC=C1C(=O)NO', │ │ 'CC(=O)NC1=CC=CC=C1C(C)=O' │ ], │ 'id': ['Z2904267089', 'Z71176798', 'Z192948624', 'Z1776115990', 'PV-002547950599'] }
smiles = response.json()["smiles"]
ids = response.json()["id"]
Draw.MolsToGridImage([Chem.MolFromSmiles(mol) for mol in smiles], molsPerRow=5, legends=ids)
Advanced Search
import requests
params = {
"search_input": "CC(=O)Oc1ccccc1C(=O)O",
"search_type": 'espsim_shape',
"n_neighbors": 5,
"search_quality": "fast",
"db_names": ["ZINC15"],
"descriptors": True,
"properties": True,
"filter_molecules": False,
"order_molecules": False
}
response = requests.get(MY_URL + "/molsearch", params=params)
pprint(response.json().keys())
dict_keys(['remarks', 'canonicalized_query', 'neighbors', 'query_properties', 'search_info'])
pprint(response.json(), max_depth=3)
{ │ 'remarks': '', │ 'canonicalized_query': 'CC(=O)Oc1ccccc1C(=O)O', │ 'neighbors': [ │ │ { │ │ │ 'smiles': 'CC(=O)Oc1ccccc1C(=O)[O-]', │ │ │ 'zinc_id': 'ZINC15 : 53', │ │ │ 'embedding_similarity': 0.7071046233177185, │ │ │ 'properties': {...} │ │ }, │ │ { │ │ │ 'smiles': 'CC(=O)Oc1ccccc1C(N)=O', │ │ │ 'zinc_id': 'ZINC15 : 404835', │ │ │ 'embedding_similarity': 0.6960273385047913, │ │ │ 'properties': {...} │ │ }, │ │ { │ │ │ 'smiles': 'CC(=O)Oc1ccccc1C(N)=S', │ │ │ 'zinc_id': 'ZINC15 : 72284420', │ │ │ 'embedding_similarity': 0.6937514841556549, │ │ │ 'properties': {...} │ │ }, │ │ { │ │ │ 'smiles': 'CC(=O)Nc1ccncc1C(=O)[O-]', │ │ │ 'zinc_id': 'ZINC15 : 3163078', │ │ │ 'embedding_similarity': 0.6923034489154816, │ │ │ 'properties': {...} │ │ }, │ │ { │ │ │ 'smiles': 'CC(=O)Oc1ccccc1C(C)=O', │ │ │ 'zinc_id': 'ZINC15 : 137705', │ │ │ 'embedding_similarity': 0.6920740604400635, │ │ │ 'properties': {...} │ │ } │ ], │ 'query_properties': { │ │ 'smiles': 'CC(=O)Oc1ccccc1C(=O)O', │ │ 'zinc_id': '', │ │ 'properties': { │ │ │ 'absorption': {...}, │ │ │ 'excretion': {...}, │ │ │ 'toxicity': {...}, │ │ │ 'distribution': {...}, │ │ │ 'metabolism': {...}, │ │ │ 'basics': {...} │ │ } │ }, │ 'search_info': { │ │ 'query_embedding_time': 0.2988882064819336, │ │ 'search_time': 0.3333892822265625, │ │ 'filter_time': 0.003624439239501953, │ │ 'sorting_time': 0.0015327930450439453, │ │ 'property_prediction_time': 0.0522768497467041, │ │ 'total_time': 0.6897115707397461 │ } }
One result molecule json with properties and descriptors looks like this:
pprint(response.json()["neighbors"][0])
{ │ 'smiles': 'CC(=O)Oc1ccccc1C(=O)[O-]', │ 'zinc_id': 'ZINC15 : 53', │ 'embedding_similarity': 0.7071046233177185, │ 'properties': { │ │ 'absorption': { │ │ │ 'caco2_wang': -4.235, │ │ │ 'lipophilicity_astrazeneca': -0.414, │ │ │ 'solubility_aqsoldb': -2.315, │ │ │ 'bioavailability_ma': 0.908, │ │ │ 'hia_hou': 0.989, │ │ │ 'pgp_broccatelli': 0.0, │ │ │ 'clogp': -0.0246 │ │ }, │ │ 'excretion': { │ │ │ 'clearance_hepatocyte_az': 86.346, │ │ │ 'clearance_microsome_az': 55.043, │ │ │ 'half_life_obach': 0.43 │ │ }, │ │ 'toxicity': {'ld50_zhu': 1.81, 'ames': 0.003, 'dili': 0.021, 'herg': 0.0}, │ │ 'distribution': {'ppbr_az': 45.823, 'vdss_lombardo': 0.155, 'bbb_martins': 0.985}, │ │ 'metabolism': {'cyp2c9_veith': 0.046, 'cyp2d6_veith': 0.0, 'cyp3a4_veith': 0.0}, │ │ 'basics': { │ │ │ 'molecular_weight': 179.03498, │ │ │ 'formal_charge': -1.0, │ │ │ 'heavy_atoms': 13.0, │ │ │ 'h_bond_acceptors': 4.0, │ │ │ 'h_bond_donor': 0.0, │ │ │ 'rotatable_bonds': 2.0, │ │ │ 'num_of_rings': 1.0, │ │ │ 'molar_refractivity': 42.0815, │ │ │ 'number_of_atoms': 13.0, │ │ │ 'topological_surface_area_mapping': 66.43 │ │ } │ } }
Advanced Search with filtering and ordering
import requests
params = {
"search_input": "CC(=O)Oc1ccccc1C(=O)O",
"search_type": 'espsim_shape',
"n_neighbors": 10,
"search_quality": "fast",
"db_names": ["ZINC15"],
"descriptors": True,
"properties": True,
"filter_molecules": True,
"order_molecules": True,
"filtering": ["PAINS", "Murcko scaffold hop"], # Using filtering to filter out PAINS and to get only scaffold hops
"ordering": ["Morgan Tanimoto"] # Sorting results based on Morgan Fingerprints after doing cheese search
}
def just_smiles(advanced_response):
return [r["smiles"] for r in advanced_response["neighbors"]]
response = requests.get(MY_URL + "/molsearch", params=params)
pprint(just_smiles(response.json())) # note: we searchech 10 molecules but got 6 because of the filtering (its better to increase the number of neighbors)
[ │ 'CC(=O)Oc1ccccc1C(C)=O', │ 'CC(=O)Oc1ccccc1C(N)=O', │ 'CC(=O)Oc1ccccc1C(=O)[O-]', │ 'CC(=O)Oc1ccccc1C(N)=S', │ 'CC(=O)Nc1ccccc1C(=O)[O-]', │ 'NC(=O)Nc1ccccc1C(=O)[O-]' ]
smiles = just_smiles(response.json())
Draw.MolsToGridImage([Chem.MolFromSmiles(mol) for mol in smiles], molsPerRow=3)
Search Array
Searching a list of molecules
import requests
params = {
"search_input": ["CC1=CN(C)N=C1", "CNC1=CC=CC=C1", "CCN1C=CN=C1"],
"search_type": 'espsim_shape',
"n_neighbors": 5,
"search_quality": "fast",
"db_names": "ZINC15",
"descriptors": False,
"properties": False,
"filter_molecules": False
}
response = requests.get(MY_URL + "/molsearch_array", params=params, headers=headers)
pprint(response.json(), max_depth=3)
{ │ 'CC1=CN(C)N=C1': { │ │ 'CC1=CN(C)N=C1': {'remarks': '', 'canonicalized_query': 'Cc1cnn(C)c1', 'neighbors': [...]}, │ │ 'search_info': { │ │ │ 'query_embedding_time': 0.2705063819885254, │ │ │ 'search_time': 0.09556221961975098, │ │ │ 'filter_time': 0.0029592514038085938, │ │ │ 'sorting_time': 0.010590314865112305, │ │ │ 'property_prediction_time': 0, │ │ │ 'total_time': 0.37961816787719727 │ │ } │ }, │ 'CNC1=CC=CC=C1': { │ │ 'CNC1=CC=CC=C1': {'remarks': '', 'canonicalized_query': 'CNc1ccccc1', 'neighbors': [...]}, │ │ 'search_info': { │ │ │ 'query_embedding_time': 0.31961727142333984, │ │ │ 'search_time': 0.0561220645904541, │ │ │ 'filter_time': 0.002988100051879883, │ │ │ 'sorting_time': 0.008991718292236328, │ │ │ 'property_prediction_time': 0, │ │ │ 'total_time': 0.38771915435791016 │ │ } │ }, │ 'CCN1C=CN=C1': { │ │ 'CCN1C=CN=C1': {'remarks': '', 'canonicalized_query': 'CCn1ccnc1', 'neighbors': [...]}, │ │ 'search_info': { │ │ │ 'query_embedding_time': 0.38388919830322266, │ │ │ 'search_time': 0.04931282997131348, │ │ │ 'filter_time': 0.002831697463989258, │ │ │ 'sorting_time': 0.008421182632446289, │ │ │ 'property_prediction_time': 0, │ │ │ 'total_time': 0.4444549083709717 │ │ } │ } }
Batch Search
import requests
params = {
"search_input": [
"CC(=O)NC1=NC=CC=C1C(=O)O",
"CC(C)NC1=NC=CC=C1C(=O)O",
"COC(=O)OC1=CC=CC=C1C(=O)O",
"CC(C)OC1=CC=CC=C1C(=O)NO",
"CC(=O)NC1=CC=CC=C1C(C)=O",
],
"search_type": "espsim_shape",
"n_neighbors": 10,
"search_mode": "centroid", # or "batch"
}
response = requests.get(MY_URL + "/batch_search", params=params, headers=headers)
smiles = response.json()["smiles"]
ids = response.json()["id"]
Draw.MolsToGridImage(
[Chem.MolFromSmiles(mol) for mol in smiles], molsPerRow=5, legends=ids
)
Embeddings
params = {
"search_input": ["Fc1ccccc1", "Clc1ccccc1", "Brc1ccccc1"],
"save_embs": False,
}
response = requests.get(MY_URL + "/embeddings", params=params, headers=headers)
pprint(response.json().keys())
pprint(np.array(response.json()["espsim_shape"]).shape)
pprint(np.array(response.json()["espsim_shape"][0])) # embedding of the first molecule
dict_keys(['morgan', 'espsim_electrostatic', 'espsim_shape', 'active_pairs'])
(3, 256)
array([-0.271, 0.028, -0.28 , 0.089, -0.067, -0.005, -0.045, -0.024, │ │ 0.126, -0.162, -0.027, -0.163, 0.142, 0.006, -0.033, -0.069, │ -0.111, 0.045, 0.01 , 0.206, 0.006, 0.137, 0.056, 0.012, │ -0.033, -0.123, 0.013, -0.008, 0.076, 0.116, -0.274, -0.067, │ │ 0.042, -0.113, 0.017, -0.023, 0.085, -0.029, -0.059, 0.078, │ │ 0.091, -0.045, -0.034, 0.053, 0.05 , 0.119, -0.018, 0.108, │ │ 0.013, -0.015, -0.021, 0.02 , -0.086, 0.16 , 0.149, -0.174, │ -0.047, 0.241, -0.133, 0.02 , -0.265, -0.009, -0.043, -0.118, │ │ 0.112, 0.043, 0.049, -0.001, 0.042, 0.007, -0.031, 0.145, │ -0.07 , 0.048, 0.01 , -0.016, 0.039, -0.027, -0.202, 0.064, │ │ 0.044, -0.077, 0.005, -0.07 , -0.09 , 0.276, -0.047, 0.189, │ │ 0.08 , -0.094, 0.075, -0.047, 0.142, -0.242, -0.117, -0.04 , │ │ 0.177, 0.111, 0.156, -0.015, -0.032, 0.197, -0.003, -0.006, │ -0.064, -0.045, -0.051, 0.071, 0.043, 0.048, -0.103, 0.036, │ -0.078, 0.025, 0.139, -0.055, -0.014, 0.064, -0.125, 0.052, │ -0.001, 0.046, 0.02 , 0.041, 0.009, 0.025, 0.142, -0.322, │ │ 0.001, 0.046, -0.113, 0.143, 0.126, 0.045, 0.124, -0.016, │ -0.098, -0.13 , 0.23 , 0.013, -0.228, -0.069, -0.088, -0.022, │ │ 0.025, 0.011, -0.131, 0.222, -0.007, -0.092, 0.023, 0.069, │ -0.011, -0.042, 0.101, -0.056, -0.079, 0.152, -0.027, -0.012, │ │ 0.03 , 0.147, 0.006, -0.117, 0.081, -0.115, 0.101, 0.121, │ │ 0.101, -0.072, 0.011, -0.002, 0.003, -0.192, -0.024, 0.135, │ -0.027, 0.01 , 0.047, -0.116, -0.058, 0.107, 0.119, 0.002, │ -0.18 , -0.054, 0.003, -0.021, 0.031, 0.036, -0.038, -0.121, │ │ 0.145, -0.038, -0.015, -0.015, 0.028, -0.05 , -0.104, 0.146, │ -0.093, -0.056, -0.149, -0.074, 0.126, -0.07 , -0.112, 0.001, │ -0.098, -0.026, -0.007, -0.125, -0.003, 0.066, -0.081, -0.035, │ -0.09 , -0.166, -0.026, -0.114, -0.19 , -0.07 , 0.032, 0.05 , │ -0.015, -0.022, -0.128, 0.041, -0.179, 0.038, -0.161, -0.05 , │ -0.104, 0.176, -0.028, -0.117, 0.111, -0.145, 0.166, 0.226, │ -0.062, -0.019, 0.039, 0.006, 0.056, 0.138, -0.072, -0.022, │ │ 0.052, -0.122, 0.157, -0.012, -0.048, 0.122, -0.02 , -0.022])
Embeddings of lots of molecules
my_smiles = open("chembl.smi", "r").read().splitlines() # full chembl
pprint(my_smiles[0:10])
[ │ 'N#CCCN1N=C(c2ccc(OCc3ccccc3)cc2)OCC1=O', │ 'CC(C)C[C@H](NC(=O)c1cn(Cc2ccccc2)nn1)B(O)O', │ 'Cl.NCC(=O)CCC(=O)OCc1ccccc1', │ 'CC(C)[C@H](NC(=O)[C@H](C)N)C(=O)N1CCC[C@H]1C(=O)N(C1CCCCC1)[C@@H](C)C(=O)N[C@@H](Cc1ccc(O)cc1)C(=O)O', │ 'Nc1cccc(CP(=O)(O)CC(CCC(=O)O)C(=O)O)c1', │ 'O=C(Cc1ccc(OCc2ccccc2)cc1)N[C@@H](CCS)Cc1c[nH]c2ccccc12', │ 'COc1ccc2c(c1)C(=O)CC(CCN1CCC3(CC1)NCNC3=O)C2', │ 'COC(=O)c1cc(C(O)CN2CCN(c3ccccc3OC)CC2)ccc1OC', │ 'O=C(O)CCCOc1ccccc1-c1cc2cc(C(=O)NC(c3ccccc3)c3ccccc3)ccc2o1', │ 'N/C(=C\\C(=O)c1ccc(Cl)cc1)C(=O)O' ]
params = {
"search_input": my_smiles,
"save_embs": True,
"search_type": "all",
"dest": "/data/my_embeddings" # save into directory (faster than sending embeddings in json), creates if doesnt exist
}
response = requests.get(MY_URL + "/embeddings", params=params, headers=headers)
pprint(response.json(), max_length=5)
{'message': 'Success ! You can find computed embeddings in : /data/my_embeddings'}
Centroid Embeddings
This API call retrieves embeddings of database cluster centroids.
import requests
params = {
"db_name": "ZINC15",
"search_type": 'espsim_shape',
"centroid_mols": False, # whether to output representative molecules for the clusters
"save_embs": True,
"dest": "/data/my_centroid_embeddings" # save into directory (faster than sending embeddings in json), creates if doesnt exist
}
response = requests.get(MY_URL + "/centroid_embeddings", params=params, headers=headers)
import numpy as np
np.array(response.json()['centroid_embeddings_db_ZINC15_st_espsim_shape']).shape
(26403, 256)
Similarity
Pairwise similarity
params = {
"smiles1": "Fc1ccccc1",
"smiles2": "Clc1ccccc1",
"similarity_metric": "all",
"distance_type": "euclidean"
}
response = requests.get(MY_URL + "/similarity", params=params, headers=headers)
pprint(response.json())
{ │ 'morgan': 1.3493338152007586, │ 'espsim_electrostatic': 0.7924910655564171, │ 'espsim_shape': 0.3191955858476541, │ 'active_pairs': 0.7519632147430073 }
params = {
"smiles1": "Fc1ccccc1",
"smiles2": "Clc1ccccc1",
"similarity_metric": "all",
"distance_type": "cosine" # cosine is from 0 to 1
}
response = requests.get(MY_URL + "/similarity", params=params, headers=headers)
pprint(response.json())
{ │ 'morgan': 0.443562888451121, │ 'espsim_electrostatic': 0.21610830917440194, │ 'espsim_shape': 0.019051098635778363, │ 'active_pairs': 0.060692261872934083 }
Similarity Matrix
params = {
"smiles": [
"CC(=O)NC1=NC=CC=C1C(=O)O",
"CC(C)NC1=NC=CC=C1C(=O)O",
"COC(=O)OC1=CC=CC=C1C(=O)O",
"CC(C)OC1=CC=CC=C1C(=O)NO",
"CC(=O)NC1=CC=CC=C1C(C)=O",
],
"similarity_metric": "espsim_shape",
"distance_type": "cosine",
}
response = requests.get(MY_URL + "/similarity_matrix", params=params, headers=headers)
pprint(np.array(response.json()[params["similarity_metric"]]))
array([[1.110e-16, 4.405e-02, 7.192e-02, 1.027e-01, 3.978e-02], │ [4.405e-02, 1.110e-16, 9.728e-02, 6.456e-02, 7.657e-02], │ [7.192e-02, 9.728e-02, 0.000e+00, 9.438e-02, 9.444e-02], │ [1.027e-01, 6.456e-02, 9.438e-02, 0.000e+00, 9.449e-02], │ [3.978e-02, 7.657e-02, 9.444e-02, 9.449e-02, 0.000e+00]])
Visualisation
Visualisation command works the same as embeddings command, but returns 2D coordinates intended for visualisation
params = {
"search_input": ["Fc1ccccc1", "Clc1ccccc1", "Brc1ccccc1"],
"search_type": "espsim_electrostatic",
"visualisation_method": "umap"
}
response = requests.get(MY_URL + "/visualise", params=params, headers=headers)
pprint(np.array(response.json()["espsim_electrostatic"]))
array([[8.325, 9.227], │ [8.393, 9.108], │ [8.269, 9.257]])
my_smiles = open("my_dataset.smi", "r").read().splitlines() # mix of databases
params = {
"search_input": my_smiles,
"search_type": "espsim_electrostatic",
"visualisation_method": "umap", # UMAP or PCA
"save_coordinates": True,
"dest": "/data/my_umap_coordinates"
}
response = requests.get(MY_URL + "/visualise", params=params, headers=headers)
pprint(response.json())
After loading UMAP embeddings, it looks like this.