Activity cliffs are pairs of structurally similar compounds with large differences in activity, like inhibitory potency.
I searched them in ChEMBL database with pychembldb.
I haven't maintained it for a long time but It works well even in Python3.7 :-)
My strategy for finding them is filtering assay data by whether it has confidence_score 9 (Direct single protein target assigned) and has enough data points(>=10 and <100) and then generating SMILES and Activity file from each of assay data for processing Matched Molecular Pairs Analysis (MMPs) by mmpdb
from pychembldb import *
for assay in chembldb.query(Assay).filter(Assay.confidence_score > 8).all():
smiles_file = "{}.smi".format(assay.chembl_id)
act_file = "{}.tsv".format(assay.chembl_id)
act_list = [a for a in assay.activities if a.standard_value is not None and and a.standard_units == "nM"]
if len(act_list) > 9 and len(act_list) < 100::
with open(smiles_file, "w") as sf:
with open(act_file, "w") as af:
af.write("ID\tVal\n")
for activity in assay.activities:
if activity.standard_value is not None and \
activity.compound.molecule.structure is not None:
sf.write("{} {}\n".format(
activity.compound.molecule.structure.canonical_smiles,
activity.compound.molecule.chembl_id))
af.write("{}\t{}\n".format(
activity.compound.molecule.chembl_id,
activity.standard_value
))
It took a few hours, but finally, I got around 55000 assay data.
I used mmpdb for generating MMP, and prepared a script for batch processing.
from glob import glob
import subprocess
def make_mmpdb(chembl_id):
smiles_file = "{}.smi".format(chembl_id)
act_file = "{}.tsv".format(chembl_id)
fragments_file = "{}.fragments".format(chembl_id)
sqlite_file = "{}.mmpdb".format(chembl_id)
subprocess.run([
"mmpdb",
"fragment",
smiles_file,
"-o",
fragments_file])
subprocess.run([
"mmpdb",
"index",
fragments_file,
"-o",
sqlite_file,
"--properties",
act_file])
if __name__ == "__main__":
for fn in glob("*.smi"):
chembl_id = fn.split(".")[0]
make_mmpdb(chembl_id)
Finally, I extracted the pairs whose activity difference is more than 2 by pIC50.
import os
from sqlalchemy import *
from sqlalchemy.orm import create_session, relationship
from sqlalchemy.ext.declarative import declarative_base
from math import log10
from glob import glob
def search_ac(mmpdb_name):
uri = 'sqlite:///{}'.format(mmpdb_name)
Base = declarative_base()
engine = create_engine(uri)
metadata = MetaData(bind=engine)
class Dataset(Base):
__table__ = Table('dataset', metadata, autoload=True)
class Compound(Base):
__table__ = Table('compound', metadata, autoload=True)
class PropertyName(Base):
__table__ = Table('property_name', metadata, autoload=True)
class CompoundProperty(Base):
__table__ = Table('compound_property', metadata, autoload=True)
compound = relationship('Compound', backref='compound_properties')
property_name = relationship('PropertyName', backref='compound_properties')
class RuleEnvironment(Base):
__table__ = Table('rule_environment', metadata, autoload=True)
class EnvironmentFingerprint(Base):
__table__ = Table('environment_fingerprint', metadata, autoload=True)
class ConstantSmiles(Base):
__table__ = Table('constant_smiles', metadata, autoload=True)
class Pair(Base):
__table__ = Table('pair', metadata, autoload=True)
constant_smiles = relationship('ConstantSmiles', backref='pair')
rule_environment = relationship('RuleEnvironment', backref='pair')
class RuleEnvironmentStatics(Base):
__table__ = Table('rule_environment_statistics', metadata, autoload=True)
sa = create_session(bind=engine)
pIC50_dict = {}
public_id = {}
for cp in sa.query(CompoundProperty).filter_by(property_name_id=0).all():
#print(cp.value)
pIC50_dict[cp.compound_id] = 9.0 - log10(cp.value)
public_id[cp.compound_id] = cp.compound.public_id
pairs = []
for pair in sa.query(Pair).all():
if (pair.compound1_id, pair.compound2_id) not in pairs:
pairs.append((pair.compound1_id, pair.compound2_id))
diff = abs(pIC50_dict[pair.compound1_id] - pIC50_dict[pair.compound2_id])
if diff >= 2.0:
print("{}: ({}, {}) {}".format(
mmpdb_name.split(".")[0],
public_id[pair.compound1_id],
public_id[pair.compound2_id],
diff))
if __name__ == "__main__":
for fn in glob("*.mmpdb"):
search_ac(fn)
After this calculation I realized that I needed to eliminate cell based assays that caused AC noises.