Finding activity cliffs in ChEMBL

16 06 2019 chemoinformatics Tweet

Finding activity cliffs in ChEMBL

Activity cliffs are pairs of structurally similar compounds with large differences in activity, like inhibitory potency. I searched them in ChEMBL database with pychembldb.

I haven't maintained it for a long time but It works well even in Python3.7 :-)

My strategy for finding them is filtering assay data by whether it has confidence_score 9 (Direct single protein target assigned) and has enough data points(>=10 and <100) and then generating SMILES and Activity file from each of assay data for processing Matched Molecular Pairs Analysis (MMPs) by mmpdb

from pychembldb import *

for assay in chembldb.query(Assay).filter(Assay.confidence_score > 8).all():
    smiles_file = "{}.smi".format(assay.chembl_id)
    act_file = "{}.tsv".format(assay.chembl_id)
    act_list = [a for a in assay.activities if a.standard_value is not None and  and a.standard_units == "nM"]
    if len(act_list) > 9 and len(act_list) < 100::
        with open(smiles_file, "w") as sf:
            with open(act_file, "w") as af:
                af.write("ID\tVal\n")
                for activity in assay.activities:
                    if activity.standard_value is not None and \
                    activity.compound.molecule.structure is not None:
                        sf.write("{} {}\n".format(
                            activity.compound.molecule.structure.canonical_smiles,
                            activity.compound.molecule.chembl_id))
                        af.write("{}\t{}\n".format(
                            activity.compound.molecule.chembl_id,
                            activity.standard_value
                        ))

It took a few hours, but finally, I got around 55000 assay data.

I used mmpdb for generating MMP, and prepared a script for batch processing.

from glob import glob
import subprocess

def make_mmpdb(chembl_id):
    smiles_file = "{}.smi".format(chembl_id)
    act_file = "{}.tsv".format(chembl_id)
    fragments_file = "{}.fragments".format(chembl_id)
    sqlite_file = "{}.mmpdb".format(chembl_id)

    subprocess.run([
        "mmpdb",
        "fragment",
        smiles_file,
        "-o",
        fragments_file])

    subprocess.run([
        "mmpdb",
        "index",
        fragments_file,
        "-o",
        sqlite_file,
        "--properties",
        act_file])

if __name__ == "__main__":
    for fn in glob("*.smi"):
        chembl_id = fn.split(".")[0]
        make_mmpdb(chembl_id)

Finally, I extracted the pairs whose activity difference is more than 2 by pIC50.

import os
from sqlalchemy import *
from sqlalchemy.orm import create_session, relationship
from sqlalchemy.ext.declarative import declarative_base
from math import log10
from glob import glob

def search_ac(mmpdb_name):
    uri = 'sqlite:///{}'.format(mmpdb_name)

    Base = declarative_base()
    engine = create_engine(uri)
    metadata = MetaData(bind=engine)

    class Dataset(Base):
        __table__ = Table('dataset', metadata, autoload=True)

    class Compound(Base):
        __table__ = Table('compound', metadata, autoload=True)

    class PropertyName(Base):
        __table__ = Table('property_name', metadata, autoload=True)

    class CompoundProperty(Base):
        __table__ = Table('compound_property', metadata, autoload=True)
        compound = relationship('Compound', backref='compound_properties')
        property_name = relationship('PropertyName', backref='compound_properties')

    class RuleEnvironment(Base):
        __table__ = Table('rule_environment', metadata, autoload=True)

    class EnvironmentFingerprint(Base):
        __table__ = Table('environment_fingerprint', metadata, autoload=True)

    class ConstantSmiles(Base):
        __table__ = Table('constant_smiles', metadata, autoload=True)

    class Pair(Base):
        __table__ = Table('pair', metadata, autoload=True)
        constant_smiles = relationship('ConstantSmiles', backref='pair')
        rule_environment = relationship('RuleEnvironment', backref='pair')

    class RuleEnvironmentStatics(Base):
        __table__ = Table('rule_environment_statistics', metadata, autoload=True)

    sa = create_session(bind=engine)
    pIC50_dict = {}
    public_id = {}

    for cp in sa.query(CompoundProperty).filter_by(property_name_id=0).all():
        #print(cp.value)
        pIC50_dict[cp.compound_id] = 9.0 - log10(cp.value)
        public_id[cp.compound_id] = cp.compound.public_id

    pairs = []
    for pair in sa.query(Pair).all():
        if (pair.compound1_id, pair.compound2_id) not in pairs:
            pairs.append((pair.compound1_id, pair.compound2_id))
            diff = abs(pIC50_dict[pair.compound1_id] - pIC50_dict[pair.compound2_id])
            if diff >= 2.0:
                print("{}: ({}, {}) {}".format(
                    mmpdb_name.split(".")[0],
                    public_id[pair.compound1_id],
                    public_id[pair.compound2_id],
                    diff))

if __name__ == "__main__":
    for fn in glob("*.mmpdb"):
        search_ac(fn)

After this calculation I realized that I needed to eliminate cell based assays that caused AC noises.

Drkcore

Finding activity cliffs in ChEMBL