Drkcore

16 06 2019 chemoinformatics Tweet

Finding activity cliffs in ChEMBL

Activity cliffs are pairs of structurally similar compounds with large differences in activity, like inhibitory potency. I searched them in ChEMBL database with pychembldb.

I haven't maintained it for a long time but It works well even in Python3.7 :-)

My strategy for finding them is filtering assay data by whether it has confidence_score 9 (Direct single protein target assigned) and has enough data points(>=10 and <100) and then generating SMILES and Activity file from each of assay data for processing Matched Molecular Pairs Analysis (MMPs) by mmpdb

from pychembldb import *

for assay in chembldb.query(Assay).filter(Assay.confidence_score > 8).all():
    smiles_file = "{}.smi".format(assay.chembl_id)
    act_file = "{}.tsv".format(assay.chembl_id)
    act_list = [a for a in assay.activities if a.standard_value is not None and  and a.standard_units == "nM"]
    if len(act_list) > 9 and len(act_list) < 100::
        with open(smiles_file, "w") as sf:
            with open(act_file, "w") as af:
                af.write("ID\tVal\n")
                for activity in assay.activities:
                    if activity.standard_value is not None and \
                    activity.compound.molecule.structure is not None:
                        sf.write("{} {}\n".format(
                            activity.compound.molecule.structure.canonical_smiles,
                            activity.compound.molecule.chembl_id))
                        af.write("{}\t{}\n".format(
                            activity.compound.molecule.chembl_id,
                            activity.standard_value
                        ))

It took a few hours, but finally, I got around 55000 assay data.

I used mmpdb for generating MMP, and prepared a script for batch processing.

from glob import glob
import subprocess

def make_mmpdb(chembl_id):
    smiles_file = "{}.smi".format(chembl_id)
    act_file = "{}.tsv".format(chembl_id)
    fragments_file = "{}.fragments".format(chembl_id)
    sqlite_file = "{}.mmpdb".format(chembl_id)

    subprocess.run([
        "mmpdb",
        "fragment",
        smiles_file,
        "-o",
        fragments_file])

    subprocess.run([
        "mmpdb",
        "index",
        fragments_file,
        "-o",
        sqlite_file,
        "--properties",
        act_file])

if __name__ == "__main__":
    for fn in glob("*.smi"):
        chembl_id = fn.split(".")[0]
        make_mmpdb(chembl_id)

Finally, I extracted the pairs whose activity difference is more than 2 by pIC50.

import os
from sqlalchemy import *
from sqlalchemy.orm import create_session, relationship
from sqlalchemy.ext.declarative import declarative_base
from math import log10
from glob import glob

def search_ac(mmpdb_name):
    uri = 'sqlite:///{}'.format(mmpdb_name)

    Base = declarative_base()
    engine = create_engine(uri)
    metadata = MetaData(bind=engine)

    class Dataset(Base):
        __table__ = Table('dataset', metadata, autoload=True)

    class Compound(Base):
        __table__ = Table('compound', metadata, autoload=True)

    class PropertyName(Base):
        __table__ = Table('property_name', metadata, autoload=True)

    class CompoundProperty(Base):
        __table__ = Table('compound_property', metadata, autoload=True)
        compound = relationship('Compound', backref='compound_properties')
        property_name = relationship('PropertyName', backref='compound_properties')

    class RuleEnvironment(Base):
        __table__ = Table('rule_environment', metadata, autoload=True)

    class EnvironmentFingerprint(Base):
        __table__ = Table('environment_fingerprint', metadata, autoload=True)

    class ConstantSmiles(Base):
        __table__ = Table('constant_smiles', metadata, autoload=True)

    class Pair(Base):
        __table__ = Table('pair', metadata, autoload=True)
        constant_smiles = relationship('ConstantSmiles', backref='pair')
        rule_environment = relationship('RuleEnvironment', backref='pair')

    class RuleEnvironmentStatics(Base):
        __table__ = Table('rule_environment_statistics', metadata, autoload=True)

    sa = create_session(bind=engine)
    pIC50_dict = {}
    public_id = {}

    for cp in sa.query(CompoundProperty).filter_by(property_name_id=0).all():
        #print(cp.value)
        pIC50_dict[cp.compound_id] = 9.0 - log10(cp.value)
        public_id[cp.compound_id] = cp.compound.public_id

    pairs = []
    for pair in sa.query(Pair).all():
        if (pair.compound1_id, pair.compound2_id) not in pairs:
            pairs.append((pair.compound1_id, pair.compound2_id))
            diff = abs(pIC50_dict[pair.compound1_id] - pIC50_dict[pair.compound2_id])
            if diff >= 2.0:
                print("{}: ({}, {}) {}".format(
                    mmpdb_name.split(".")[0],
                    public_id[pair.compound1_id],
                    public_id[pair.compound2_id],
                    diff))

if __name__ == "__main__":
    for fn in glob("*.mmpdb"):
        search_ac(fn)

After this calculation I realized that I needed to eliminate cell based assays that caused AC noises.

About

  • もう5年目(wishlistありマス♡)
  • 最近はPythonとDeepLearning
  • 日本酒自粛中
  • ドラムンベースからミニマルまで
  • ポケモンGOゆるめ

Tag

Python Deep Learning javascript chemoinformatics Emacs sake and more...

Ad

© kzfm 2003-2021