16062019 chemoinformatics
Activity cliffs are pairs of structurally similar compounds with large differences in activity, like inhibitory potency. I searched them in ChEMBL database with pychembldb.
I haven't maintained it for a long time but It works well even in Python3.7 :-)
My strategy for finding them is filtering assay data by whether it has confidence_score 9 (Direct single protein target assigned) and has enough data points(>=10 and <100) and then generating SMILES and Activity file from each of assay data for processing Matched Molecular Pairs Analysis (MMPs) by mmpdb
from pychembldb import * for assay in chembldb.query(Assay).filter(Assay.confidence_score > 8).all(): smiles_file = "{}.smi".format(assay.chembl_id) act_file = "{}.tsv".format(assay.chembl_id) act_list = [a for a in assay.activities if a.standard_value is not None and and a.standard_units == "nM"] if len(act_list) > 9 and len(act_list) < 100:: with open(smiles_file, "w") as sf: with open(act_file, "w") as af: af.write("ID\tVal\n") for activity in assay.activities: if activity.standard_value is not None and \ activity.compound.molecule.structure is not None: sf.write("{} {}\n".format( activity.compound.molecule.structure.canonical_smiles, activity.compound.molecule.chembl_id)) af.write("{}\t{}\n".format( activity.compound.molecule.chembl_id, activity.standard_value ))
It took a few hours, but finally, I got around 55000 assay data.
I used mmpdb for generating MMP, and prepared a script for batch processing.
from glob import glob import subprocess def make_mmpdb(chembl_id): smiles_file = "{}.smi".format(chembl_id) act_file = "{}.tsv".format(chembl_id) fragments_file = "{}.fragments".format(chembl_id) sqlite_file = "{}.mmpdb".format(chembl_id) subprocess.run([ "mmpdb", "fragment", smiles_file, "-o", fragments_file]) subprocess.run([ "mmpdb", "index", fragments_file, "-o", sqlite_file, "--properties", act_file]) if __name__ == "__main__": for fn in glob("*.smi"): chembl_id = fn.split(".")[0] make_mmpdb(chembl_id)
Finally, I extracted the pairs whose activity difference is more than 2 by pIC50.
import os from sqlalchemy import * from sqlalchemy.orm import create_session, relationship from sqlalchemy.ext.declarative import declarative_base from math import log10 from glob import glob def search_ac(mmpdb_name): uri = 'sqlite:///{}'.format(mmpdb_name) Base = declarative_base() engine = create_engine(uri) metadata = MetaData(bind=engine) class Dataset(Base): __table__ = Table('dataset', metadata, autoload=True) class Compound(Base): __table__ = Table('compound', metadata, autoload=True) class PropertyName(Base): __table__ = Table('property_name', metadata, autoload=True) class CompoundProperty(Base): __table__ = Table('compound_property', metadata, autoload=True) compound = relationship('Compound', backref='compound_properties') property_name = relationship('PropertyName', backref='compound_properties') class RuleEnvironment(Base): __table__ = Table('rule_environment', metadata, autoload=True) class EnvironmentFingerprint(Base): __table__ = Table('environment_fingerprint', metadata, autoload=True) class ConstantSmiles(Base): __table__ = Table('constant_smiles', metadata, autoload=True) class Pair(Base): __table__ = Table('pair', metadata, autoload=True) constant_smiles = relationship('ConstantSmiles', backref='pair') rule_environment = relationship('RuleEnvironment', backref='pair') class RuleEnvironmentStatics(Base): __table__ = Table('rule_environment_statistics', metadata, autoload=True) sa = create_session(bind=engine) pIC50_dict = {} public_id = {} for cp in sa.query(CompoundProperty).filter_by(property_name_id=0).all(): #print(cp.value) pIC50_dict[cp.compound_id] = 9.0 - log10(cp.value) public_id[cp.compound_id] = cp.compound.public_id pairs = [] for pair in sa.query(Pair).all(): if (pair.compound1_id, pair.compound2_id) not in pairs: pairs.append((pair.compound1_id, pair.compound2_id)) diff = abs(pIC50_dict[pair.compound1_id] - pIC50_dict[pair.compound2_id]) if diff >= 2.0: print("{}: ({}, {}) {}".format( mmpdb_name.split(".")[0], public_id[pair.compound1_id], public_id[pair.compound2_id], diff)) if __name__ == "__main__": for fn in glob("*.mmpdb"): search_ac(fn)
After this calculation I realized that I needed to eliminate cell based assays that caused AC noises.