Drkcore

import re

import openbabel as ob


def convert(gf):
    """Convert one GASTON graph record to a SMILES string.

    ``gf`` is the text of a single record.  Each line is dispatched on its
    first character:

    * ``#`` -- the second whitespace-separated field becomes the molecule
      title (a bare ``#`` with no second field is ignored),
    * ``v`` -- the third field is the atomic number of a newly added atom,
    * ``e`` -- the second and third fields are atom indices and the fourth
      is the bond order.

    Every other line, including blank ones, is skipped.

    Returns the string produced by ``OBConversion.WriteString`` with the
    output format set to ``'smi'``.
    """
    obc = ob.OBConversion()
    obc.SetOutFormat('smi')
    mol = ob.OBMol()

    for line in gf.split('\n'):
        # split() without an argument tolerates repeated or trailing spaces.
        fields = line.split()
        if not fields:
            continue

        tag = line[0]
        if tag == 'v':
            atom = mol.NewAtom()
            atom.SetAtomicNum(int(fields[2]))
        elif tag == 'e':
            # Indices in the record are shifted up by one before being handed
            # to AddBond (OpenBabel atom indices presumably start at 1).
            begin_atom_idx = int(fields[1]) + 1
            end_atom_idx = int(fields[2]) + 1
            bond_order = int(fields[3])
            mol.AddBond(begin_atom_idx, end_atom_idx, bond_order)
        elif tag == '#' and len(fields) > 1:
            mol.SetTitle(fields[1])

    return obc.WriteString(mol)


if __name__ == '__main__':
    # Close the input file deterministically instead of leaking the handle.
    with open('gaston.out') as infile:
        txt = infile.read()

    # One record runs from a '#' up to (not including) the next '#' or the
    # end of the text; re.S lets '.' span the newlines inside a record.
    record_re = re.compile(r'#.+?(?=(#|$))', re.S)
    for match in record_re.finditer(txt):
        # [:-1] drops the final character of the converter output
        # (presumably its trailing newline) so print does not double it.
        print(convert(match.group())[:-1])

12012010 chemoinformatics Python openbabel GASTON

SDFからGASTON用のファイルを作る

gSpanやGASTONの入力はラベル付きのvertexとedgeなので、openbabelでsdfを読み込んで、GASTONのinputに変換するものを作ってみた。

import openbabel as ob

def convert(sdf):
    """Print every molecule of an SDF file as a labelled graph.

    ``sdf`` is the path handed to ``OBConversion.ReadFile``.  For each
    molecule read, the following lines are printed:

    * ``t # <n>``          -- record header, ``n`` counts molecules from 0,
    * ``v <i> <Z> ``       -- one per atom: 0-based position and atomic number,
    * ``e <a> <b> <order>`` -- one per bond: begin/end atom index (each
      reduced by one) and the bond order.

    Always returns ``True``, including when no molecule could be read.
    """
    obc = ob.OBConversion()
    obc.SetInAndOutFormats('sdf', 'smi')

    mol = ob.OBMol()
    # Named has_mol rather than `next` so the builtin is not shadowed.
    has_mol = obc.ReadFile(mol, sdf)
    molnum = 0
    while has_mol:
        # Single-argument print(...) emits the same text on Python 2 and 3.
        print("t # %d" % molnum)

        for i, atom in enumerate(ob.OBMolAtomIter(mol)):
            print("v %d %d " % (i, atom.GetAtomicNum()))

        for bond in ob.OBMolBondIter(mol):
            # Atom indices from the bond are reduced by one so that they
            # line up with the 0-based vertex numbers printed above.
            print("e %d %d %d" % (bond.GetBeginAtomIdx() - 1,
                                  bond.GetEndAtomIdx() - 1,
                                  bond.GetBondOrder()))

        # Start from a fresh molecule object for the next record.
        mol = ob.OBMol()
        has_mol = obc.Read(mol)

        molnum += 1
    return True

if __name__ == "__main__":
    # Run the conversion on the sample SDF file; convert() prints the result.
    convert('pubchem_sample.sdf')

SDFはPubChemから適当に選んだが、CID: 16757835 は外しておいた。

./gaston 11 pubchem_data pubchem_out

GASTONを実行した出力の一部

アロマティックな結合には、3のように別のラベルを与える必要がある気はするが……

後は、GASTONの出力ファイルから構造を構築するものを用意すれば、頻出する部分構造を扱えるようになる。

Drkcore

GASTONの出力ファイルをsmilesに変換する

SDFからGASTON用のファイルを作る