Drkcore

12082010 chemoinformatics Python

Gastonを使ってMCSを求める

openbabelにはMaximum Common Substructure(MCS)を求めるメソッドがないのでGastonを使った。

似たようなのは昔書いたけど扱いにくいのでクラスにしといた。ついでにMCSSTanimotoも求めるようにした。

MCSSTanimoto(A,B) = 　MCS /(A + B - MCS) #全てヘテロ原子数

という、MCS版のタニモト距離。例えばOc1ccccc1 とCc1ccccc1のMCSTanimotoは

6 /(7 + 7 - 6) = 0.75

スキャフォールドが決まっている場合には割と手軽に使えんじゃないかなぁ。まぁヘテロの性質無視して同一性だけで判断しているのでハロゲンの違いは割と似ているとかそういう補正は入れたほうがいいのかもしれないが。

import openbabel as ob
import os,re
from tempfile import mkstemp

class Gaston:
    def __init__(self,mols=[]):
        self.mols = mols

    def writefile(self,filename):
        """ convert file to gaston format"""
        f = open(filename, 'w')
        for (molnum, mol) in enumerate(self.mols):
            f.write("t # %d\n" % molnum)
            for i,atom in enumerate(ob.OBMolAtomIter(mol)):
                f.write("v %d %d\n" % (i,atom.GetAtomicNum()))
            for i,bond in enumerate(ob.OBMolBondIter(mol)):
                f.write("e %d %d %d\n" % (bond.GetBeginAtomIdx()-1,bond.GetEndAtomIdx()-1,bond.GetBondOrder()))

        return filename

    def readfile(self,file):
        txt = open(file).read()
        p = re.compile('#.+?(?=(#|$))',re.S)
        mols = p.finditer(txt)

        result = [self._gas2ob(gas.group()) for gas in mols]
        return result

    def _gas2ob(self,gf):
        mol = ob.OBMol()

        for l in gf.split('\n'):
            if len(l) > 0 and l[0] == 'v':
                a = mol.NewAtom()
                atomic_num = int(l.split(' ')[2])
                a.SetAtomicNum(atomic_num)
            elif len(l) > 0 and l[0] == 'e':
                begin_atom_idx = int(l.split(' ')[1]) + 1
                end_atom_idx = int(l.split(' ')[2]) + 1
                bond_order = int(l.split(' ')[3])
                b = mol.AddBond(begin_atom_idx, end_atom_idx, bond_order)
            elif len(l) > 0 and l[0] == '#':
                title = l.split(' ')[1]
                mol.SetTitle(title)
        return mol

    def search(self,freq=None):
        if freq == None: freq = len(self.mols)

        (m,gasfile) = mkstemp()
        (n,output) = mkstemp()
        self.writefile(gasfile)
        os.system("/opt/local/bin/gaston %d %s %s > /dev/null 2>&1" % (freq,gasfile,output))
        mols = self.readfile(output)
        os.unlink(gasfile)
        os.unlink(output)
        return mols

    def mcs(self):
        substructures = self.search()
        mcs = substructures[0]
        for substructure in substructures[1:]:
            if substructure.NumAtoms() > mcs.NumAtoms():
                mcs = substructure
            elif substructure.NumAtoms() == mcs.NumAtoms():
                if substructure.NumBonds() > mcs.NumBonds():
                    mcs = substructure
        return mcs

class MCSSTanimoto:
    def __init__(self,mol1,mol2):
        self.mol1 = mol1
        self.mol2 = mol2

    def score(self):
        mcs = Gaston(mols=[self.mol1,self.mol2]).mcs()
        return float(mcs.NumAtoms()) / (mol1.NumAtoms() + mol2.NumAtoms() - mcs.NumAtoms())


if __name__ == '__main__':
    import sys
    obc = ob.OBConversion()
    obc.SetInAndOutFormats("smi", "smi")

    mol1 = ob.OBMol()
    obc.ReadString(mol1, "CCC1=CC=CS1")
    mol2 = ob.OBMol()
    obc.ReadString(mol2, "OCC1=CC=CS1")
    mol3 = ob.OBMol()
    obc.ReadString(mol3, "CCCC1=CC=CS1")

    gaston = Gaston(mols=[mol1,mol2,mol3])
    print ":::Frequent Structures:::"
    for m in gaston.search():
        sys.stdout.write(obc.WriteString(m))
    print ":::MCS:::"
    sys.stdout.write(obc.WriteString(gaston.mcs()))
    print ":::MCSSTanimoto:::"
    score = MCSSTanimoto(mol1,mol2).score()
    print score

実行結果

:::Frequent Structures:::
CC  3
CC=C    3
C(=C)C=C    3
C(=C)C=CS   3
CC(=C)S 3
CC=CC   3
CC(=CC)S    3
CC=CC=C 3
CC(=CC=C)S  3
Cc1cccs1    3
CC=CC=CS    3
CC=CS   3
CC=CSC  3
c1ccsc1 3
CC=C(SC)C   3
CC=CSCC 3
CCS 3
CCSC    3
CC(=C)SC    3
CCSC=C  3
CC(=C)SC=C  3
C=C 3
C=CS    3
C=CSC   3
C=CSC=C 3
CS  3
CSC 3
:::MCS:::
Cc1cccs1    3
:::MCSSTanimoto:::
0.75

あと実際にモジュール化する場合にはコマンドがどこにあるかわからないし、もしかしたたらインストールされてないかもしれないけど、そういう場合にどういう感じで処理すればいいんだろうか？whichで調べるとかすんのかな？

07082010 Python Flask

Flask+jQuery UIでAutocomplete

Flaskはjsonで返すメソッドがあるのでjQueryとの親和性は高そうな感じ。Autocomplete(suggest機能)が欲しかったので色々調べた結果、jQuery UIのがよさそうだったのでこれで。

GoogleのAJAX Libraries APIを使えばローカルにライブラリ置いておかなくてもよいので楽ちん。タグのリストはあえてJSONで取得するようなサンプルにしてみた。ちなみに実際に作っているモノはSQLiteでデータベースからselectした結果を返すようにしている。

下のコードだけで動く。

from flask import Flask, jsonify

app = Flask(__name__)

@app.route("/")
def hello():
    response = """<html><head>
<link rel="stylesheet" href="http://ajax.googleapis.com/ajax/libs/jqueryui/1.8.3/themes/base/jquery-ui.css" type="text/css" media="all" />
<script src="http://ajax.googleapis.com/ajax/libs/jquery/1.4.2/jquery.min.js" type="text/javascript"></script>
<script src="http://ajax.googleapis.com/ajax/libs/jqueryui/1.8.3/jquery-ui.min.js" type="text/javascript"></script>         
<script type="text/javascript">
$(function() {$.getJSON("/tags", function(json){$("#tags").autocomplete({ source: json.tags});});});
</script></head>
<body><div class="demo"><div class="ui-widget"><label for="tags">Tags: </label><input id="tags"></div></div></body></html>
"""
    return response

@app.route("/tags")
def show_tags():
    tags = ["ActionScript", "AppleScript", "Asp", "BASIC", "C", "C++", "Clojure", "COBOL",
            "ColdFusion", "Erlang", "Fortran", "Groovy", "Haskell", "Java", "JavaScript", 
            "Lisp", "Perl", "PHP", "Python", "Ruby", "Scala", "Scheme"];
    return jsonify(tags = tags)

if __name__ == "__main__":
    app.run()

実際に動かしてみたらタグを複数選択することが出来なくて、自分の欲しいものとはちょっと違うので、AutoSuggest使うかも。

あとautocompleteはリストじゃなくてURLを指定することもできるんだけど、Flask側でJSONのリストだけを返す方法がわからなくて困った。必ずハッシュで返ってきちゃうのかな？

07082010 chemoinformatics R Python ggplot2

化合物の類似性ネットワークのために閾値とedgeの数をプロットする。

昨日の化合物の類似性ネットワークのやつは何がメリットなのかなぁと考えてみたけど、一つにはedgeの数を自由に決められるあたりなんじゃないかと。あんまり低い閾値だとなんでもかんでもつながって何見てるかわからんし、逆に厳しい閾値にしちゃうとシングルトンばかりになっちゃうし。

というわけで、プロット欠かせてそれ見て閾値決める必要はあるんだろうなと。最終的にはeddgesの数とか、ノードあたりに生えているedgeの数の平均とかを統計的に処理して自動で閾値を決めるのが楽なんだろうけど、、、（一般的な方法あるのかな？）。X-meansに倣って、BICとかAIC使ってみようかなぁ。

で、プロット描いた。最初に書いたコードがナイーブすぎて遅くていらっときたので書き直した。まだかなり遅いけど、耐えられるのでこれでよしとした。

import pybel

mols = list(pybel.readfile("sdf", "pc_sample.sdf"))
fps = [x.calcfp() for x in mols] 

similarity_matrix = []
for i in range(len(fps)):
    for j in range(i+1,len(fps)):
        similarity = fps[i] | fps[j]
        similarity_matrix.append(similarity)

threshold = 0.0
print "threshold\tnodes"
while threshold < 100:
    t = threshold / 100.0
    new_matrix = [e for e in similarity_matrix if e > t]
    nodes = len(new_matrix)
    print "%2.1f\t%d" % (threshold, nodes)

    similarity_matrix = new_matrix
    threshold += 0.1

あとはRで

library(ggplot2)
nodes <- read.table("xxxx",header=T)
qplot(threshold,edgess, data=nodes, log="y")
dev.copy(device=png,filename="xxxxxx")
dev.off()

nuedges

node数が1158なのでedgeの数が10000から100あたりの間でちょうどいい閾値を探したいんだけど、これだけじゃわからん。やっぱシングルトンの数とかノードあたりに生えているエッジ数の平均とか数えないとだめかな。

06082010 chemoinformatics Python cytoscape

化合物の類似性でネットワークを構築してCytoscapeで見てみる

chemoinformaticsでもネットワークアナリシスは重要だと思う。今はマイナーだけど今後精力的に研究されんじゃないかなぁと。

cytoscape

pybel使うとシミラリティベースのネットワークは簡単に作れる。

import pybel

mols = list(pybel.readfile("sdf", "pc_sample.sdf"))
fps = [x.calcfp() for x in mols] 

for i in range(len(fps)):
    ids = []
    for j in range(i,len(fps)): 
        if (fps[i] | fps[j]) > 0.5 and i != j: 
            ids.append(mols[j].title)
    print "%s %s %s" % (mols[i].title, "sim", " ".join(ids))

SIF(simple interaction format)形式なので、拡張子sifでcytoscapeに読み込まさせればOK。

あとでケモインフォクックブックにも載せておこう。

05082010 Python openbabel

Pythonで演算子のオーバーライド

openbabelのPythonのとこ見てたらFingerprintsのとこで

>>> print fps[0] | fps[1] # Print the Tanimoto coefficient
0.3333

タニモト距離を求めるように、メソッドオーバーライドしてんのかと。コードを眺めた。

class Fingerprint(object):
    def __init__(self, fingerprint):
        self.fp = fingerprint
    def __or__(self, other):
        return ob.OBFingerprint.Tanimoto(self.fp, other.fp)
    @property 
    def bits(self):
            return _findbits(self.fp, ob.OBFingerprint.Getbitsperint())
    def __str__(self):
        return ", ".join([str(x) for x in self.fp])

参考

3.4.7 数値型をエミュレーションする

Python クックブック第2版
Alex Martelli,Anna Martelli Ravenscroft,David Ascher
オライリー・ジャパン / ￥ 4,410 ()
在庫あり。

04082010 Python Flask

FlaskでJSON出力

jsonifyっていう関数が用意されているので楽ちん

@app.route('/json/tags')
def show_tags():
    tags = db_session.query(Tag).all()
    return jsonify(tags = [tag.name for tag in tags if len(tag.entries) > 0])

JavaScript: The Good Parts ―「良いパーツ」によるベストプラクティス
Douglas Crockford
オライリージャパン / ￥ 1,890 ()
在庫あり。

02082010 Python Flask

FlaskでRSSを出力する

make_responseでresponseオブジェクトを作って、Content-Typeを設定する。

from flask import Flask

app = Flask(__name__)

@app.route("/")
def hello():
    response = app.make_response(rss)
    response.headers['Content-Type'] = 'application/rss+xml'
    return response

if __name__ == "__main__":
    app.run()

RSSの生成には、ElementTreeを使うかエキスパートPythonプログラミングにあるようにテンプレートを使う。

エキスパートPythonプログラミング
Tarek Ziade
アスキー・メディアワークス / ￥ 3,780 ()
在庫あり。

01082010 Python Flask SQLAlchemy

SQLAlchemyのmany-to-manyの検索のやりかたがわからない

なぜかFlaskではなくSQLAlchemyではまる

よくあるタグとエントリーの多対多のテーブル。

entry_tags = Table('entry_tags', Base.metadata,
    Column('entry_id', Integer, ForeignKey('entries.id')),
    Column('tag_id', Integer, ForeignKey('tags.id'))
)

class Tag(Base):
    __tablename__ = 'tags'
    id      = Column(Integer, primary_key=True)
    name    = Column(String(128), unique=True)
    entries = relation("Entry", secondary=entry_tags)

class Entry(Base):
    __tablename__    = 'entries'
    id               = Column(Integer, primary_key=True)
    title            = Column(String(128), unique=True)
    content          = Column(Text())
    pubdate          = Column(DateTime)
    tags             = relation("Tag", secondary=entry_tags)

このとき、タグをもつエントリを検索したい。単にエントリ最新10件をとってきたい場合には

entries = db_session.query(Entry).order_by(Entry.pubdate.desc()).limit(10)

でいい。

続いて、あるタグがふられているエントリの最新10件をとってきたい場合にどう書いていいか悩んだ挙句、結局わからなかったのでINを使うことにした。

tag_ids = [t.id for t in db_session.query(Tag).filter(Tag.name == tagname).first().entries]
entries = db_session.query(Entry).filter(Entry.id.in_(tag_ids)).order_by(Entry.pubdate.desc()).limit(10)

filterかfilter_byに何入れればいいんだろう？

Essential Sqlalchemy
Rick Copeland
Oreilly & Associates Inc / ￥ 3,370 ()
通常1～3週間以内に発送

2010.08.01 追記

relationでorder_byを設定すればよかった。

entries = relation("Entry", secondary=entry_tags, order_by=Entry.pubdate.desc)

でもって

entries = db_session.query(Tag).filter(Tag.name == tagname).first().entries

とすれば、あるタグを含むエントリを日付の最近の順に取ってくる。

31072010 Python

PythonでUnix Time

datetimeじゃなくてtimeモジュールを使う

>>> from time import time
>>> time()
1280575552.882154
>>> int(time())
1280575598

31072010 R Python

PypeR

the Journal of Statistical SoftwareにPypeRというpythonからRを使うパッケージの論文が出てた。

Most Bioconductor tools are presented as R packages. Therefore, R can be a good complement to Python in bioinformatics computation.

メモリの使用量が低く抑えられる感じ？RPyやRPy2とかと比較している

でも、なんかpythonの中でRの構文埋め込んでるみたいな感じだなぁ。PypeRってのはそういうことなのかな。

Bioinformatics Programming Using Python
Mitchell L. Model
Oreilly & Associates Inc / ￥ 5,584 ()
在庫あり。

Statistical Bioinformatics: with R
Sunil K. Mathur
Academic Press / ￥ 5,605 ()
在庫あり。

Building Bioinformatics Solutions: With Perl, R and Mysql
Conrad Bessant,Ian Shadforth,Darren Oakly
Oxford Univ Pr (Txt) / ￥ 4,163 ()
在庫あり。

R Programming for Bioinformatics (Chapman & Hall/Crc Computer Science & Data Analysis)
Robert Gentleman
Crc Pr I Llc / ￥ 6,532 ()
通常2～5週間以内に発送

もう5年目(wishlistありマス♡)
最近はPythonとDeepLearning
日本酒自粛中
ドラムンベースからミニマルまで
ポケモンGOゆるめ