Drkcore

import rpy2.robjects.lib.ggplot2 as ggplot2 from rpy2.robjects import r from rpy2.robjects.packages import importr base = importr('base') datasets = importr('datasets') mtcars = datasets.mtcars pp = ggplot2.ggplot(mtcars) + \ ggplot2.aes_string(x='wt', y='mpg', col='factor(cyl)') + \ ggplot2.geom_point() + \ ggplot2.geom_smooth(ggplot2.aes_string(group='cyl'), method='lm') #pp.plot() r['ggsave'](file="mypng.png", plot=pp) #r['dev.off']()

fx <- function(x){x^4-x^2+6} sfx <- function(x){12/(abs(x)+1)} gx <- function(x){1/2*cos(6*x)+7/2} plot(gx,-2,2,xlim=c(-7,7),ylim=c(0,7),xlab="",ylab="") par(new=T) plot(sfx,-7,-1,xlim=c(-7,7),ylim=c(0,7),axes=F,xlab="",ylab="") par(new=T) plot(sfx,1,7,xlim=c(-7,7),ylim=c(0,7),axes=F,xlab="",ylab="") par(new=T) plot(fx,-1,1,xlim=c(-7,7),ylim=c(0,7),axes=F,xlab="",ylab="")

gx <- function(x){ x[-2>x | x>2] <-NA 1/2*cos(6*x)+7/2 } fx <- function(x){ x[x<(-1)] <- NA x[x>1] <- NA x^4-x^2+6 } fx <- function(x){ x[-1<x & x<1] <-NA 12/(abs(x)+1) } ggplot(data.frame(x=c(-7,7),y=0), aes(x,y))+stat_function(fun=gx) \ +stat_function(fun=fx)+stat_function(fun=sx)

07082010 chemoinformatics R Python ggplot2

化合物の類似性ネットワークのために閾値とedgeの数をプロットする。

昨日の化合物の類似性ネットワークのやつは何がメリットなのかなぁと考えてみたけど、一つにはedgeの数を自由に決められるあたりなんじゃないかと。あんまり低い閾値だとなんでもかんでもつながって何見てるかわからんし、逆に厳しい閾値にしちゃうとシングルトンばかりになっちゃうし。

というわけで、プロット欠かせてそれ見て閾値決める必要はあるんだろうなと。最終的にはeddgesの数とか、ノードあたりに生えているedgeの数の平均とかを統計的に処理して自動で閾値を決めるのが楽なんだろうけど、、、（一般的な方法あるのかな？）。X-meansに倣って、BICとかAIC使ってみようかなぁ。

で、プロット描いた。最初に書いたコードがナイーブすぎて遅くていらっときたので書き直した。まだかなり遅いけど、耐えられるのでこれでよしとした。

import pybel

mols = list(pybel.readfile("sdf", "pc_sample.sdf"))
fps = [x.calcfp() for x in mols] 

similarity_matrix = []
for i in range(len(fps)):
    for j in range(i+1,len(fps)):
        similarity = fps[i] | fps[j]
        similarity_matrix.append(similarity)

threshold = 0.0
print "threshold\tnodes"
while threshold < 100:
    t = threshold / 100.0
    new_matrix = [e for e in similarity_matrix if e > t]
    nodes = len(new_matrix)
    print "%2.1f\t%d" % (threshold, nodes)

    similarity_matrix = new_matrix
    threshold += 0.1

あとはRで

library(ggplot2)
nodes <- read.table("xxxx",header=T)
qplot(threshold,edgess, data=nodes, log="y")
dev.copy(device=png,filename="xxxxxx")
dev.off()

nuedges

node数が1158なのでedgeの数が10000から100あたりの間でちょうどいい閾値を探したいんだけど、これだけじゃわからん。やっぱシングルトンの数とかノードあたりに生えているエッジ数の平均とか数えないとだめかな。

library(ggplot2) data <- read.csv("/Users/kzfm/mybp.csv",header=FALSE) time <- strptime(data$V1, "%Y-%m-%d") high <- data$V2 low <- data$V3 mybp <- data.frame(time=time,high=high,low=low) p <- qplot(data=mybp,time,high) p + stat_smooth() + stat_smooth(colour="green", method="lm", formula=y~poly(x,9), size=1,se=FALSE)

Drkcore

rpy2+ggplot2

Rで富士山関数

化合物の類似性ネットワークのために閾値とedgeの数をプロットする。

ここ数年分の血圧データをggplot2で