gffファイルをつくる関数
行数と染色体の数、それとタイプを指定してgffの中身をランダムに生成する。
# n: 行数
# chrnum: 染色体の数
# type: "protein_coding", etc.
gff.gen <- function(n,chrnum,type){
chrs <- paste("chr",1:chrnum,sep="")
source <- "R"
type <- type
dots <- rep(".",n)
chr <- sample(chrs,n,replace=T)
start <- abs(round(rnorm(n)*1000000))
end <- start+1500+(round(rnorm(n)/2*500))
strand <- sample(c("+","-"),n,replace=T,prob=c(0.5,0.5))
id <- paste("gene",1:n,sep="")
gff <- data.frame(chr,source,type,start,end,dots,strand,dots,id)
return(gff)
}
# usage
gff <- gff.gen(1000,3,"ncRNA")
# check
head(gff)
# min length
min(gff[,5]-gff[,4])
# distribution
hist(gff[,5]-gff[,4],main="gene length")
# sort.left
gff.s <- sort.left(gff)
# check
head(gff.s)
# number of each strand
nrow(gff.s[gff.s$strand=="+",])
nrow(gff.s[gff.s$strand=="-",])
関数「sort.left()」はこちらを参照。
上の「# usage」以下の実行結果。
> # usage
> gff <- gff.gen(1000,3,"ncRNA")
> # check
> head(gff)
chr source type start end dots strand dots.1 id
1 chr2 R ncRNA 272400 273456 . - . gene1
2 chr1 R ncRNA 1173705 1175235 . + . gene2
3 chr1 R ncRNA 726009 727790 . + . gene3
4 chr3 R ncRNA 2041368 2042869 . - . gene4
5 chr3 R ncRNA 607172 608515 . + . gene5
6 chr3 R ncRNA 556786 558183 . + . gene6
> # min length
> min(gff[,5]-gff[,4])
[1] 767
> # distribution
> hist(gff[,5]-gff[,4],main="gene length")
> # sort.left
> gff.s <- sort.left(gff)
> # check
> head(gff.s)
chr source type start end dots strand dots.1 id
150 chr1 R ncRNA 1017 2463 . - . gene150
378 chr1 R ncRNA 9260 11146 . - . gene378
164 chr1 R ncRNA 9922 10844 . + . gene164
93 chr1 R ncRNA 14776 15981 . + . gene93
557 chr1 R ncRNA 17269 18323 . + . gene557
863 chr1 R ncRNA 21640 22834 . - . gene863
> # number of each strand
> nrow(gff.s[gff.s$strand=="+",])
[1] 495
> nrow(gff.s[gff.s$strand=="-",])
[1] 505
0 件のコメント:
コメントを投稿