Welcome
This site is for analyzing RNA-sequence using tophat, cufflinks and sequgio. Mostly the examples are given using UPPMAX (http://www.uppmax.uu.se/) facilities.
Data Preparation
- Fastq files
- Human Reference Genome (Fasta file)
- human reference genome annotation database (B37 from EnsEMBL or hg19) (gtf file)
Alignment
#!/bin/bash -l
#SBATCH -A b2012036#SBATCH -p node -n 8#SBATCH -t 40:00:00#SBATCH –mail-user=user@ki.se#SBATCH –mail-type=ALL#SBATCH -J tophatmodule load bioinfo-toolsmodule load tophat/1.4.0tophat -o INBOX/BRCA/batch1/tophat.output.SRR327626 -p 8 –no-novel-juncs –library-type=fr-unstranded -G reference/genes.gtf reference/BowtieIndex/genome INBOX/BRCA/batch1/SRR327626_1.fastq.gz INBOX/BRCA/batch1/SRR327626_2.fastq.gz
Cufflinks
module load bioinfo-toolsmodule load cufflinks/2.0.2cufflinks -o INBOX/BRCA/batch1/Cuffres.SRR327626 -p 8 -G reference/genes.gtf -b reference/BowtieIndex/genome.fa INBOX/BRCA/batch1/tophat.output.SRR327626/accepted_hits.bam
topcuf_batch7sc.txt
):#!/bin/bash -l#SBATCH -A b2012036#SBATCH -p node -n 8#SBATCH -t 60:00:00#SBATCH –mail-user=user@ki.se#SBATCH –mail-type=ALL#SBATCH -J TCB7scmodule load bioinfo-toolsmodule load tophat/1.4.0module load cufflinks/2.0.2tophat -o /scratch/tophat.outputsc.$1 \-p 8 –library-type=fr-unstranded \-G reference/genes.gtf reference/BowtieIndex/genome \INBOX/BRCA/batch7/$1_1.fastq.gz \INBOX/BRCA/batch7/$1_2.fastq.gzcp -r /scratch/tophat.outputsc.$1 INBOX/BRCA/batch7/cufflinks -o /scratch/CuffresGsc.$1 \-p 8 -G reference/genes.gtf \-b reference/BowtieIndex/genome.fa \INBOX/BRCA/batch7/tophat.outputsc.$1/accepted_hits.bamcp -r /scratch/CuffresGsc.$1 INBOX/BRCA/batch7/
sbatch topcuf_batch7sc.txt
SRR327626
#################################
## Run Tophat and Cufflinks ###
## allowing novel transcripts ###
## BATCH 7
###
#################################
setwd('/lynx/cvol/v25/b2012036/INBOX/BRCA/')
f.long = dir(recursive=TRUE)
ffastq <- f.long[grep(".fastq.gz", f.long)]
SRR.batch7 = unique(substr(ffastq ,1,9))
write.csv2(SRR.batch7,file="SRR.batch7.csv")
setwd('
/lynx/cvol/v25/b2012036/INBOX/BRCA/')
topcufMC7sc = function(f){
cmd =paste('sbatch topcuf_batch7sc.txt', f,sep=' ')
system(cmd)
}
for (i in 1:length(
SRR.batch7) ) topcufMC7sc(SRR.batch7[i])
runReshape.R
)library(Sequgio)
dbfile <- "GRCh37.69.sqlite"
mybio <- loadDb(dbfile)
mparam <- MulticoreParam(8)
txdb <- reshapeTxDb(mybio,probelen=50L,with.junctions=T,mcpar=mparam)
save(txdb,file= "txdb37.RData")
makeDesign.R
):mparam <- MulticoreParam(16)
attr(txdb,"probelen") = 50L
Design <- makeXmatrix(txdb,method="PE",mulen=200,sdlen=80,mcpar=mparam)
save(Design, file="Design.RData")
export PYTHONPATH=/home/dhany/pysam-0.7.5/lib64/python2.6/site-packages/
cd /home/dhany/pysam-0.7.5/python setup.py install –prefix /home/dhany/pysam-0.7.5date; python /home/dhany/fixQNAME.py -i yourfile.bam -o yourfile.fixed.bam -r “/\d+$” -s “”; date
Get counts runGetcountsBatch7.R:
library(Sequgio)
mparam <- MulticoreParam(8)
load( "txdb37.RData")
loc <- "/pica/h1/setia/BRCA/INBOX/BRCA/batch7/"
files <- dir(path=loc )
files <- files[grep("tophat.outputsc",files)]
args=(commandArgs(TRUE))
args
args[[1]]
if(length(args)==0)
stop("No chromosome supplied.")
eval(parse(text=args[[1]]))
samples <- substr(files[i],17,25)
samples
target <- data.frame(filenames= paste(loc, "tophat.outputsc.", samples, "/accepted_hits.bam", sep="") ,
samplenames=samples ,
index=paste(loc, "tophat.outputsc.", samples, "/accepted_hits.bam.bai", sep=""),stringsAsFactors=FALSE)
allCounts.bigM <- getCounts(target,txdb ,mcpar=mparam,mapq.filter= 30,use.samtools=T)
allCounts <- as.matrix(allCounts.bigM[,,drop=FALSE])
save(allCounts, file= paste("/bubo/home/h1/setia/BRCA/Sequgio/batch7/allCounts.",samples,".RData", sep="") )
i
for a sample defined in (Getcountbatch7.txt
):#!/bin/bash -l
#SBATCH -A b2012036
#SBATCH -p node -n 8
#SBATCH -t 5:00:00
#SBATCH -C mem72GB
#SBATCH --mail-user=setia.pramana@ki.se
#SBATCH --mail-type=ALL
#SBATCH -J CountB7
module load bioinfo-tools
module load GATK
module load samtools/0.1.18
R < runGetcountsBatch7.R --no-save $1
sbatch Getcountbatch7.txt '--args i=1'
MultsubmitBatch7.txt
):#!/bin/bash -l
#SBATCH -A b2012036
#SBATCH -p core -n 5
#SBATCH -t 15:00 --qos=short
#SBATCH -J submit
#SBATCH --mail-user=setia.pramana@ki.se
#SBATCH --mail-type=ALL
##################
p="--args"
u=" i="
v="sbatch Getcountbatch7.txt"
# sbatch MultsubmitBatch7.txt #
for i in {1..40}
do
echo $v \'$p$u$i\'
eval $v \'$p$u$i\'
done
sbatch
MultsubmitBatch7.txt
r
unGetcountsBatch7.R
library(Sequgio)
setwd('/pica/h1/setia/BRCA/INBOX/Sequgio_TCGA/')
load('/pica/h1/setia/BRCA/INBOX/BRCA/txdb37.RData')
loc <- "/pica/h1/setia/BRCA/INBOX/Sequgio_TCGA/batch7/"
files <- dir(path=loc )
files <- files[grep("allCounts",files)]
allCountsMat <- NULL
for (i in 1:length(files)) {
load(paste(loc ,files[i],sep="" ) )
allCountsMat <- cbind(allCountsMat, allCounts )
rm(allCounts )
cat(i)
}
load('/pica/h1/setia/BRCA/Design37.RData')
## Fit Models ##
library(parallel)
gNames <- sapply(Design, function(x) strsplit(attributes(x)$dimnames[[2]][1], '__')[[1]][2])
names(gNames) <- gNames
names(Design) <- gNames
Thetas <- mclapply(gNames,fitModels,design=Design,counts=allCountsMat ,maxit=20,verbose=T, useC=F, ls.start=F, Q1=0.9)
save(Thetas , file='Thetas.batch7.RData')
/lynx/cvol/v25/b2012036/INBOX/Sequgio_TCGA
library(Sequgio)
dbfile <- "/proj/b2012036/GRCh37.69.sqlite"
mybio <- loadDb(dbfile)
mparam <- MulticoreParam(8)
txdb <- reshapeTxDb(mybio,probelen=50L,with.junctions=T,mcpar=mparam)
save(txdb,file="/proj/b2012036/Dhany/Sequgio/txdbgrch.RData")
write.table(as.data.frame(txdb@unlistData), "/proj/b2012036/Dhany/Sequgio/txdb.sql", sep="\t")
db <- dbConnect(SQLite(), dbname=”/proj/b2012036/Dhany/Sequgio/grch3769.sqlite”)dbWriteTable(conn=db, name=”humangenome”, value=”txdb.sql”, row.names=FALSE, header=FALSE, sep=”\t”)
library(Sequgio)load(“/proj/b2012036/Dhany/Sequgio/txdbgrch.RData”)mparam <- MulticoreParam(16)
attr(txdb,"probelen") = 50L
Design <- makeXmatrix(txdb,method="PE",mulen=200,sdlen=80, mcpar=mparam)
save(Design, file="/proj/b2012036/Dhany/Sequgio/DesignGrch.RData")
cd /proj/b2012036/INBOX/Dhany/newdata/tophat.outputsc.SRR328008/bash /proj/b2012036/Dhany/Sequgio/preprocess.sh file=accepted_hits.bamfor (( i=1; i<=22; i++ )); do awk -v j=$i ‘{ if($3==j) print $0 }’ accepted_hits.bam.sortMA > accepted_hits.$i & doneawk ‘{ if($3==”X”) print $0 }’ accepted_hits.bam.sortMA > accepted_hits.X &awk ‘{ if($3==”Y”) print $0 }’ accepted_hits.bam.sortMA > accepted_hits.Y &waitfor (( i=1; i<=22; i++ )); do python /proj/b2012036/Dhany/Sequgio/getPairCount.py accepted_hits output.$i /proj/b2012036/Dhany/Sequgio/grch3769.sqlite $i & donepython /proj/b2012036/Dhany/Sequgio/getPairCount.py accepted_hits output.X /proj/b2012036/Dhany/Sequgio/grch3769.sqlite X &python /proj/b2012036/Dhany/Sequgio/getPairCount.py accepted_hits output.Y /proj/b2012036/Dhany/Sequgio/grch3769.sqlite Y &waitcat output.* > outputfinal.txtrm -f output.*for (( i=1; i<=22; i++ )); do rm -f accepted_hits.$i & donerm -f accepted_hits.Xrm -f accepted_hits.Y
library(Sequgio)load(“txdbgrch.RData”)ex_list <- split(values(txdb@unlistData)$exon_name,values(txdb@unlistData)$tx_name)reg_vec <- sapply(split(values(txdb@unlistData)$region_id,values(txdb@unlistData)$tx_name),function(x) x[1])sizes_ex_list <- sapply(ex_list,length)n.exons <- sum((sizes_ex_list^2+sizes_ex_list)/2)exons.names <- unique(.Call("makeExNames",ex_list,reg_vec,as.integer(n.exons)))
write.table(exons.names, “exons19.txt”, row.names=FALSE, col.names=FALSE, quote=FALSE, sep=”\t”)
python /proj/b2012036/Dhany/Sequgio/makeAllcountMatrix.py /proj/b2012036/Dhany/Sequgio/exons.txt 5 /proj/b2012036/INBOX/Dhany/newdata/tophat.output.SRR327626/output.txt /proj/b2012036/INBOX/Dhany/newdata/tophat.output.SRR327734/outputfinal.txt /proj/b2012036/INBOX/Dhany/newdata/tophat.output.SRR327735/outputfinal.txt /proj/b2012036/INBOX/Dhany/newdata/tophat.output.SRR327736/outputfinal.txt /proj/b2012036/INBOX/Dhany/newdata/tophat.output.SRR327737/outputfinal.txt SRR327626 SRR327734 SRR327735 SRR327736 SRR327737 > sample.counts;
bash /proj/b2012036/Dhany/Sequgio/batch_fastfitting.sh <size of Design matrix> <Design matrix.RData> <allCounts> <proj number> <emails for slurm status> <number of cores per job> <number of Design matrix size to process per batch> <your uppmax username>Play around with <number of cores per job> and <number of Design matrix size to process per batch> to get better performance.
bash
/proj/b2012036/Dhany/Sequgio/batch_fastfitting.sh31700 /proj/b2012036/Dhany/Sequgio/DesignGrch.RData /proj/b2012036/INBOX/Dhany/newdata/sample.count.grch b2012036 dhany.saputra@ki.se 4 500 dhanycat myR.* > myfpkm.txt