Establishing Transcriptional Regulatory Network

Quality Control

First we need to check each database to find if the TF is the correct one
we have 101 id and its TFid, we need to blaste with whole genome sequence to find its corresponding 261 id
check the gene location to see if the TF is coded by this gene
# build reference database

makeblastdb -in Niben261_genome.annotation.proteins.fasta -dbtype prot -out Nben261_protein_db &
#makeblastdb -in Niben101_annotation.proteins.fasta -dbtype prot -out Nben101_protein_db &

# 101 gene id --> seq

conda install -c bioconda seqkit
seqkit grep -f gene_id Niben101_annotation.proteins.fasta -o seq_101.fasta

# seq --> 261 gene id

blastp -query seq_101.fasta -db ./ref/Nben261_protein_db -out out_blast -outfmt 6 -evalue 1e-10 -num_threads 6 -num_alignments 3


# get gene.gff

cat Niben261_genome.annotation.gene_models.gff | awk -v OFS="\t" '{if ($3=="gene") {print $1,$2,$3,$4,$5,$6,$7,$8,$9}}' > Nb_gene.gff

# get gene.bed

awk '{OFS="\t"}{split($9,a,";"); split(a[1],b,"=")}{print $1"\t"$4"\t"$5"\t"$7"\t"b[2]}' Nb_gene.gff > Nb_TF.bed

#########

ls ./peak/*narrowPeak|while read id;
do
label=$(basename $id "_peaks.narrowPeak")
echo $label
sort -n -r -k 7 $id | head -10 > ./overlap/$label.tmp
bedtools intersect -a Nb_TF.bed -b ./overlap/$label.tmp -wa -wb > ./overlap/$label.genebody_overlap.txt
done &

SRR15036556_peaks.narrowPeak
sort -n -r -k 7 ./peak/SRR15036556_peaks.narrowPeak | head -10 > ./overlap/SRR15036556.tmp
bedtools intersect -a Nb_TF.bed -b ./overlap/SRR15036556.tmp -wa -wb > ./overlap/SRR15036556.genebody_overlap.txt
rm ./overlap/$label.narrowpeak.tmp

### ./overlap contains no best genes

Find Target Gene

Here we need to find target genes of each TF with two replication
using the method of TIP, the original essay has been put below, along with the reference dateset
we need to rewrite the code to fit our Nb-genome
# wigfilename: the name of the wiggle file
# annotationfilename: the names of annotation file hg18
# width: the window size upstream
## smooth: Logic value, if smooth=T perform smoothing for the TF binding profile;

#####
peak2weight<-function(peakfilename, annotationfilename, width=10000, smooth=T, outfilename)
{
chr.len = c(186295347,151625522,194605305,189815799,138451958,148966868,152648067,
137820123,137640743,138251820,143669543,158218613,153652170,153778118,
153112746,148817306,155116228,148425545,148948562)
chr = sprintf("%02d", 1:19)
chr.nam = paste("Niben261Chr", chr, sep = "")
# chr.len = chr.len[1:(length(chr.len)-2)]
# chr.nam = chr.nam[1:(length(chr.nam)-2)]
# read in gene info
mygene = read.table(annotationfilename, sep="\t", header=F, colClasses=c(rep("character", 3), rep("numeric", 2)))
# tag = substr(mygene[,1], 1, 3)
# mygene = mygene[tag=="NM_", 1:5]
colnames(mygene) = c("name", "chr", "str", "sta", "end")
tmp = mygene$sta
tmp1 = tmp-width
tmp2 = tmp+width
tag = mygene[,3]=="-"
tmp[tag==1] = mygene[tag==1,5]
tmp1[tag==1] = tmp[tag==1]+width
tmp2[tag==1] = tmp[tag==1]-width
mygene$sta = tmp1
mygene$end = tmp2
for(k in 1:length(chr.nam))
{
tag = mygene$chr==chr.nam[k]
tmp = mygene$sta[tag==1]
tmp[tmp<1] = 1
tmp[tmp>chr.len[k]] = chr.len[k]
mygene$sta[tag==1] = tmp
tmp = mygene$end[tag==1]
tmp[tmp<1] = 1
tmp[tmp>chr.len[k]] = chr.len[k]
mygene$end[tag==1] = tmp
}
# read in peak file
data = read.table(peakfilename, sep="\t", header=F)
colnames(data) = c("chr", "height", "start", "end", "numtags", "max_ht_offset")
## create weight file
myw = rep(0, width*2+1)
for(k in 1:length(chr.nam))
{
cat("\r", chr.nam[k])
# signal vector
read.cov = rep(0, chr.len[k])
tag = data$chr==chr.nam[k]
if(sum(tag)==0) next
mysig = data[tag==1,]
for(i in 1:nrow(mysig))
{
tmp1 = as.numeric(mysig$start[i])+1 # start line
tmp2 = as.numeric(mysig$end[i]) # end line
read.cov[tmp1:tmp2] = read.cov[tmp1:tmp2] + as.numeric(mysig$height[i])
}
# refseq
curgene = mygene[mygene[,2]==chr.nam[k],]
for(i in 1:nrow(curgene))
{
myw = myw+read.cov[curgene[i,4]:curgene[i,5]]
}
}
myw = myw/nrow(mygene)
tmp = myw
if(smooth==T)
{
for(i in 1:length(myw))
{
myw[i] = mean(tmp[max(i-250,1):min(i+250, length(myw))])
}
}
myw = myw/sum(myw)
write.table(myw, outfilename, sep="\t", row.names=F, col.names=F, quote=F)
}
#####

calscoreonpeak<-function(peakfilename, annotationfilename, weightfilename, outfilename)
{
chr.len = c(186295347,151625522,194605305,189815799,138451958,148966868,152648067,
137820123,137640743,138251820,143669543,158218613,153652170,153778118,
153112746,148817306,155116228,148425545,148948562)
chr = sprintf("%02d", 1:19)
chr.nam = paste("Niben261Chr", chr, sep = "")
# read in weight
conIn = file(weightfilename, "r")
myw = readLines(conIn)
close(conIn)
myw = as.numeric(myw)
width = (length(myw)-1)/2
# read in gene info
mygene = read.table(annotationfilename, sep="\t", header=F, colClasses=c(rep("character", 3), rep("numeric", 2)))
# tag = substr(mygene[,1], 1, 3)
# mygene = mygene[tag=="NM_", 1:5]
colnames(mygene) = c("name", "chr", "str", "sta", "end")
# read in peak file
data = read.table(peakfilename, sep="\t", header=F)
colnames(data) = c("chr", "height", "start", "end", "numtags", "max_ht_offset")
# data$chr = paste("chr", data$chr, sep="")
## calculate score
mysco = rep(0, nrow(mygene)) #TSS
gname = rep("",nrow(mygene))
count = 0
for(k in 1:length(chr.nam))
{
cat("\r", chr.nam[k])
# signal vector
read.cov = rep(0, chr.len[k])
tag = data$chr==chr.nam[k]
if(sum(tag)==0) next
mysig = data[tag==1,]
for(i in 1:nrow(mysig))
{
tmp1 = as.numeric(mysig$start[i])+1 # start line
tmp2 = as.numeric(mysig$end[i]) # end line
read.cov[tmp1:tmp2] = read.cov[tmp1:tmp2] + as.numeric(data$height[i])
}
# refseq
curgene = mygene[mygene[,2]==chr.nam[k],]
for(i in 1:nrow(curgene))
{
count = count + 1
if(curgene$str[i]=="+")
{
tmp= read.cov[max(1,curgene$sta[i]-width):min(curgene$sta[i]+width, chr.len[k])]
b1 = 1-(curgene$sta[i]-width)
b2 = curgene$sta[i]+width-chr.len[k]
if(b1>0)
{
tmp=c(rep(0, b1), tmp)
}
if(b2>0)
{
tmp=c(tmp, rep(0, b2))
}
mysco[count] = sum(tmp*myw)
}
if(curgene$str[i]=="-")
{
tmp= read.cov[min(chr.len[k],curgene$end[i]+width):max(curgene$end[i]-width, 1)]
b1 = curgene$end[i]+width-chr.len[k]
b2 = 1-(curgene$end[i]-width)
if(b1>0)
{
tmp=c(rep(0, b1), tmp)
}
if(b2>0)
{
tmp=c(tmp, rep(0, b2))
}
mysco[count] = sum(tmp*myw)
}
gname[count] = curgene[i,1]
}
}
zscore = (mysco-mean(mysco))/sd(mysco) #Z-score based onTSS
pvalue = pnorm(-zscore)
res = cbind(gname, mysco, zscore, pvalue)
colnames(res) = c("name", "raw.score", "zscore", "p.value")
res = res[order(res[,1], decreasing=T), ]
write.table(res, outfilename, sep="\t", row.names=F, quote=F)
}


And here is the circle
while IFS= read -r id; do
peakFile="${id}.peak"
weightFile="${id}_weight.txt"
scoreFile="${id}_score.txt"
Rscript -e "
setwd(\"/home/octavia/project/202304_Nb_ChIP/tip_peak/\")
source(\"tip.r\")
mypeakFile <- \"${peakFile}\"
myannoFile <- \"Nb_tip_annotation.txt\"
myoutFile <- \"${weightFile}\"
peak2weight(mypeakFile, myannoFile, width = 10000, smooth = TRUE, myoutFile)
mypeakFile <- \"${peakFile}\"
myannoFile <- \"Nb_tip_annotation.txt\"
myweightFile <- \"${weightFile}\"
myoutFile <- \"${scoreFile}\"
calscoreonpeak(mypeakFile, myannoFile, myweightFile, myoutFile)
"
done < peak.list
using this method, we can get target genes for each TF
Want to print your doc?
This is not the way.
Try clicking the ⋯ next to your doc name or using a keyboard shortcut (
CtrlP
) instead.