Gallery

ChIP-seq Analysis

Evaluation of ChIP-seq data

Establishing Transcriptional Regulatory Network

Explore

Establishing Transcriptional Regulatory Network

Quality Control

First we need to check each database to find if the TF is the correct one

we have 101 id and its TFid, we need to blaste with whole genome sequence to find its corresponding 261 id

check the gene location to see if the TF is coded by this gene

# build reference database

makeblastdb -in Niben261_genome.annotation.proteins.fasta -dbtype prot -out Nben261_protein_db &

#makeblastdb -in Niben101_annotation.proteins.fasta -dbtype prot -out Nben101_protein_db &

# 101 gene id --> seq

conda install -c bioconda seqkit

seqkit grep -f gene_id Niben101_annotation.proteins.fasta -o seq_101.fasta

# seq --> 261 gene id

blastp -query seq_101.fasta -db ./ref/Nben261_protein_db -out out_blast -outfmt 6 -evalue 1e-10 -num_threads 6 -num_alignments 3

# get gene.gff

cat Niben261_genome.annotation.gene_models.gff | awk -v OFS="\t" '{if ($3=="gene") {print $1,$2,$3,$4,$5,$6,$7,$8,$9}}' > Nb_gene.gff

# get gene.bed

awk '{OFS="\t"}{split($9,a,";"); split(a[1],b,"=")}{print $1"\t"$4"\t"$5"\t"$7"\t"b[2]}' Nb_gene.gff > Nb_TF.bed

#########

ls ./peak/*narrowPeak|while read id;

label=$(basename $id "_peaks.narrowPeak")

echo $label

sort -n -r -k 7 $id | head -10 > ./overlap/$label.tmp

bedtools intersect -a Nb_TF.bed -b ./overlap/$label.tmp -wa -wb > ./overlap/$label.genebody_overlap.txt

done &

SRR15036556_peaks.narrowPeak

sort -n -r -k 7 ./peak/SRR15036556_peaks.narrowPeak | head -10 > ./overlap/SRR15036556.tmp

bedtools intersect -a Nb_TF.bed -b ./overlap/SRR15036556.tmp -wa -wb > ./overlap/SRR15036556.genebody_overlap.txt

rm ./overlap/$label.narrowpeak.tmp

### ./overlap contains no best genes

Find Target Gene

Here we need to find target genes of each TF with two replication

using the method of TIP, the original essay has been put below, along with the reference dateset

⁠

https://pubmed.ncbi.nlm.nih.gov/22039215/⁠

⁠

we need to rewrite the code to fit our Nb-genome

# wigfilename: the name of the wiggle file

# annotationfilename: the names of annotation file hg18

# width: the window size upstream

## smooth: Logic value, if smooth=T perform smoothing for the TF binding profile;

#####

peak2weight<-function(peakfilename, annotationfilename, width=10000, smooth=T, outfilename)

{

chr.len = c(186295347,151625522,194605305,189815799,138451958,148966868,152648067,

137820123,137640743,138251820,143669543,158218613,153652170,153778118,

153112746,148817306,155116228,148425545,148948562)

chr = sprintf("%02d", 1:19)

chr.nam = paste("Niben261Chr", chr, sep = "")

# chr.len = chr.len[1:(length(chr.len)-2)]

# chr.nam = chr.nam[1:(length(chr.nam)-2)]

# read in gene info

mygene = read.table(annotationfilename, sep="\t", header=F, colClasses=c(rep("character", 3), rep("numeric", 2)))

# tag = substr(mygene[,1], 1, 3)

# mygene = mygene[tag=="NM_", 1:5]

colnames(mygene) = c("name", "chr", "str", "sta", "end")

tmp = mygene$sta

tmp1 = tmp-width

tmp2 = tmp+width

tag = mygene[,3]=="-"

tmp[tag==1] = mygene[tag==1,5]

tmp1[tag==1] = tmp[tag==1]+width

tmp2[tag==1] = tmp[tag==1]-width

mygene$sta = tmp1

mygene$end = tmp2

for(k in 1:length(chr.nam))

{

tag = mygene$chr==chr.nam[k]

tmp = mygene$sta[tag==1]

tmp[tmp<1] = 1

tmp[tmp>chr.len[k]] = chr.len[k]

mygene$sta[tag==1] = tmp

tmp = mygene$end[tag==1]

tmp[tmp<1] = 1

tmp[tmp>chr.len[k]] = chr.len[k]

mygene$end[tag==1] = tmp

}

# read in peak file

data = read.table(peakfilename, sep="\t", header=F)

colnames(data) = c("chr", "height", "start", "end", "numtags", "max_ht_offset")

## create weight file

myw = rep(0, width*2+1)

for(k in 1:length(chr.nam))

{

cat("\r", chr.nam[k])

# signal vector

read.cov = rep(0, chr.len[k])

tag = data$chr==chr.nam[k]

if(sum(tag)==0) next

mysig = data[tag==1,]

for(i in 1:nrow(mysig))

{

tmp1 = as.numeric(mysig$start[i])+1 # start line

tmp2 = as.numeric(mysig$end[i]) # end line

read.cov[tmp1:tmp2] = read.cov[tmp1:tmp2] + as.numeric(mysig$height[i])

}

# refseq

curgene = mygene[mygene[,2]==chr.nam[k],]

for(i in 1:nrow(curgene))

{

myw = myw+read.cov[curgene[i,4]:curgene[i,5]]

}

myw = myw/nrow(mygene)

tmp = myw

if(smooth==T)

{

for(i in 1:length(myw))

{

myw[i] = mean(tmp[max(i-250,1):min(i+250, length(myw))])

}

myw = myw/sum(myw)

write.table(myw, outfilename, sep="\t", row.names=F, col.names=F, quote=F)

}

#####

calscoreonpeak<-function(peakfilename, annotationfilename, weightfilename, outfilename)

{

chr.len = c(186295347,151625522,194605305,189815799,138451958,148966868,152648067,

137820123,137640743,138251820,143669543,158218613,153652170,153778118,

153112746,148817306,155116228,148425545,148948562)

chr = sprintf("%02d", 1:19)

chr.nam = paste("Niben261Chr", chr, sep = "")

# read in weight

conIn = file(weightfilename, "r")

myw = readLines(conIn)

close(conIn)

myw = as.numeric(myw)

width = (length(myw)-1)/2

# read in gene info

mygene = read.table(annotationfilename, sep="\t", header=F, colClasses=c(rep("character", 3), rep("numeric", 2)))

# tag = substr(mygene[,1], 1, 3)

# mygene = mygene[tag=="NM_", 1:5]

colnames(mygene) = c("name", "chr", "str", "sta", "end")

# read in peak file

data = read.table(peakfilename, sep="\t", header=F)

colnames(data) = c("chr", "height", "start", "end", "numtags", "max_ht_offset")

# data$chr = paste("chr", data$chr, sep="")

## calculate score

mysco = rep(0, nrow(mygene)) #TSS

gname = rep("",nrow(mygene))

count = 0

for(k in 1:length(chr.nam))

{

cat("\r", chr.nam[k])

# signal vector

read.cov = rep(0, chr.len[k])

tag = data$chr==chr.nam[k]

if(sum(tag)==0) next

mysig = data[tag==1,]

for(i in 1:nrow(mysig))

{

tmp1 = as.numeric(mysig$start[i])+1 # start line

tmp2 = as.numeric(mysig$end[i]) # end line

read.cov[tmp1:tmp2] = read.cov[tmp1:tmp2] + as.numeric(data$height[i])

}

# refseq

curgene = mygene[mygene[,2]==chr.nam[k],]

for(i in 1:nrow(curgene))

{

count = count + 1

if(curgene$str[i]=="+")

{

tmp= read.cov[max(1,curgene$sta[i]-width):min(curgene$sta[i]+width, chr.len[k])]

b1 = 1-(curgene$sta[i]-width)

b2 = curgene$sta[i]+width-chr.len[k]

if(b1>0)

{

tmp=c(rep(0, b1), tmp)

}

if(b2>0)

{

tmp=c(tmp, rep(0, b2))

}

mysco[count] = sum(tmp*myw)

}

if(curgene$str[i]=="-")

{

tmp= read.cov[min(chr.len[k],curgene$end[i]+width):max(curgene$end[i]-width, 1)]

b1 = curgene$end[i]+width-chr.len[k]

b2 = 1-(curgene$end[i]-width)

if(b1>0)

{

tmp=c(rep(0, b1), tmp)

}

if(b2>0)

{

tmp=c(tmp, rep(0, b2))

}

mysco[count] = sum(tmp*myw)

}

gname[count] = curgene[i,1]

}

zscore = (mysco-mean(mysco))/sd(mysco) #Z-score based onTSS

pvalue = pnorm(-zscore)

res = cbind(gname, mysco, zscore, pvalue)

colnames(res) = c("name", "raw.score", "zscore", "p.value")

res = res[order(res[,1], decreasing=T), ]

write.table(res, outfilename, sep="\t", row.names=F, quote=F)

}

And here is the circle

while IFS= read -r id; do

peakFile="${id}.peak"

weightFile="${id}_weight.txt"

scoreFile="${id}_score.txt"

Rscript -e "

setwd(\"/home/octavia/project/202304_Nb_ChIP/tip_peak/\")

source(\"tip.r\")

mypeakFile <- \"${peakFile}\"

myannoFile <- \"Nb_tip_annotation.txt\"

myoutFile <- \"${weightFile}\"

peak2weight(mypeakFile, myannoFile, width = 10000, smooth = TRUE, myoutFile)

mypeakFile <- \"${peakFile}\"

myannoFile <- \"Nb_tip_annotation.txt\"

myweightFile <- \"${weightFile}\"

myoutFile <- \"${scoreFile}\"

calscoreonpeak(mypeakFile, myannoFile, myweightFile, myoutFile)

done < peak.list

using this method, we can get target genes for each TF

Want to print your doc?
This is not the way.

Try clicking the ⋯ next to your doc name or using a keyboard shortcut (

CtrlP

) instead.