Skip to content

Commit 2a60d79

Browse files
committed
updated tss-prox intersection
1 parent 3a4ece6 commit 2a60d79

File tree

2 files changed

+80
-26
lines changed

2 files changed

+80
-26
lines changed
Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
#!/bin/bash
2+
3+
data=$1
4+
biosample=$(echo $data | awk -F "." '{print $1}')
5+
6+
setDir=~/Lab/Target-Gene/Benchmark
7+
scriptDir=~/Projects/Target-Gene-Prediction/Scripts/Benchmark-Characteristics
8+
train=$setDir/$data-Benchmark.v3.txt
9+
output=~/Lab/Target-Gene/Benchmark/Characteristics/TF-Signal
10+
dataDir=/data/projects/encode/data/
11+
tss=~/Lab/Reference/Human/hg19/Gencode19/TSS.2019.bed
12+
ccres=~/Lab/ENCODE/Encyclopedia/V4/Registry/V4-hg19/hg19-ccREs-Simple.bed
13+
tfList=~/Lab/Target-Gene/Benchmark/Characteristics/CTCF-List.txt
14+
15+
cat $train | awk '{print $1}' | sort -u > ccres
16+
awk 'FNR==NR {x[$1];next} ($5 in x)' ccres $ccres > tmp1
17+
18+
#cat $train | awk '{print $2}' | sort -u > genes
19+
#awk 'FNR==NR {x[$1];next} ($7 in x)' genes $tss | \
20+
# awk '{print $1 "\t" $2-250 "\t" $3+250 "\t" $4}' > tss
21+
22+
width=0
23+
24+
dset=$(grep $biosample $tfList | awk '{print $1}')
25+
dsig=$(grep $biosample $tfList | awk '{print $2}')
26+
27+
awk -F "\t" '{printf "%s\t%.0f\t%.0f\t%s\n", $1,$2-'$width',$3+'$width',$4}' \
28+
tmp1 | awk '{if ($2 < 0) print $1 "\t" 0 "\t" $3 "\t" $4 ; else print $0}' \
29+
| sort -u > little
30+
31+
~/bin/bigWigAverageOverBed -bedOut=out2.bed $dataDir/$dset/$dsig.bigWig little out2
32+
33+
awk '{print $1 "\t" $5}' out2 | sort -k1,1 > $output/$data"-ELS."$dset"-"$dsig".txt"
34+
35+
rm ccres tmp1 little out2.bed out2

Scripts/Generate-Benchmark/Generate-Benchmark.sh

Lines changed: 45 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -4,68 +4,63 @@ biosample=$1
44
linkType=$2
55
links=$3
66
name=$4
7+
blackList=$5
78

89
genome=hg19
910
version=V4
1011
scriptDir=~/Projects/Target-Gene-Prediction/Scripts/Generate-Benchmark
1112
masterDir=~/Lab/ENCODE/Encyclopedia/$version/Registry/$version-$genome/
1213
masterList=$masterDir/Cell-Type-Specific/Master-Cell-List.txt
1314

14-
if [[ $genome == "mm10" ]]
15-
then
16-
tss=~/Lab/Reference/Mouse/GencodeM4/TSS.Filtered.4K.bed
17-
elif [[ $genome == "hg38" ]]
18-
then
19-
tss=~/Lab/Reference/Human/$genome/GENCODE24/TSS.Filtered.4K.bed
20-
elif [[ $genome == "hg19" ]]
21-
then
22-
tss=~/Lab/Reference/Human/$genome/Gencode19/TSS.Filtered.4K.bed
23-
fi
15+
prox=~/Lab/Reference/Human/$genome/Gencode19/TSS.2019.4K.bed
16+
tss=~/Lab/Reference/Human/$genome/Gencode19/TSS.2019.bed
2417

2518
file=$(awk '{if ($9 == "'$biosample'") print $2}' $masterList)
2619

2720
bedtools intersect -v -a $masterDir/Cell-Type-Specific/Five-Group/$file*.bed \
28-
-b $tss | grep "Enhancer" > enhancers.bed
21+
-b $prox | grep "Enhancer" > enhancers.bed
2922

3023
if [ $linkType == "ChIA-PET" ]
3124
then
3225
awk '{if ($NF >= 4) print $0}' $links > links
33-
python $scriptDir/process.chiapet.py links enhancers.bed $tss \
34-
$name-Blacklist.txt > $name-Links.txt
26+
python $scriptDir/process.chiapet.py links enhancers.bed $prox \
27+
$name-Blacklist.txt $blackList > $name-Links.txt
3528

3629
elif [ $linkType == "Hi-C" ]
3730
then
3831
awk '{if (NR != 1) print "chr"$1 "\t" $2 "\t" $3 "\t" "chr"$4 \
3932
"\t" $5 "\t" $6}' $links > links
40-
python $scriptDir/process.chiapet.py links enhancers.bed $tss \
41-
$name-Blacklist.txt > $name-Links.txt
33+
python $scriptDir/process.chiapet.py links enhancers.bed $prox \
34+
$name-Blacklist.txt $blackList > $name-Links.txt
4235
rm links
4336

4437
elif [ $linkType == "CHi-C" ]
4538
then
4639
awk '{if (NR != 1 && $NF > 10) print $1 "\t" $2 "\t" $3 "\t" $7 \
4740
"\t" $8 "\t" $9}' $links > links
48-
python $scriptDir/process.chiapet.py links enhancers.bed $tss \
49-
$name-Blacklist.txt > $name-Links.txt
41+
python $scriptDir/process.chiapet.py links enhancers.bed $prox \
42+
$name-Blacklist.txt $blackList > $name-Links.txt
5043
rm links
5144

5245
elif [ $linkType == "eQTL" ]
5346
then
54-
python $scriptDir/process.eqtl.py $links enhancers.bed $tss \
47+
python $scriptDir/process.eqtl.py $links enhancers.bed $prox \
5548
> $name-Links.txt
5649

5750
elif [ $linkType == "CRISPR" ]
5851
then
59-
python $scriptDir/process.crispr.py $links enhancers.bed $tss \
52+
python $scriptDir/process.crispr.py $links enhancers.bed $prox \
6053
> $name-Links.txt
6154
tssK562=~/Lab/Target-Gene/Benchmark/Raw/CRISPR/Shendure/K562-V19-TSS.bed
6255
fi
6356

6457
if [ $linkType == "CRISPR" ]
6558
then
66-
cutoff=1000000
67-
python $scriptDir/calculate.distance.py $tss enhancers.bed \
68-
$name-Links.txt $name-Distance.txt
59+
#cutoff=1000000
60+
#python $scriptDir/calculate.distance.py $tss enhancers.bed \
61+
# $name-Links.txt $name-Distance.txt
62+
cutoff=$(python $scriptDir/calculate.distance.py $tss enhancers.bed \
63+
$name-Links.txt $name-Distance.txt)
6964
else
7065
cutoff=$(python $scriptDir/calculate.distance.py $tss enhancers.bed \
7166
$name-Links.txt $name-Distance.txt)
@@ -76,19 +71,43 @@ awk '{if ($3 <= '$cutoff') print $0}' $name-Distance.txt > \
7671

7772
if [ $linkType == "CRISPR" ]
7873
then
74+
#python $scriptDir/create.experiment.sets.py $name-Distance.cutoff.txt \
75+
#$tssK562 enhancers.bed output $cutoff $name $linkType $blackList
7976
python $scriptDir/create.experiment.sets.py $name-Distance.cutoff.txt \
80-
$tssK562 enhancers.bed output $cutoff $name $linkType
77+
$tss enhancers.bed output $cutoff $name $linkType $blackList
8178
else
8279
python $scriptDir/create.experiment.sets.py $name-Distance.cutoff.txt \
83-
$tss enhancers.bed output $cutoff $name $linkType
80+
$tss enhancers.bed output $cutoff $name $linkType $blackList
8481
fi
8582

8683
awk '{print $1}' positive | sort -u > ccre-list.txt
8784
awk 'FNR==NR {x[$1];next} ($4 in x)' ccre-list.txt enhancers.bed > tmp.bed
8885
awk '{print $1 "\t" $2 "\t" 1}' positive > total
8986
awk '{print $1 "\t" $2 "\t" 0}' negative >> total
9087

91-
python $scriptDir/assign.groups.py tmp.bed total > $name-Benchmark.v1.txt
88+
p=$(wc -l positive | awk '{print $1}')
89+
n=$(wc -l negative | awk '{print $1}')
90+
echo -e "\t" $p "\t" $n "\t" $cutoff
91+
92+
if [ $blackList == "yes" ]
93+
then
94+
v1="v1"
95+
v2="v2"
96+
# cp
97+
else
98+
v1="v3"
99+
v2="v4"
100+
fi
101+
102+
cp $name-Distance.txt $name-Distance.$v1.txt
103+
104+
python $scriptDir/assign.groups.py tmp.bed total > $name-Benchmark.$v1.txt
105+
106+
python $scriptDir/select.ratio.pairs.py $name-Benchmark.$v1.txt > new-total
107+
awk '{if ($3 == 1) p+=1; else n+=1}END{print "\t" p "\t" n "\t" "'$cutoff'"}' new-total
108+
109+
cat skip.txt
110+
python $scriptDir/assign.groups.py tmp.bed new-total > $name-Benchmark.$v2.txt
92111

93-
rm -f bed1 bed2 out1 out2 positive negative range output
112+
rm -f bed1 bed2 out1 out2 positive negative range output new-total skip.txt
94113
rm -f ccre-list.txt enhancers.bed intersection2.bed total tmp.bed

0 commit comments

Comments
 (0)