-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsingle-dataset-correct-plinkfiles.sh
154 lines (112 loc) · 5.37 KB
/
single-dataset-correct-plinkfiles.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
#!/bin/bash
########################################################################
Help()
{
# Display help
printf "
This script combines MaCH and minimac imputed data for consent groups from the same study.\n\n"
printf "Usage:\n\n"
echo "-d Directories containing the dataset, pass once per directory."
echo "-n dataset output naming prefix, for naming things."
echo "-h Print this help."
echo ""
}
# input arguments
while getopts ":d:n:h" option;
do
case $option in
d)
dataset_dir="$OPTARG"
;;
n)
dataset="$OPTARG"
;;
h)
Help
exit 1
;;
\?)
printf "\nInvalid option passed: -$OPTARG. See usage below.\n"
Help
exit 1
;;
esac
done
# display help on passing no arguments
if [ $OPTIND -eq 1 ];
then
Help
exit 1
fi
# check all args given
for arg in "$dataset_dir" "$dataset"
do
if [ -n "${!arg}" ]
then
printf "Error: Missing argument(s). See usage below."
Help
exit 1
fi
done
# display a helpful message of inputs
printf "\nUsing specified arguments:\n"
printf "\nDataset directories:\n"
printf "\n $dataset_dir \n"
printf "\n\nName prefix: $dataset\n"
########################################################################
# Collect low quality SNPs
rm -rf "${out_dir}"/"${output_name}"_tempsnps.txt
for i in "${datasets[@]}"; do
echo "Copying low quality snps for "${i}" to "${out_dir}" "
cat "${i}"/*_all_chr_lq_all_snps.txt >> "${out_dir}"/"${output_name}"_tempsnps.txt
done
echo "Finished combining low quality SNPs"
# Keep only unique entries
sort "${out_dir}"/"${output_name}"_tempsnps.txt | uniq -u > "${out_dir}"/"${output_name}"_lq_all_snps.txt
for i in "${datasets[@]}"; do
echo "importing "${i}" into plink2 format"
#take name of folder as the name for each individual dataset
dtst=$(basename "$i")
plink2 --import-dosage "${i}"/*_allchr.pdat \
--psam "${i}"/"${dtst}"_allchr.pfam \
--map "${i}"/"${dtst}".map \
--extract "${out_dir}"/"${output_name}"_sharedsnps.tsv \
--exclude "${i}"/*_lq_all_snps.txt \
--pheno "${i}"/"${dtst}"_pheno.tsv \
--covar "${i}"/"${dtst}"_covar.tsv \
--make-pgen \
--out "${i}"/"${dtst}"_plink2_temp1
echo "converting "${i}" into plink1 format"
plink2 --pfile "${i}"/"${dtst}"_plink2_temp1 \
--make-bed \
--out "${i}"/"${dtst}"_plink1_temp1
done
# Correct bfiles of plink1 files ##
#gawk 'BEGIN{FS="\t"; OFS="\t"}{print $3}' "${dataset_dir}"/"${dataset}"_temp2*.bim | gawk 'BEGIN{FS=":";OFS="\t"}{print $1,$2}' > "${dataset_dir}"/"${dataset}"_temp2_chrpos.bim
gawk 'BEGIN{FS="\t"; OFS="\t"}{print $2}' "${dataset_dir}"/"${dataset}"_plink1_temp1.bim | gawk 'BEGIN{FS=":";OFS="\t"}{print $1,$2}' > "${dataset_dir}"/"${dataset}"_plink1_temp1_chrpos.bim
paste "${dataset_dir}"/"${dataset}"_plink1_temp1*.bim "${dataset_dir}"/"${dataset}"_plink1_temp1_chrpos.bim > "${dataset_dir}"/"${dataset}"_plink1_temp1_w_chrpos.bim
#paste "${dataset_dir}"/"${dataset}"_temp2.bim "${dataset_dir}"/"${dataset}"_temp2_chrpos.bim > "${dataset_dir}"/"${dataset}"_temp2_w_chrpos.bim
#gawk 'BEGIN{FS="\t";OFS="\t"}{print $7,$2,$3,$8,$5, $6}' "${dataset_dir}"/"${dataset}"_temp2_w_chrpos.pvar > "${dataset_dir}"/"${dataset}"_temp2_updated.pvar
gawk 'BEGIN{FS="\t";OFS="\t"}{print $7,$2,$3,$8,$5, $6}' "${dataset_dir}"/"${dataset}"_plink1_temp1_w_chrpos.bim > "${dataset_dir}"/"${dataset}"_plink1_temp1_updated.bim
echo "Copying other files in the set for plink compatibility..."
cp "${dataset_dir}"/"${dataset}"_plink1_temp1.bed "${dataset_dir}"/"${dataset}"_plink1_temp1_updated.bed
cp "${dataset_dir}"/"${dataset}"_plink1_temp1.fam "${dataset_dir}"/"${dataset}"_plink1_temp1_updated.fam
echo "Done"
## Step 3.2: Correct pfiles ##
# Once the files have been imported into plink format the chromosome and position information must be updated as they are currently null.
# To do this we will use information that is present within the pvar file.
#echo "Generating files for '${dataset}' "
# Print the 3rd column called ID from the pvar file and split it based on ':' then take the first and second element. Take off the header and add a new header and add to new file
#echo "Taking ID and splitting into chromosome and position for later use..."
#gawk 'BEGIN{FS="\t"; OFS="\t"}{print $3}' "${dataset_dir}"/"${dataset}"_temp.pvar | gawk 'BEGIN{FS=":";OFS="\t"}{print $1,$2}' | tail -n+2 | sed '1i #CHROM POS' > "${dataset_dir}"/"${dataset}"_temp_chrpos.pvar
# Paste the original pvar and the new intermediate file into another intermediate file
#echo "Making temporary file..."
#paste "${dataset_dir}"/"${dataset}"_temp.pvar "${dataset_dir}"/"${dataset}"_temp_chrpos.pvar > "${dataset_dir}"/"${dataset}"_temp_w_chrpos.pvar
# Make a new pvar file with the corrected columns
#echo "Make new pvar file with correct chromosome and position information..."
#gawk 'BEGIN{FS="\t";OFS="\t"}{print $6,$7,$3,$4,$5}' "${dataset_dir}"/"${dataset}"_temp_w_chrpos.pvar > "${dataset_dir}"/"${dataset}"_temp_updated.pvar
# Copy the other pfiles with a matching name so that plink2 knows they are together.
#echo "Copying other files in the set for plink compatibility..."
#cp "${dataset_dir}"/"${dataset}"_temp.psam "${dataset_dir}"/"${dataset}"_temp_updated.psam
#cp "${dataset_dir}"/"${dataset}"_temp.pgen "${dataset_dir}"/"${dataset}"_temp_updated.pgen
#echo "Done"