-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcorrect-plinkfiles.sh
executable file
·123 lines (85 loc) · 4.3 KB
/
correct-plinkfiles.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
#!/bin/bash
########################################################################
Help()
{
# Display help
printf "
This script combines MaCH and minimac imputed data for consent groups from the same study.\n\n"
printf "Usage:\n\n"
echo "-d Directories containing the dataset, pass once per directory."
echo "-n dataset output naming prefix, for naming things."
echo "-h Print this help."
echo ""
}
# input arguments
while getopts ":d:n:h" option;
do
case $option in
d)
dataset_dir="$OPTARG"
;;
n)
dataset="$OPTARG"
;;
h)
Help
exit 1
;;
\?)
printf "\nInvalid option passed: -$OPTARG. See usage below.\n"
Help
exit 1
;;
esac
done
# display help on passing no arguments
if [ $OPTIND -eq 1 ];
then
Help
exit 1
fi
# check all args given
for arg in "$dataset_dir" "$dataset"
do
if [ -n "${!arg}" ]
then
printf "Error: Missing argument(s). See usage below."
Help
exit 1
fi
done
# display a helpful message of inputs
printf "\nUsing specified arguments:\n"
printf "\nDataset directories:\n"
printf "\n $dataset_dir \n"
printf "\n\nName prefix: $dataset\n"
########################################################################
# Correct bfiles of plink1 files ##
#gawk 'BEGIN{FS="\t"; OFS="\t"}{print $3}' "${dataset_dir}"/"${dataset}"_temp2*.bim | gawk 'BEGIN{FS=":";OFS="\t"}{print $1,$2}' > "${dataset_dir}"/"${dataset}"_temp2_chrpos.bim
gawk 'BEGIN{FS="\t"; OFS="\t"}{print $2}' "${dataset_dir}"/"${dataset}"_plink1_temp1.bim | gawk 'BEGIN{FS=":";OFS="\t"}{print $1,$2}' > "${dataset_dir}"/"${dataset}"_plink1_temp1_chrpos.bim
paste "${dataset_dir}"/"${dataset}"_plink1_temp1*.bim "${dataset_dir}"/"${dataset}"_plink1_temp1_chrpos.bim > "${dataset_dir}"/"${dataset}"_plink1_temp1_w_chrpos.bim
#paste "${dataset_dir}"/"${dataset}"_temp2.bim "${dataset_dir}"/"${dataset}"_temp2_chrpos.bim > "${dataset_dir}"/"${dataset}"_temp2_w_chrpos.bim
#gawk 'BEGIN{FS="\t";OFS="\t"}{print $7,$2,$3,$8,$5, $6}' "${dataset_dir}"/"${dataset}"_temp2_w_chrpos.pvar > "${dataset_dir}"/"${dataset}"_temp2_updated.pvar
gawk 'BEGIN{FS="\t";OFS="\t"}{print $7,$2,$3,$8,$5, $6}' "${dataset_dir}"/"${dataset}"_plink1_temp1_w_chrpos.bim > "${dataset_dir}"/"${dataset}"_plink1_temp1_updated.bim
echo "Copying other files in the set for plink compatibility..."
cp "${dataset_dir}"/"${dataset}"_plink1_temp1.bed "${dataset_dir}"/"${dataset}"_plink1_temp1_updated.bed
cp "${dataset_dir}"/"${dataset}"_plink1_temp1.fam "${dataset_dir}"/"${dataset}"_plink1_temp1_updated.fam
echo "Done"
## Step 3.2: Correct pfiles ##
# Once the files have been imported into plink format the chromosome and position information must be updated as they are currently null.
# To do this we will use information that is present within the pvar file.
#echo "Generating files for '${dataset}' "
# Print the 3rd column called ID from the pvar file and split it based on ':' then take the first and second element. Take off the header and add a new header and add to new file
#echo "Taking ID and splitting into chromosome and position for later use..."
#gawk 'BEGIN{FS="\t"; OFS="\t"}{print $3}' "${dataset_dir}"/"${dataset}"_temp.pvar | gawk 'BEGIN{FS=":";OFS="\t"}{print $1,$2}' | tail -n+2 | sed '1i #CHROM POS' > "${dataset_dir}"/"${dataset}"_temp_chrpos.pvar
# Paste the original pvar and the new intermediate file into another intermediate file
#echo "Making temporary file..."
#paste "${dataset_dir}"/"${dataset}"_temp.pvar "${dataset_dir}"/"${dataset}"_temp_chrpos.pvar > "${dataset_dir}"/"${dataset}"_temp_w_chrpos.pvar
# Make a new pvar file with the corrected columns
#echo "Make new pvar file with correct chromosome and position information..."
#gawk 'BEGIN{FS="\t";OFS="\t"}{print $6,$7,$3,$4,$5}' "${dataset_dir}"/"${dataset}"_temp_w_chrpos.pvar > "${dataset_dir}"/"${dataset}"_temp_updated.pvar
# Copy the other pfiles with a matching name so that plink2 knows they are together.
#echo "Copying other files in the set for plink compatibility..."
#cp "${dataset_dir}"/"${dataset}"_temp.psam "${dataset_dir}"/"${dataset}"_temp_updated.psam
#cp "${dataset_dir}"/"${dataset}"_temp.pgen "${dataset_dir}"/"${dataset}"_temp_updated.pgen
#echo "Done"