Skip to content

Commit 705235b

Browse files
committed
window offset
1 parent 8938a37 commit 705235b

File tree

3 files changed

+186
-28
lines changed

3 files changed

+186
-28
lines changed

diploshic/diploSHIC

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -262,6 +262,11 @@ parser_d.add_argument(
262262
help="Right boundary of region in which feature vectors are calculated (whole arm if omitted)",
263263
default="None",
264264
)
265+
parser_d.add_argument(
266+
"--windowOffset",
267+
help="Offset for window positioning (shifts the entire windowing grid by this amount, default=0)",
268+
default="0",
269+
)
265270
parser_d.set_defaults(mode="fvecVcf")
266271
parser_d._positionals.title = "required arguments"
267272

@@ -735,6 +740,9 @@ elif argsDict["mode"] == "fvecVcf":
735740
additionalArgs = []
736741
if argsDict["segmentStart"] != "None":
737742
additionalArgs += [argsDict["segmentStart"], argsDict["segmentEnd"]]
743+
if argsDict["windowOffset"] != "0":
744+
additionalArgs += [argsDict["windowOffset"]]
745+
if additionalArgs:
738746
cmd += " " + " ".join(additionalArgs)
739747
# cmd += " > " + argsDict['fvecFileName']
740748
print(cmd)

diploshic/makeFeatureVecsForChrArmFromVcfDiploid.py

Lines changed: 89 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -6,11 +6,74 @@
66
import time
77
from diploshic.fvTools import *
88

9-
if not len(sys.argv) in [13, 15]:
9+
if not len(sys.argv) in [13, 15, 16, 17]:
1010
sys.exit(
11-
"usage:\npython makeFeatureVecsForChrArmFromVcfDiploid.py vcfFileName chrArm chrLen targetPop winSize numSubWins maskFileName unmaskedFracCutoff unmaskedGenoFracCutoff sampleToPopFileName statFileName outFileName [segmentStart segmentEnd]\n"
11+
"usage:\npython makeFeatureVecsForChrArmFromVcfDiploid.py vcfFileName chrArm chrLen targetPop winSize numSubWins maskFileName unmaskedFracCutoff unmaskedGenoFracCutoff sampleToPopFileName statFileName outFileName [segmentStart segmentEnd] [windowOffset]\n"
1212
)
13-
if len(sys.argv) == 15:
13+
14+
# Handle different argument combinations
15+
if len(sys.argv) == 17: # All optional args: segmentStart segmentEnd windowOffset
16+
(
17+
vcfFileName,
18+
chrArm,
19+
chrLen,
20+
targetPop,
21+
winSize,
22+
numSubWins,
23+
maskFileName,
24+
unmaskedFracCutoff,
25+
unmaskedGenoFracCutoff,
26+
sampleToPopFileName,
27+
statFileName,
28+
outfn,
29+
segmentStart,
30+
segmentEnd,
31+
windowOffset,
32+
) = sys.argv[1:]
33+
segmentStart, segmentEnd, windowOffset = int(segmentStart), int(segmentEnd), int(windowOffset)
34+
elif len(sys.argv) == 16: # Could be segmentStart+segmentEnd OR just windowOffset
35+
# Try to parse the three trailing arguments (sys.argv[13], sys.argv[14], sys.argv[15]) as integers; with len(sys.argv) == 16 there are 12 required arguments plus exactly these three
36+
try:
37+
potential_segment_start = int(sys.argv[13])
38+
potential_segment_end = int(sys.argv[14])
39+
potential_window_offset = int(sys.argv[15])
40+
# If all three parse as integers, assume segmentStart, segmentEnd, windowOffset
41+
segmentStart, segmentEnd, windowOffset = potential_segment_start, potential_segment_end, potential_window_offset
42+
# Extract the base arguments
43+
(
44+
vcfFileName,
45+
chrArm,
46+
chrLen,
47+
targetPop,
48+
winSize,
49+
numSubWins,
50+
maskFileName,
51+
unmaskedFracCutoff,
52+
unmaskedGenoFracCutoff,
53+
sampleToPopFileName,
54+
statFileName,
55+
outfn,
56+
) = sys.argv[1:13]
57+
except (ValueError, IndexError):
58+
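# If integer parsing fails, fall back to treating the input as windowOffset only.
# NOTE(review): sys.argv[1:] holds 15 values in this branch, so the 13-name unpack below raises ValueError; a lone windowOffset would give len(sys.argv) == 14, which the usage check above rejects -- confirm the intended argument counts.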
59+
(
60+
vcfFileName,
61+
chrArm,
62+
chrLen,
63+
targetPop,
64+
winSize,
65+
numSubWins,
66+
maskFileName,
67+
unmaskedFracCutoff,
68+
unmaskedGenoFracCutoff,
69+
sampleToPopFileName,
70+
statFileName,
71+
outfn,
72+
windowOffset,
73+
) = sys.argv[1:]
74+
segmentStart = None
75+
windowOffset = int(windowOffset)
76+
elif len(sys.argv) == 15: # segmentStart and segmentEnd only
1477
(
1578
vcfFileName,
1679
chrArm,
@@ -28,7 +91,8 @@
2891
segmentEnd,
2992
) = sys.argv[1:]
3093
segmentStart, segmentEnd = int(segmentStart), int(segmentEnd)
31-
else:
94+
windowOffset = 0
95+
else: # len(sys.argv) == 13, no optional args
3296
(
3397
vcfFileName,
3498
chrArm,
@@ -44,6 +108,7 @@
44108
outfn,
45109
) = sys.argv[1:]
46110
segmentStart = None
111+
windowOffset = 0
47112

48113
unmaskedFracCutoff = float(unmaskedFracCutoff)
49114
if unmaskedFracCutoff < 0.0 or unmaskedFracCutoff > 1.0:
@@ -62,20 +127,25 @@
62127
subWinSize = int(winSize / numSubWins)
63128

64129

65-
def getSubWinBounds(chrLen, subWinSize):
66-
lastSubWinEnd = chrLen - chrLen % subWinSize
130+
def getSubWinBounds(chrLen, subWinSize, windowOffset=0):
131+
# Start windows from windowOffset + 1 instead of 1
132+
firstSubWinStart = windowOffset + 1
133+
lastSubWinEnd = chrLen - ((chrLen - windowOffset) % subWinSize)
67134
lastSubWinStart = lastSubWinEnd - subWinSize + 1
135+
68136
subWinBounds = []
69-
for subWinStart in range(1, lastSubWinStart + 1, subWinSize):
137+
for subWinStart in range(firstSubWinStart, lastSubWinStart + 1, subWinSize):
70138
subWinEnd = subWinStart + subWinSize - 1
71-
subWinBounds.append((subWinStart, subWinEnd))
139+
if subWinEnd <= chrLen: # Don't exceed chromosome length
140+
subWinBounds.append((subWinStart, subWinEnd))
72141
return subWinBounds
73142

74143

75-
def getSnpIndicesInSubWins(subWinSize, lastSubWinEnd, snpLocs):
76-
subWinStart = 1
144+
def getSnpIndicesInSubWins(subWinSize, lastSubWinEnd, snpLocs, windowOffset=0):
145+
subWinStart = windowOffset + 1 # Start from offset
77146
subWinEnd = subWinStart + subWinSize - 1
78147
snpIndicesInSubWins = [[]]
148+
79149
for i in range(len(snpLocs)):
80150
while snpLocs[i] <= lastSubWinEnd and not (
81151
snpLocs[i] >= subWinStart and snpLocs[i] <= subWinEnd
@@ -85,6 +155,8 @@ def getSnpIndicesInSubWins(subWinSize, lastSubWinEnd, snpLocs):
85155
snpIndicesInSubWins.append([])
86156
if snpLocs[i] <= lastSubWinEnd:
87157
snpIndicesInSubWins[-1].append(i)
158+
159+
# Add empty windows for any remaining subwindows
88160
while subWinEnd < lastSubWinEnd:
89161
snpIndicesInSubWins.append([])
90162
subWinStart += subWinSize
@@ -188,7 +260,7 @@ def readSampleToPopFile(sampleToPopFileName):
188260
"diplo_Omega",
189261
]
190262

191-
subWinBounds = getSubWinBounds(chrLen, subWinSize)
263+
subWinBounds = getSubWinBounds(chrLen, subWinSize, windowOffset)
192264

193265
header = "chrom classifiedWinStart classifiedWinEnd bigWinRange".split()
194266
statHeader = "chrom start end".split()
@@ -206,17 +278,20 @@ def readSampleToPopFile(sampleToPopFileName):
206278

207279
startTime = time.perf_counter()
208280
goodSubWins = []
209-
lastSubWinEnd = chrLen - chrLen % subWinSize
281+
lastSubWinEnd = chrLen - ((chrLen - windowOffset) % subWinSize)
210282
snpIndicesInSubWins = getSnpIndicesInSubWins(
211-
subWinSize, lastSubWinEnd, positions
283+
subWinSize, lastSubWinEnd, positions, windowOffset
212284
)
213285
subWinIndex = 0
286+
firstSubWinStart = windowOffset + 1
214287
lastSubWinStart = lastSubWinEnd - subWinSize + 1
215288
if statFileName:
216289
statFile = open(statFileName, "w")
217290
statFile.write(statHeader + "\n")
218-
for subWinStart in range(1, lastSubWinStart + 1, subWinSize):
291+
for subWinStart in range(firstSubWinStart, lastSubWinStart + 1, subWinSize):
219292
subWinEnd = subWinStart + subWinSize - 1
293+
if subWinEnd > chrLen: # Skip windows that exceed chromosome length
294+
break
220295
unmaskedFrac = unmasked[subWinStart - 1 : subWinEnd].count(True) / float(
221296
subWinEnd - subWinStart + 1
222297
)

diploshic/makeFeatureVecsForChrArmFromVcf_ogSHIC.py

Lines changed: 89 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -6,11 +6,74 @@
66
import time
77
from diploshic.fvTools import *
88

9-
if not len(sys.argv) in [13, 15]:
9+
if not len(sys.argv) in [13, 15, 16, 17]:
1010
sys.exit(
11-
"usage:\npython makeFeatureVecsForChrArmFromVcf_ogSHIC.py chrArmFileName chrArm chrLen targetPop winSize numSubWins maskFileName sampleToPopFileName ancestralArmFaFileName statFileName outFileName [segmentStart segmentEnd]\n"
11+
"usage:\npython makeFeatureVecsForChrArmFromVcf_ogSHIC.py chrArmFileName chrArm chrLen targetPop winSize numSubWins maskFileName sampleToPopFileName ancestralArmFaFileName statFileName outFileName [segmentStart segmentEnd] [windowOffset]\n"
1212
)
13-
if len(sys.argv) == 15:
13+
14+
# Handle different argument combinations
15+
if len(sys.argv) == 17: # All optional args: segmentStart segmentEnd windowOffset
16+
(
17+
chrArmFileName,
18+
chrArm,
19+
chrLen,
20+
targetPop,
21+
winSize,
22+
numSubWins,
23+
maskFileName,
24+
unmaskedFracCutoff,
25+
sampleToPopFileName,
26+
ancestralArmFaFileName,
27+
statFileName,
28+
outfn,
29+
segmentStart,
30+
segmentEnd,
31+
windowOffset,
32+
) = sys.argv[1:]
33+
segmentStart, segmentEnd, windowOffset = int(segmentStart), int(segmentEnd), int(windowOffset)
34+
elif len(sys.argv) == 16: # Could be segmentStart+segmentEnd OR just windowOffset
35+
# Try to parse the three trailing arguments (sys.argv[13], sys.argv[14], sys.argv[15]) as integers; with len(sys.argv) == 16 there are 12 required arguments plus exactly these three
36+
try:
37+
potential_segment_start = int(sys.argv[13])
38+
potential_segment_end = int(sys.argv[14])
39+
potential_window_offset = int(sys.argv[15])
40+
# If all three parse as integers, assume segmentStart, segmentEnd, windowOffset
41+
segmentStart, segmentEnd, windowOffset = potential_segment_start, potential_segment_end, potential_window_offset
42+
# Extract the base arguments
43+
(
44+
chrArmFileName,
45+
chrArm,
46+
chrLen,
47+
targetPop,
48+
winSize,
49+
numSubWins,
50+
maskFileName,
51+
unmaskedFracCutoff,
52+
sampleToPopFileName,
53+
ancestralArmFaFileName,
54+
statFileName,
55+
outfn,
56+
) = sys.argv[1:13]
57+
except (ValueError, IndexError):
58+
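# If integer parsing fails, fall back to treating the input as windowOffset only.
# NOTE(review): sys.argv[1:] holds 15 values in this branch, so the 13-name unpack below raises ValueError; a lone windowOffset would give len(sys.argv) == 14, which the usage check above rejects -- confirm the intended argument counts.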
59+
(
60+
chrArmFileName,
61+
chrArm,
62+
chrLen,
63+
targetPop,
64+
winSize,
65+
numSubWins,
66+
maskFileName,
67+
unmaskedFracCutoff,
68+
sampleToPopFileName,
69+
ancestralArmFaFileName,
70+
statFileName,
71+
outfn,
72+
windowOffset,
73+
) = sys.argv[1:]
74+
segmentStart = None
75+
windowOffset = int(windowOffset)
76+
elif len(sys.argv) == 15: # segmentStart and segmentEnd only
1477
(
1578
chrArmFileName,
1679
chrArm,
@@ -28,7 +91,8 @@
2891
segmentEnd,
2992
) = sys.argv[1:]
3093
segmentStart, segmentEnd = int(segmentStart), int(segmentEnd)
31-
else:
94+
windowOffset = 0
95+
else: # len(sys.argv) == 13, no optional args
3296
(
3397
chrArmFileName,
3498
chrArm,
@@ -44,27 +108,33 @@
44108
outfn,
45109
) = sys.argv[1:]
46110
segmentStart = None
111+
windowOffset = 0
47112

48113
unmaskedFracCutoff = float(unmaskedFracCutoff)
49114
chrLen, winSize, numSubWins = int(chrLen), int(winSize), int(numSubWins)
50115
assert winSize % numSubWins == 0 and numSubWins > 1
51116
subWinSize = int(winSize / numSubWins)
52117

53118

54-
def getSubWinBounds(chrLen, subWinSize):
55-
lastSubWinEnd = chrLen - chrLen % subWinSize
119+
def getSubWinBounds(chrLen, subWinSize, windowOffset=0):
120+
# Start windows from windowOffset + 1 instead of 1
121+
firstSubWinStart = windowOffset + 1
122+
lastSubWinEnd = chrLen - ((chrLen - windowOffset) % subWinSize)
56123
lastSubWinStart = lastSubWinEnd - subWinSize + 1
124+
57125
subWinBounds = []
58-
for subWinStart in range(1, lastSubWinStart + 1, subWinSize):
126+
for subWinStart in range(firstSubWinStart, lastSubWinStart + 1, subWinSize):
59127
subWinEnd = subWinStart + subWinSize - 1
60-
subWinBounds.append((subWinStart, subWinEnd))
128+
if subWinEnd <= chrLen: # Don't exceed chromosome length
129+
subWinBounds.append((subWinStart, subWinEnd))
61130
return subWinBounds
62131

63132

64-
def getSnpIndicesInSubWins(subWinSize, lastSubWinEnd, snpLocs):
65-
subWinStart = 1
133+
def getSnpIndicesInSubWins(subWinSize, lastSubWinEnd, snpLocs, windowOffset=0):
134+
subWinStart = windowOffset + 1 # Start from offset
66135
subWinEnd = subWinStart + subWinSize - 1
67136
snpIndicesInSubWins = [[]]
137+
68138
for i in range(len(snpLocs)):
69139
while snpLocs[i] <= lastSubWinEnd and not (
70140
snpLocs[i] >= subWinStart and snpLocs[i] <= subWinEnd
@@ -74,6 +144,8 @@ def getSnpIndicesInSubWins(subWinSize, lastSubWinEnd, snpLocs):
74144
snpIndicesInSubWins.append([])
75145
if snpLocs[i] <= lastSubWinEnd:
76146
snpIndicesInSubWins[-1].append(i)
147+
148+
# Add empty windows for any remaining subwindows
77149
while subWinEnd < lastSubWinEnd:
78150
snpIndicesInSubWins.append([])
79151
subWinStart += subWinSize
@@ -198,7 +270,7 @@ def getSnpIndicesInSubWins(subWinSize, lastSubWinEnd, snpLocs):
198270
alleleCounts = alleleCounts.map_alleles(mapping)
199271
haps = genos.to_haplotypes()
200272

201-
subWinBounds = getSubWinBounds(chrLen, subWinSize)
273+
subWinBounds = getSubWinBounds(chrLen, subWinSize, windowOffset)
202274
precomputedStats = {} # not using this
203275

204276
header = "chrom classifiedWinStart classifiedWinEnd bigWinRange".split()
@@ -217,17 +289,20 @@ def getSnpIndicesInSubWins(subWinSize, lastSubWinEnd, snpLocs):
217289

218290
startTime = time.perf_counter()
219291
goodSubWins = []
220-
lastSubWinEnd = chrLen - chrLen % subWinSize
292+
lastSubWinEnd = chrLen - ((chrLen - windowOffset) % subWinSize)
221293
snpIndicesInSubWins = getSnpIndicesInSubWins(
222-
subWinSize, lastSubWinEnd, positions
294+
subWinSize, lastSubWinEnd, positions, windowOffset
223295
)
224296
subWinIndex = 0
297+
firstSubWinStart = windowOffset + 1
225298
lastSubWinStart = lastSubWinEnd - subWinSize + 1
226299
if statFileName:
227300
statFile = open(statFileName, "w")
228301
statFile.write(statHeader + "\n")
229-
for subWinStart in range(1, lastSubWinStart + 1, subWinSize):
302+
for subWinStart in range(firstSubWinStart, lastSubWinStart + 1, subWinSize):
230303
subWinEnd = subWinStart + subWinSize - 1
304+
if subWinEnd > chrLen: # Skip windows that exceed chromosome length
305+
break
231306
unmaskedFrac = unmasked[subWinStart - 1 : subWinEnd].count(True) / float(
232307
subWinEnd - subWinStart + 1
233308
)

0 commit comments

Comments
 (0)