66import time
77from diploshic .fvTools import *
88
9- if not len (sys .argv ) in [13 , 15 ]:
9+ if not len (sys .argv ) in [13 , 15 , 16 , 17 ]:
1010 sys .exit (
11- "usage:\n python makeFeatureVecsForChrArmFromVcf_ogSHIC.py chrArmFileName chrArm chrLen targetPop winSize numSubWins maskFileName sampleToPopFileName ancestralArmFaFileName statFileName outFileName [segmentStart segmentEnd]\n "
11+ "usage:\n python makeFeatureVecsForChrArmFromVcf_ogSHIC.py chrArmFileName chrArm chrLen targetPop winSize numSubWins maskFileName sampleToPopFileName ancestralArmFaFileName statFileName outFileName [segmentStart segmentEnd] [windowOffset] \n "
1212 )
13- if len (sys .argv ) == 15 :
13+
14+ # Handle different argument combinations
15+ if len (sys .argv ) == 17 : # All optional args: segmentStart segmentEnd windowOffset
16+ (
17+ chrArmFileName ,
18+ chrArm ,
19+ chrLen ,
20+ targetPop ,
21+ winSize ,
22+ numSubWins ,
23+ maskFileName ,
24+ unmaskedFracCutoff ,
25+ sampleToPopFileName ,
26+ ancestralArmFaFileName ,
27+ statFileName ,
28+ outfn ,
29+ segmentStart ,
30+ segmentEnd ,
31+ windowOffset ,
32+ ) = sys .argv [1 :]
33+ segmentStart , segmentEnd , windowOffset = int (segmentStart ), int (segmentEnd ), int (windowOffset )
34+ elif len (sys .argv ) == 16 : # Could be segmentStart+segmentEnd OR just windowOffset
35+ # Check if we have segmentStart and segmentEnd by seeing if the 13th arg looks like a reasonable coordinate
36+ try :
37+ potential_segment_start = int (sys .argv [13 ])
38+ potential_segment_end = int (sys .argv [14 ])
39+ potential_window_offset = int (sys .argv [15 ])
40+ # If all three parse as integers, assume segmentStart, segmentEnd, windowOffset
41+ segmentStart , segmentEnd , windowOffset = potential_segment_start , potential_segment_end , potential_window_offset
42+ # Extract the base arguments
43+ (
44+ chrArmFileName ,
45+ chrArm ,
46+ chrLen ,
47+ targetPop ,
48+ winSize ,
49+ numSubWins ,
50+ maskFileName ,
51+ unmaskedFracCutoff ,
52+ sampleToPopFileName ,
53+ ancestralArmFaFileName ,
54+ statFileName ,
55+ outfn ,
56+ ) = sys .argv [1 :13 ]
57+ except (ValueError , IndexError ):
58+ # If parsing fails, treat as just windowOffset
59+ (
60+ chrArmFileName ,
61+ chrArm ,
62+ chrLen ,
63+ targetPop ,
64+ winSize ,
65+ numSubWins ,
66+ maskFileName ,
67+ unmaskedFracCutoff ,
68+ sampleToPopFileName ,
69+ ancestralArmFaFileName ,
70+ statFileName ,
71+ outfn ,
72+ windowOffset ,
73+ ) = sys .argv [1 :]
74+ segmentStart = None
75+ windowOffset = int (windowOffset )
76+ elif len (sys .argv ) == 15 : # segmentStart and segmentEnd only
1477 (
1578 chrArmFileName ,
1679 chrArm ,
2891 segmentEnd ,
2992 ) = sys .argv [1 :]
3093 segmentStart , segmentEnd = int (segmentStart ), int (segmentEnd )
31- else :
94+ windowOffset = 0
95+ else : # len(sys.argv) == 13, no optional args
3296 (
3397 chrArmFileName ,
3498 chrArm ,
44108 outfn ,
45109 ) = sys .argv [1 :]
46110 segmentStart = None
111+ windowOffset = 0
47112
48113unmaskedFracCutoff = float (unmaskedFracCutoff )
49114chrLen , winSize , numSubWins = int (chrLen ), int (winSize ), int (numSubWins )
50115assert winSize % numSubWins == 0 and numSubWins > 1
51116subWinSize = int (winSize / numSubWins )
52117
53118
54- def getSubWinBounds (chrLen , subWinSize ):
55- lastSubWinEnd = chrLen - chrLen % subWinSize
119+ def getSubWinBounds (chrLen , subWinSize , windowOffset = 0 ):
120+ # Start windows from windowOffset + 1 instead of 1
121+ firstSubWinStart = windowOffset + 1
122+ lastSubWinEnd = chrLen - ((chrLen - windowOffset ) % subWinSize )
56123 lastSubWinStart = lastSubWinEnd - subWinSize + 1
124+
57125 subWinBounds = []
58- for subWinStart in range (1 , lastSubWinStart + 1 , subWinSize ):
126+ for subWinStart in range (firstSubWinStart , lastSubWinStart + 1 , subWinSize ):
59127 subWinEnd = subWinStart + subWinSize - 1
60- subWinBounds .append ((subWinStart , subWinEnd ))
128+ if subWinEnd <= chrLen : # Don't exceed chromosome length
129+ subWinBounds .append ((subWinStart , subWinEnd ))
61130 return subWinBounds
62131
63132
64- def getSnpIndicesInSubWins (subWinSize , lastSubWinEnd , snpLocs ):
65- subWinStart = 1
133+ def getSnpIndicesInSubWins (subWinSize , lastSubWinEnd , snpLocs , windowOffset = 0 ):
134+ subWinStart = windowOffset + 1 # Start from offset
66135 subWinEnd = subWinStart + subWinSize - 1
67136 snpIndicesInSubWins = [[]]
137+
68138 for i in range (len (snpLocs )):
69139 while snpLocs [i ] <= lastSubWinEnd and not (
70140 snpLocs [i ] >= subWinStart and snpLocs [i ] <= subWinEnd
@@ -74,6 +144,8 @@ def getSnpIndicesInSubWins(subWinSize, lastSubWinEnd, snpLocs):
74144 snpIndicesInSubWins .append ([])
75145 if snpLocs [i ] <= lastSubWinEnd :
76146 snpIndicesInSubWins [- 1 ].append (i )
147+
148+ # Add empty windows for any remaining subwindows
77149 while subWinEnd < lastSubWinEnd :
78150 snpIndicesInSubWins .append ([])
79151 subWinStart += subWinSize
@@ -198,7 +270,7 @@ def getSnpIndicesInSubWins(subWinSize, lastSubWinEnd, snpLocs):
198270 alleleCounts = alleleCounts .map_alleles (mapping )
199271haps = genos .to_haplotypes ()
200272
201- subWinBounds = getSubWinBounds (chrLen , subWinSize )
273+ subWinBounds = getSubWinBounds (chrLen , subWinSize , windowOffset )
202274precomputedStats = {} # not using this
203275
204276header = "chrom classifiedWinStart classifiedWinEnd bigWinRange" .split ()
@@ -217,17 +289,20 @@ def getSnpIndicesInSubWins(subWinSize, lastSubWinEnd, snpLocs):
217289
218290startTime = time .perf_counter ()
219291goodSubWins = []
220- lastSubWinEnd = chrLen - chrLen % subWinSize
292+ lastSubWinEnd = chrLen - (( chrLen - windowOffset ) % subWinSize )
221293snpIndicesInSubWins = getSnpIndicesInSubWins (
222- subWinSize , lastSubWinEnd , positions
294+ subWinSize , lastSubWinEnd , positions , windowOffset
223295)
224296subWinIndex = 0
297+ firstSubWinStart = windowOffset + 1
225298lastSubWinStart = lastSubWinEnd - subWinSize + 1
226299if statFileName :
227300 statFile = open (statFileName , "w" )
228301 statFile .write (statHeader + "\n " )
229- for subWinStart in range (1 , lastSubWinStart + 1 , subWinSize ):
302+ for subWinStart in range (firstSubWinStart , lastSubWinStart + 1 , subWinSize ):
230303 subWinEnd = subWinStart + subWinSize - 1
304+ if subWinEnd > chrLen : # Skip windows that exceed chromosome length
305+ break
231306 unmaskedFrac = unmasked [subWinStart - 1 : subWinEnd ].count (True ) / float (
232307 subWinEnd - subWinStart + 1
233308 )
0 commit comments