Skip to content

Commit c933a09

Browse files
committed
Clean up contig group rules and functions
1 parent 758b8ca commit c933a09

File tree

2 files changed

+269
-250
lines changed

2 files changed

+269
-250
lines changed
Lines changed: 265 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,265 @@
1+
import json
2+
import math
3+
4+
# =================================================================================================
5+
# Grouping Functions
6+
# =================================================================================================
7+
8+
9+
# Simple greedy solver for the bin packing problem, here used to make bins of roughly equal
10+
# size for the contigs. We expect the input values to be tuples or pairs, where the index [1]
11+
# element is the weight that we use for packing (index[0] here is used for the contig name itself).
12+
# Simple greedy solver for the bin packing problem, here used to make bins of roughly equal
# size for the contigs. We expect the input values to be tuples or pairs, where the index [1]
# element is the weight that we use for packing (index [0] here is used for the contig name itself).
def solve_bin_packing(values, max_bin_size):
    """Greedily pack (name, weight) pairs into bins of at most max_bin_size total weight.

    Uses the first-fit decreasing heuristic: sort by weight, largest first, then place
    each item into the first bin with enough remaining capacity, opening a new bin when
    none fits. Items heavier than max_bin_size each end up in their own (overflowing) bin.

    Returns a list of bins, where each bin is a list of the (name, weight) tuples
    assigned to it. The input list is not modified.
    """
    # First, sort by length, large ones first, decreasing.
    # This helps to get closer to an optimal solution.
    # We sort a copy rather than in place, so the caller's list is not mutated
    # as a side effect.
    values = sorted(values, key=lambda x: x[1], reverse=True)

    # Fill the bins as needed, using first-fit on the sorted list, and keeping track of
    # how much we already put in each of them. We can have at most as many bins as elements,
    # so use this to initialize the sums, to keep it simple.
    bins = []
    sums = [0] * len(values)
    for cont in values:
        # Find the first bin where the contig fits in.
        j = 0
        while j < len(bins):
            if sums[j] + cont[1] <= max_bin_size:
                bins[j].append(cont)
                sums[j] += cont[1]
                break
            j += 1

        # If no bin could fit the contig, make a new bin.
        # This in particular catches contigs that are larger than the bin size, for example,
        # if some chromosomes are fully assembled and are hence not in scaffolds of small size.
        if j == len(bins):
            bins.append([])
            bins[j].append(cont)
            sums[j] += cont[1]

    # Return the bins with all contigs and their sizes.
    print("Success with", len(bins), "groups")
    return bins
48+
49+
50+
# Alternative heuristic to assign contigs to groups, minimizing the number of groups created,
51+
# while also respecting the max contig group size and the max contigs per group, and also
52+
# trying to optimize the load balancing by distributing short and long contigs over groups.
53+
# The result is a list of lists of tuples. The outer list are the groups. Each group then
54+
# contains the list of tuples (contig name and length) of the group.
55+
# Alternative heuristic to assign contigs to groups, minimizing the number of groups created,
# while also respecting the max contig group size and the max contigs per group, and also
# trying to optimize the load balancing by distributing short and long contigs over groups.
# The result is a list of lists of tuples. The outer list are the groups. Each group then
# contains the list of tuples (contig name and length) of the group.
def optimize_contig_group(contigs, max_contig_group_size, max_contigs_per_group):
    """Assign (name, length) contig tuples to groups via zig-zag round-robin.

    Contigs at least as long as max_contig_group_size each get their own group.
    The remaining small contigs are spread over the minimum number of groups such
    that no group exceeds max_contig_group_size total length or max_contigs_per_group
    contigs, alternating fill direction per round to balance long and short contigs.

    Returns the list of groups (each a list of tuples). The input list is not modified.
    """
    # First, sort by length, large ones first, decreasing.
    # We sort a copy rather than in place, so the caller's list is not mutated
    # as a side effect.
    contigs = sorted(contigs, key=lambda x: x[1], reverse=True)

    # First, fill the final group list with all contigs that are larger than the group size.
    # We do not split existing chromosomes/contigs, so we have to live with them being too large.
    # Also, collect all smaller contigs (smaller than the max group size), for further processing.
    large_groups = []
    small_contigs = []
    for contig in contigs:
        if contig[1] >= max_contig_group_size:
            large_groups.append([])
            large_groups[-1].append(contig)
        else:
            small_contigs.append(contig)
    print("Found ", len(large_groups), "large contigs and", len(small_contigs), "small contigs")

    # No small contigs: We are done
    if len(small_contigs) == 0:
        return large_groups

    # Now we have a list of small contigs for which to create new groups.
    # We do that in a loop until we succeed to fulfill all boundary conditions.
    # In particular, we want to limit the number of contigs per group, and not exceed
    # the sum of contig lengths per group. In each iteration, we increase the number
    # of groups created by one, hence guaranteeing success at some point.
    # There might be a better solution to this, but this is simple and works.
    # We start with as many groups as we minimally need such that we can fulfill the max
    # contigs per group constraint.
    num_groups = max(1, int(math.ceil(len(small_contigs) / max_contigs_per_group)))
    need_iteration = True
    while need_iteration:
        print("Evaluating with", num_groups, " small contig groups")
        need_iteration = False

        # We fill a temporary list for the small contigs, starting with as many empty lists
        # as we have groups, and then fill each group's list with its contigs.
        # We do this by alternating to fill forwards and backwards - that is the heuristic
        # that is meant to avoid having groups with only small or only large contigs.
        # By going back and forth instead of filling from the start, we get a more even spread.
        # Let's call this the zig-zag round-robin assignment :-)
        # For instance, we should get: [[1, 6, 7], [2, 5, 8], [3, 4, 9]]
        small_groups = [[] for _ in range(num_groups)]
        round_num = 0
        index = 0

        # Continue until all contigs have been distributed.
        while index < len(small_contigs):
            # Determine the order based on whether the round is even (forward) or odd (backward)
            if round_num % 2 == 0:
                order = range(num_groups)
            else:
                order = range(num_groups - 1, -1, -1)

            # Distribute one element per group in the specified order.
            for i in order:
                if index >= len(small_contigs):
                    break
                small_groups[i].append(small_contigs[index])
                index += 1
            round_num += 1

        # Now we test if this was successful: Is each group small enough, both in terms
        # of total length, and in terms of number of contigs per group?
        small_contig_count = 0
        for group in small_groups:
            total_len = sum(contig[1] for contig in group)
            if (total_len > max_contig_group_size) or (len(group) > max_contigs_per_group):
                print(
                    "Not enough groups. Got group length",
                    total_len,
                    ">",
                    max_contig_group_size,
                    "and got contigs per group",
                    len(group),
                    ">",
                    max_contigs_per_group,
                )
                need_iteration = True
                num_groups += 1
                break
            small_contig_count += len(group)

    # Now we are done. Make sure that we have processed the right number of contigs.
    # Then, add all small contigs as groups to our final result, and return it.
    assert small_contig_count == len(small_contigs)
    print(
        "Success with",
        num_groups,
        "small contig groups, and",
        (len(large_groups) + len(small_groups)),
        "total groups",
    )
    return large_groups + small_groups
149+
150+
151+
# =================================================================================================
152+
# Checkpoints
153+
# =================================================================================================
154+
155+
156+
# Checkpoint that partitions all contigs of the reference genome into groups, so that
# downstream calling rules can run per group instead of per contig. The grouping itself
# is written to a JSON file, mapping group names to lists of (contig, length) tuples.
checkpoint contig_groups:
    input:
        fai=get_fai,
    output:
        "calling/contig-groups/contigs.json",
    log:
        "logs/calling/contig-groups/contigs.log",
    params:
        # All taken from the user config; 0 acts as "unset" for each of them.
        min_contig_size=config["settings"].get("min-contig-size", 0),
        contig_group_size=config["settings"].get("contig-group-size", 0),
        max_contigs_per_group=config["settings"].get("max-contigs-per-group", 0),
    run:
        # Open the log file in write (or append) mode.
        log_file = open(log[0], "w")
        # Redirect both stdout and stderr to the log file. Snakemake does not do that for us...
        # Also, do not add another line of comment above. It took me half an hour to figure out
        # why snakefmt broke... https://github.com/snakemake/snakefmt/issues/196
        # Why does the snakemake workflow catalogue require proper snakefmt formatting,
        # if that tool is clearly not ready for production usage?
        sys.stdout = log_file
        sys.stderr = log_file

        # Solve the bin packing for the contigs, to get a close to optimal solution
        # for putting them in groups. Large contigs (e.g., whole chromosomes) that are larger
        # than the bin size will simply get their own (overflowing...) bin.
        # If max-contigs-per-group is unset (0), use the plain bin packing solver;
        # otherwise use the heuristic that also respects that per-group contig limit.
        contig_list = read_contigs_from_fai(input.fai, params.min_contig_size)
        if params.max_contigs_per_group == 0:
            print("Running bin packing solver")
            contig_groups = solve_bin_packing(contig_list, params.contig_group_size)
        else:
            print("Running heuristic optimizer")
            contig_groups = optimize_contig_group(
                contig_list, params.contig_group_size, params.max_contigs_per_group
            )

        # Now turn the contig bins into groups for the result of this function.
        # We store our resulting list of contigs containing all contigs,
        # in tuples with their sizes. The contigs is a dict from group name to a list
        # (over contigs in the group) of tuples. Group names are assigned sequentially.
        contigs = {}
        for group in contig_groups:
            groupname = "contig-group-" + str(len(contigs))
            contigs[groupname] = group

        # We need to store the result in a file, so that the rule that creates the per-contig
        # files can access it.
        json.dump(contigs, open(output[0], "w"))
203+
204+
205+
# Rule is not submitted as a job to the cluster; it is cheap enough to run locally.
localrules:
    contig_groups,
208+
209+
210+
# Due to a new bug in Snakemake after our update to 8.15.2, we now need the following
# function to be called as input whenever the above checkpoint is needed.
# See https://github.com/snakemake/snakemake/issues/2958 for details.
def contigs_groups_input(wildcards):
    """Input function forcing evaluation of the contig_groups checkpoint.

    Returns the path of the checkpoint's JSON output, so that any rule using this
    as an input triggers the checkpoint first.
    """
    checkpoint_result = checkpoints.contig_groups.get()
    return checkpoint_result.output[0]
215+
216+
217+
# Make the contig-group list files that contain the names of the contigs/scaffolds
218+
# that have been bin-packed above.
219+
rule contigs_group_list:
220+
input:
221+
contigs="calling/contig-groups/contigs.json",
222+
output:
223+
"calling/contig-groups/{contig}.bed",
224+
# log:
225+
# "logs/calling/contig-groups/{contig}.log",
226+
run:
227+
# Get the contigs file that we created above.
228+
contigs = json.load(open(input.contigs))
229+
230+
# Same for the group name itself: This rule is only executed
231+
# for group names that we actually have made.
232+
if wildcards.contig not in contigs:
233+
raise Exception("Internal error: contig " + wildcards.contig + " not found.")
234+
235+
# Write the output list file, using the contig names and lengths from the group.
236+
# In bed files, the first three columns are the chrom name, start (incluse, zero-based),
237+
# and end (exclusive). Hence, we can simply use 0 and length as start and end here.
238+
with open(output[0], "w") as f:
239+
f.writelines(f"{c[0]}\t0\t{c[1]}\n" for c in contigs[wildcards.contig])
240+
241+
# Rule is not submitted as a job to the cluster; it is cheap enough to run locally.
localrules:
    contigs_group_list,
247+
248+
249+
# Conflicting settings: contig grouping is incompatible with region restriction,
# so fail early with a clear message instead of producing wrong results downstream.
if config["settings"].get("restrict-regions"):
    raise Exception(
        "Cannot combine settings contig-group-size > 0 with restrict-regions "
        "at the moment, as we have not implemented this yet. "
        "If you need this combination of settings, please submit an issue to "
        "https://github.com/lczech/grenepipe/issues and we will see what we can do."
    )
# We now extended the rules for bcftools and freebayes to also work with small contig groups.
# The following check is no longer needed - just kept here for reference.
# if config["settings"]["calling-tool"] != "haplotypecaller":
#     raise Exception(
#         "Can only use setting contig-group-size with calling-tool haplotypecaller "
#         "at the moment, as we have not implemented this for other calling tools yet. "
#         "If you need this combination of settings, please submit an issue to "
#         "https://github.com/lczech/grenepipe/issues and we will see what we can do."
#     )

0 commit comments

Comments
 (0)