import json
import math
import sys

# =================================================================================================
# Grouping Functions
# =================================================================================================


# Simple greedy solver for the bin packing problem, here used to make bins of roughly equal
# size for the contigs. We expect the input values to be pairs, where element [1] is the
# weight that we use for packing (element [0] is the contig name itself).
def solve_bin_packing(values, max_bin_size):
    # First, sort by length, largest first, in decreasing order.
    # This helps to get closer to an optimal solution.
    values.sort(key=lambda x: x[1], reverse=True)

    # Fill the bins as needed, using first-fit on the sorted list, and keeping track of
    # how much we already put in each of them. We can have at most as many bins as elements,
    # so use this to initialize the sums, to keep it simple.
    bins = []
    sums = [0] * len(values)
    for cont in values:
        # Find the first bin where the contig fits.
        j = 0
        while j < len(bins):
            if sums[j] + cont[1] <= max_bin_size:
                bins[j].append(cont)
                sums[j] += cont[1]
                break
            j += 1

        # If no bin could fit the contig, make a new bin.
        # This in particular catches contigs that are larger than the bin size, for example,
        # if some chromosomes are fully assembled and are hence not in scaffolds of small size.
        if j == len(bins):
            bins.append([])
            bins[j].append(cont)
            sums[j] += cont[1]

    # Log output
    # print("Contig group bin assignments for small contigs:")
    # for i in range(len(bins)):
    #     print(str(i) + ": [" + str(sums[i]) + "] " + str(bins[i]))

    # Return the bins with all contigs and their sizes.
    print("Success with", len(bins), "groups")
    return bins
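

# A minimal usage sketch, with hypothetical contig names and lengths (not from a real .fai):
# solve_bin_packing([("chr1", 500), ("scaf_1", 120), ("scaf_2", 80)], max_bin_size=200)
# would return [[("chr1", 500)], [("scaf_1", 120), ("scaf_2", 80)]]: the oversized "chr1"
# overflows into its own bin, while the two small scaffolds together fit into a second bin.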


# Alternative heuristic to assign contigs to groups, minimizing the number of groups created
# while respecting both the max contig group size and the max contigs per group, and also
# trying to improve load balancing by distributing short and long contigs across groups.
# The result is a list of lists of tuples. The outer list contains the groups. Each group then
# contains the list of tuples (contig name and length) of the group.
def optimize_contig_group(contigs, max_contig_group_size, max_contigs_per_group):
    # First, sort by length, largest first, in decreasing order.
    contigs.sort(key=lambda x: x[1], reverse=True)

    # Next, fill the final group list with all contigs that are larger than the group size.
    # We do not split existing chromosomes/contigs, so we have to live with them being too large.
    # Also, collect all smaller contigs (smaller than the max group size), for further processing.
    large_groups = []
    small_contigs = []
    for contig in contigs:
        if contig[1] >= max_contig_group_size:
            large_groups.append([contig])
        else:
            small_contigs.append(contig)
    print("Found", len(large_groups), "large contigs and", len(small_contigs), "small contigs")

    # No small contigs: we are done.
    if len(small_contigs) == 0:
        return large_groups

    # Now we have a list of small contigs for which to create new groups.
    # We do that in a loop until we succeed in fulfilling all boundary conditions.
    # In particular, we want to limit the number of contigs per group, and not exceed
    # the sum of contig lengths per group. In each iteration, we increase the number
    # of groups created by one, hence guaranteeing success at some point.
    # There might be a better solution to this, but this is simple and works.
    # We start with as many groups as we minimally need such that we can fulfill the max
    # contigs per group constraint.
    num_groups = max(1, int(math.ceil(len(small_contigs) / max_contigs_per_group)))
    need_iteration = True
    while need_iteration:
        print("Evaluating with", num_groups, "small contig groups")
        need_iteration = False

        # We fill a temporary list for the small contigs, starting with as many empty lists
        # as we have groups, and then fill each group's list with its contigs.
        # We do this by alternating between filling forwards and backwards - that is the
        # heuristic that is meant to avoid having groups with only small or only large contigs.
        # By going back and forth instead of filling from the start, we get a more even spread.
        # Let's call this the zig-zag round-robin assignment :-)
        # For instance, with nine contigs and three groups, we should get:
        # [[1, 6, 7], [2, 5, 8], [3, 4, 9]]
        small_groups = [[] for _ in range(num_groups)]
        round_num = 0
        index = 0

        # Continue until all contigs have been distributed.
        while index < len(small_contigs):
            # Determine the order based on whether the round is even (forward) or odd (backward).
            if round_num % 2 == 0:
                order = range(num_groups)
            else:
                order = range(num_groups - 1, -1, -1)

            # Distribute one element per group in the specified order.
            for i in order:
                if index >= len(small_contigs):
                    break
                small_groups[i].append(small_contigs[index])
                index += 1
            round_num += 1

        # Now we test if this was successful: Is each group small enough, both in terms
        # of total length, and in terms of number of contigs per group?
        small_contig_count = 0
        for group in small_groups:
            total_len = sum(contig[1] for contig in group)
            if (total_len > max_contig_group_size) or (len(group) > max_contigs_per_group):
                print(
                    "Not enough groups. Got group length",
                    total_len,
                    ">",
                    max_contig_group_size,
                    "or got contigs per group",
                    len(group),
                    ">",
                    max_contigs_per_group,
                )
                need_iteration = True
                num_groups += 1
                break
            small_contig_count += len(group)

    # Now we are done. Make sure that we have processed the right number of contigs.
    # Then, add all small contigs as groups to our final result, and return it.
    assert small_contig_count == len(small_contigs)
    print(
        "Success with",
        num_groups,
        "small contig groups, and",
        (len(large_groups) + len(small_groups)),
        "total groups",
    )
    return large_groups + small_groups
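

# A minimal usage sketch with hypothetical values: with a max group size of 100 and at most
# two contigs per group, optimize_contig_group([("a", 60), ("b", 50), ("c", 40), ("d", 30)], 100, 2)
# starts with ceil(4/2) = 2 groups, zig-zags to [("a", 60), ("d", 30)] and [("b", 50), ("c", 40)]
# with total lengths 90 and 90, so both constraints hold and those two groups are returned.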


# =================================================================================================
# Checkpoints
# =================================================================================================


checkpoint contig_groups:
    input:
        fai=get_fai,
    output:
        "calling/contig-groups/contigs.json",
    log:
        "logs/calling/contig-groups/contigs.log",
    params:
        min_contig_size=config["settings"].get("min-contig-size", 0),
        contig_group_size=config["settings"].get("contig-group-size", 0),
        max_contigs_per_group=config["settings"].get("max-contigs-per-group", 0),
    run:
        # Open the log file in write mode.
        log_file = open(log[0], "w")
        # Redirect both stdout and stderr to the log file. Snakemake does not do that for us...
        # Also, do not add another line of comment above. It took me half an hour to figure out
        # why snakefmt broke... https://github.com/snakemake/snakefmt/issues/196
        # Why does the snakemake workflow catalogue require proper snakefmt formatting,
        # if that tool is clearly not ready for production usage?
        sys.stdout = log_file
        sys.stderr = log_file

        # Solve the bin packing for the contigs, to get a close to optimal solution
        # for putting them in groups. Large contigs (e.g., whole chromosomes) that are larger
        # than the bin size will simply get their own (overflowing...) bin.
        contig_list = read_contigs_from_fai(input.fai, params.min_contig_size)
        if params.max_contigs_per_group == 0:
            print("Running bin packing solver")
            contig_groups = solve_bin_packing(contig_list, params.contig_group_size)
        else:
            print("Running heuristic optimizer")
            contig_groups = optimize_contig_group(
                contig_list, params.contig_group_size, params.max_contigs_per_group
            )

        # Now turn the contig bins into named groups for the result of this checkpoint.
        # The result is a dict from group name to the list of (contig name, length) tuples
        # of the contigs in that group.
        contigs = {}
        for group in contig_groups:
            groupname = "contig-group-" + str(len(contigs))
            contigs[groupname] = group

        # We need to store the result in a file, so that the rule that creates the per-contig
        # files can access it.
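        # As a sketch of the (hypothetical) resulting structure: json.dump writes the tuples
        # as lists, so the file contains something like
        # {"contig-group-0": [["chr1", 500]], "contig-group-1": [["scaf_1", 120], ["scaf_2", 80]]}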
        with open(output[0], "w") as f:
            json.dump(contigs, f)


# Rule is not submitted as a job to the cluster.
localrules:
    contig_groups,


# Due to a new bug in Snakemake after our update to 8.15.2, we now need the following
# function to be called as input whenever the above checkpoint is needed.
# See https://github.com/snakemake/snakemake/issues/2958 for details.
def contigs_groups_input(wildcards):
    return checkpoints.contig_groups.get().output[0]
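

# A downstream rule would then request this function as an input, so that Snakemake evaluates
# the checkpoint first. A minimal sketch (the rule name and its other input/output are
# hypothetical; the actual consuming rules live elsewhere in this workflow):
#
# rule example_consumer:
#     input:
#         groups=contigs_groups_input,
#         bed="calling/contig-groups/{contig}.bed",
#     output:
#         "calling/{contig}.done",
#     shell:
#         "touch {output}"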


# Make the contig-group list files that contain the names of the contigs/scaffolds
# that have been bin-packed above.
rule contigs_group_list:
    input:
        contigs="calling/contig-groups/contigs.json",
    output:
        "calling/contig-groups/{contig}.bed",
    # log:
    #     "logs/calling/contig-groups/{contig}.log",
    run:
        # Get the contigs file that we created above.
        contigs = json.load(open(input.contigs))

        # Check the group name itself: this rule is only meant to be executed
        # for group names that we actually have made above.
        if wildcards.contig not in contigs:
            raise Exception("Internal error: contig " + wildcards.contig + " not found.")

        # Write the output list file, using the contig names and lengths from the group.
        # In bed files, the first three columns are the chrom name, start (inclusive, zero-based),
        # and end (exclusive). Hence, we can simply use 0 and length as start and end here.
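        # For instance, a group holding the (hypothetical) contigs [["scaf_1", 120], ["scaf_2", 80]]
        # yields the two lines "scaf_1\t0\t120" and "scaf_2\t0\t80" in the resulting bed file.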
        with open(output[0], "w") as f:
            f.writelines(f"{c[0]}\t0\t{c[1]}\n" for c in contigs[wildcards.contig])


# Rule is not submitted as a job to the cluster.
localrules:
    contigs_group_list,


# Check for conflicting settings.
if config["settings"].get("restrict-regions"):
    raise Exception(
        "Cannot combine settings contig-group-size > 0 with restrict-regions "
        "at the moment, as we have not implemented this yet. "
        "If you need this combination of settings, please submit an issue to "
        "https://github.com/lczech/grenepipe/issues and we will see what we can do."
    )

# We have now extended the rules for bcftools and freebayes to also work with small contig groups.
# The following check is no longer needed - just kept here for reference.
# if config["settings"]["calling-tool"] != "haplotypecaller":
#     raise Exception(
#         "Can only use setting contig-group-size with calling-tool haplotypecaller "
#         "at the moment, as we have not implemented this for other calling tools yet. "
#         "If you need this combination of settings, please submit an issue to "
#         "https://github.com/lczech/grenepipe/issues and we will see what we can do."
#     )