|
10 | 10 | It may be a bit slow, but hopefully it will pick the largest possible
|
11 | 11 | consistent sub-selection or, at least, get close to it.
|
12 | 12 |
|
13 |
| -The current method to select the data is rather heuristic. It is based on |
14 |
| -iteratively deleting those data elements whose configuration elements appear |
15 |
| -the least often. This is computed in absolute terms. I actually think |
16 |
| -normalizing this score should be better, but seemingly using the absolute raw |
17 |
| -score creates larger consistent datasets. So for now, this is how we will do |
18 |
| -it. |
| 13 | +The current method to select the data is rather heuristic. It always begins |
| 14 | +with the full set of data and aims to delete the element that will cause the |
| 15 | +fewest other deletions down the road, until we arrive at a consistent state. |
| 16 | +I strongly suspect that doing this perfectly would be NP-hard, so we cannot |
| 17 | +implement this. Instead, we use different heuristics and then pick the best |
| 18 | +result. |
19 | 19 | """
|
20 | 20 |
|
21 | 21 | from collections import Counter
|
@@ -452,6 +452,14 @@ def select_consistent(data: Iterable[T], log: bool = True,
|
452 | 452 | if log:
|
453 | 453 | logger(f"All setups now have the same score {max_score} under"
|
454 | 454 | f" {scorer_name!r}.")
|
| 455 | + if count <= best_length: |
| 456 | + if log: |
| 457 | + logger(f"We now only have {count} setups, which means we " |
| 458 | + "cannot get better than the current best set with " |
| 459 | + f"{best_length} setups, so we quit after score-" |
| 460 | + f"based cleaning under {scorer_name!r}.") |
| 461 | + count = -1 |
| 462 | + break |
455 | 463 |
|
456 | 464 | # If we get here, all elements have the same score.
|
457 | 465 | # This means that we are basically done.
|
@@ -521,6 +529,15 @@ def select_consistent(data: Iterable[T], log: bool = True,
|
521 | 529 | elif log:
|
522 | 530 | logger("No inconsistencies in algorithm/instance/objective/"
|
523 | 531 | f"encoding possible under {scorer_name!r}.")
|
| 532 | + if count <= best_length: |
| 533 | + if log: |
| 534 | + logger(f"We now only have {count} setups, which means we " |
| 535 | + "cannot get better than the current best set with " |
| 536 | + f"{best_length} setups, so we quit after algorithm" |
| 537 | + "/instance/objective/encoding cleaning under " |
| 538 | + f"{scorer_name!r}.") |
| 539 | + count = -1 |
| 540 | + break |
524 | 541 |
|
525 | 542 | # If we get here, the only problem left could be if algorithms
|
526 | 543 | # have different seeds for the same instances. We thus need to
|
@@ -597,13 +614,23 @@ def select_consistent(data: Iterable[T], log: bool = True,
|
597 | 614 | f"Seeds inconsistent under {scorer_name!r}.")
|
598 | 615 | del must_delete_from_insts
|
599 | 616 |
|
600 |
| - if (not changed) and log: |
601 |
| - logger(f"No seed inconsistencies under {scorer_name!r}.") |
602 | 617 | del seeds
|
| 618 | + if changed: |
| 619 | + if count <= best_length: |
| 620 | + if log: |
| 621 | + logger(f"We now only have {count} setups, which " |
| 622 | + "means we cannot get better than the current " |
| 623 | + f"best set with {best_length} setups, so we " |
| 624 | + "quit after seed-based cleaning under " |
| 625 | + f"{scorer_name!r}.") |
| 626 | + count = -1 |
| 627 | + break |
| 628 | + elif log: |
| 629 | + logger(f"No seed inconsistencies under {scorer_name!r}.") |
603 | 630 | # There should not be any problems left, but we need to check
|
604 |
| - # again. |
| 631 | + # again if something has changed. |
605 | 632 |
|
606 |
| - if count < 0: |
| 633 | + if count <= 0: |
607 | 634 | continue # We can do nothing here
|
608 | 635 |
|
609 | 636 | if count > best_length:
|
|
0 commit comments