Colorkey blit optimization #2696

itzpr3d4t0r · 2024-01-27T14:39:47Z

Most of pygame game types consist of pixel art, which oftentimes utilizes colokey blitting for effective use of pixel assets. This PR optimizes that with AVX2/SSE2 significantly. This strategy is limited to 32bit surfaces that don't use RLE.

The improvements vary widely with surface size so refer to these graphs for a better visualization:

Current VS AVX2

Current VS SSE2

SSE2VS AVX2

My actual timings files:
blit_colorkey_AVX2.json
blit_colorkey_old.json
blit_colorkey_SSE2.json

All made with the following test program:

import pygame.draw

from utils.data_utils import Plotter, Evaluator
from pygame import Surface


def tests_setup(curr_size: int, g: dict):
    base = Surface((curr_size, curr_size))
    surf = Surface((curr_size, curr_size))
    surf.fill(0xFF00FF00)
    surf.set_colorkey(0xFF00FF00)

    cs2 = curr_size // 2

    pygame.draw.circle(surf, 0xFFFF0000, (cs2, cs2), cs2)

    g["surf"] = surf
    g["base"] = base


tests = [
    ("blit_colorkey_PR", "base.blit(surf, (0, 0))"),
]

Evaluator(tests, tests_setup, max_size=1000, reps=100, num=1).run()

files = [
    ("blit_colorkey_old", "white"),
    ("blit_colorkey_AVX2", "green"),
    ("blit_colorkey_SSE2", "blue"),
]

p = Plotter("colorkey_blit", files)
p.plot_tests()
# p.compare([(0, 1)])

And this is the data_utils file if you're interested:

import json

from matplotlib import pyplot as plt
import matplotlib.colors as mcolors

from statistics import mean, stdev, median
from timeit import repeat

COLORS = list(mcolors.CSS4_COLORS.keys())

__all__ = ["Plotter", "Evaluator"]

JSON_DIR = "files_json"

class Plotter:

    def __init__(self, title: str, tests: list, mode: str = "MIN",
                 limit_to_range: int = -1):
        plt.style.use(["dark_background"])
        self.title = title
        self.tests = tests

        self.mode = mode
        self.mode_func = None
        self.limit_to_range = limit_to_range
        self.filter(mode)

    def filter(self, mode: str):
        self.mode = mode

        match mode:
            case "MEAN":
                self.mode_func = mean
            case "MIN":
                self.mode_func = min
            case "MAX":
                self.mode_func = max
            case "MEDIAN":
                self.mode_func = median

    def plot_tests(self):
        for file_name, color in self.tests:
            try:
                with open(f"{JSON_DIR}/{file_name}.json", "r") as f:
                    data = json.load(f)
            except FileNotFoundError:
                print(f"File {file_name}.json not found!")
                quit()

            timings = [self.mode_func(dp) for dp in data["data"]][:self.limit_to_range]

            print(f"=== {file_name} ===")
            print(
                f"Total: {sum([sum(data_point) for data_point in data['data']])}\n"
                f"Mean: {mean(timings)}\n"
                f"Median: {median(timings)}\n"
                f"Stdev: {stdev(timings)}"
            )
            print()
            plt.plot(timings, color=color, label=file_name, linewidth=1)
        plt.legend()
        plt.title(self.title)
        plt.xlabel("Surface size (px)")
        plt.ylabel("Time (s)")
        plt.show()

    def compare(self, indices: list[tuple[int, int]], c1="white", c2="lime"):
        for i1, i2 in indices:
            filename_1, _ = self.tests[i1]
            filename_2, _ = self.tests[i2]

            try:
                with open(f"{JSON_DIR}/{filename_1}.json", "r") as f:
                    data_1 = json.load(f)
            except FileNotFoundError:
                print(f"File {filename_1}.json not found!")
                quit()

            try:
                with open(f"{JSON_DIR}/{filename_2}.json", "r") as f:
                    data_2 = json.load(f)
            except FileNotFoundError:
                print(f"File {filename_2}.json not found!")
                quit()

            timings_1 = [self.mode_func(dp) for dp in data_1["data"]][
                        :self.limit_to_range]
            timings_2 = [self.mode_func(dp) for dp in data_2["data"]][
                        :self.limit_to_range]

            plt.figure(figsize=(10, 5))
            curr_index = 1

            plt.subplot(len(indices), 2, curr_index)
            plt.scatter(range(len(timings_1)), timings_1, color=c1, label=filename_1,
                        s=.5)
            plt.scatter(range(len(timings_1)), timings_2, color=c2, label=filename_2,
                        s=.5)
            plt.legend()
            plt.title("Timings")
            plt.xlabel("Surface size (px)")
            plt.ylabel("Time (s)")

            plt.subplot(len(indices), 2, curr_index + 1)
            comparative_data = [
                100 * ((t1 / t2) - 1) for t1, t2 in zip(timings_1, timings_2)
            ]
            plt.scatter(range(len(comparative_data)), comparative_data, color="red", s=1)
            plt.plot([0] * len(comparative_data), color="green", linewidth=2)
            plt.title("Relative % improvement")
            plt.xlabel("Surface size (px)")

        plt.show()


class Evaluator:
    def __init__(self, tests: list, tests_setup, pre_test_setup=None,
                 max_size: int = 1000, reps: int = 30,
                 num: int = 1):
        self.tests = tests
        self.tests_setup = tests_setup
        self.max_size = max_size
        self.reps = reps
        self.num = num
        self.G = {}

        if pre_test_setup:
            try:
                pre_test_setup(self.G)
            except TypeError:
                raise TypeError("pre_test_setup must be a callable")

    def run(self):
        for test_name, statement in self.tests:
            data = {"title": test_name, "data": []}

            print(f"\n========| {test_name.upper()} |========")

            for curr_size in range(1, self.max_size + 1):

                self.print_progress_bar(curr_size)

                self.tests_setup(curr_size, self.G)
                data["data"].append(self.run_test(statement))

            with open(f"{JSON_DIR}/{test_name}.json", "w") as f:
                json.dump(data, f)

    def print_progress_bar(self, curr_size: int):
        amt = 3 * curr_size // 100
        print(
            "\r[" + "▪" * amt + " " * (3 * 10 - amt) + f"] {curr_size} | {self.max_size}", end=""
        )

    def run_test(self, statement):
        return repeat(statement, globals=self.G, number=self.num, repeat=self.reps)

Starbuck5 · 2024-01-30T07:25:25Z

When I read the title, I assumed this was a SIMD implementation of alphablit_colorkey. As far as I can understand, that's not it at all.

What you've made is not for alpha blitting all, it's another special cased blit routine to take over from SDL.

Therefore I don't think should be in pygameBlit, because everything in pygameBlit is for alpha or blend flags? If you can determine what code path it's meant for in surface.blit, why not send it there directly?

Secondly, could this be upstreamed to SDL? That way we don't need a special path at all, and everyone's code gets faster.

itzpr3d4t0r · 2024-02-04T12:29:47Z

When I read the title, I assumed this was a SIMD implementation of alphablit_colorkey. As far as I can understand, that's not it at all. What you've made is not for alpha blitting all, it's another special cased blit routine to take over from SDL.

Yes, that's correct. It's an AVX/SSE implementation of the staight colorkey blit.

Therefore I don't think should be in pygameBlit, because everything in pygameBlit is for alpha or blend flags? If you can determine what code path it's meant for in surface.blit, why not send it there directly?

I included it here to prevent redundant code. Creating a separate function would entail duplicating the rect clipping logic and other checks, which seemed impractical.

Secondly, could this be upstreamed to SDL? That way we don't need a special path at all, and everyone's code gets faster.

It could but how long would we have to wait for it? I've seen people suggest avx/sse improvements to old code and in all got flagged for 3.2.0 at best so I'm afraid we would never see these improvements soon. And generally speaking SDL doesn't look to implement much past SSE code from what I've seen.
So the wait could be very long for something that would impact tons of pygame games immediately.

And a note, these improvements, even in the sse case, look to be even better than our best AVX alpha/premultiplied alpha implementations.

itzpr3d4t0r · 2024-02-04T12:34:59Z

I believe that with pygame having "game" in its name and many softwares made with it being pixel art games, this change is really important and worth implementing it directly here and not going through SDL. Besides, it won't require us changing it or expanding it again since it's a unique case.

itzpr3d4t0r · 2024-02-04T15:09:56Z

A little test program for reproducibility (without this PR i get around 200/250 fps and with this PR 470/520):
Images:

import pygame
from random import randint

pygame.init()

screen = pygame.display.set_mode((1000, 1000))

N_TREES = 150
N_GRASS = 4000
N_FLOWERS = 1000

tree = pygame.image.load("tree.png").convert()
tile = pygame.image.load("grass_tile.png").convert()
grass_img = pygame.image.load("grass.png").convert()
flower_img = pygame.image.load("flower.png").convert()

for surf in [tree, tile, grass_img, flower_img]:
    surf.set_colorkey((255, 0, 220))

tiles = [
    (tile, (j * tile.get_width(), i * tile.get_height())) for i in range(56) for j in
    range(32)
]
off_x = tile.get_width() // 2
off_y = tile.get_height() // 2

tiles.extend(
    [
        (tile, (j * tile.get_width() - off_x, i * tile.get_height() - off_y)) for i in
        range(56) for j in range(32)
    ]
)

trees = [
    (tree, [randint(0, 1000 - s) for s in tree.get_size()]) for _ in range(N_TREES)
]

grass = [
    (grass_img, [randint(0, 1000 - s) for s in grass_img.get_size()]) for _ in
    range(N_GRASS)
]

flowers = [
    (flower_img, [randint(0, 1000 - s) for s in flower_img.get_size()]) for _ in
    range(N_FLOWERS)
]

for collection in [trees, tiles, grass, flowers]:
    collection.sort(key=lambda x: x[1][1])

clock = pygame.Clock()

font = pygame.font.SysFont("Arial", 25, True)

all = tiles
all.extend(grass)
all.extend(flowers)
all.extend(trees)

total_entities = N_TREES + N_GRASS + N_FLOWERS + len(tiles)

while True:
    clock.tick_busy_loop(1000)

    for event in pygame.event.get():
        if event.type == pygame.QUIT:
            pygame.quit()
            quit()

    screen.fblits(all)

    fps_text = font.render(f"FPS: {clock.get_fps():.2f} | Entities: {total_entities}",
                           True, (255, 255, 255), bgcolor=0)
    screen.blit(fps_text, (0, 0))

    pygame.display.flip()

MyreMylar

LGTM 👍

I think a fair comparison is probably between the current SDL color key path using RLE acceleration and this method which doesn't use RLE. However, AVX2 comes out on top either way and it looks to me to be pretty much a dead heat between SDL RLE path and the SSE2 version of this function.

Might be worth adding a documentation note to the set_colorkey function if this gets merged in that on AVX2 platforms (windows, non-arm linux) that 32 bit colorkeys will be faster without RLE.

Or we just bite the bullet and deprecate RLE for 32 bit entirely in pygame. Though maybe we could AVX2 8bit colorkey stuff as well... hmm.

On SDL2 - while it would be nice to upstream stuff to SDL2 I don't see why we can't slip this in to pygame for now and leave upstreaming stuff as future work. There is probably lots of our AVX2 blitting code that could be upstreamed to SDL, not just this.

Starbuck5 · 2024-02-26T07:00:28Z

I believe that with pygame having "game" in its name and many softwares made with it being pixel art games, this change is really important and worth implementing it directly here and not going through SDL. Besides, it won't require us changing it or expanding it again since it's a unique case.

I think you don't want to implement it in SDL because you're less familiar with the development process and foibles of contributing to SDL, you think it would be way harder than implementing it here, and you're probably right.

It would be the most significant contribution to SDL by a pygame(-ce) contributor in my memory. I only suggested it because I think you have what it takes to actually do it.

On SDL2 - while it would be nice to upstream stuff to SDL2 I don't see why we can't slip this in to pygame for now and leave upstreaming stuff as future work. There is probably lots of our AVX2 blitting code that could be upstreamed to SDL, not just this.

I don't think this is the case. I think the reason we have blitting code at all is because the algorithm changed between SDL1 and SDL2. We can't upstream what isn't compatible with what they have.

itzpr3d4t0r · 2024-03-28T10:29:00Z

I don't get if this is a strong/weak no or a strong/weak yes @Starbuck5.

itzpr3d4t0r added Performance Related to the speed or resource usage of the project SIMD Surface pygame.Surface labels Jan 27, 2024

itzpr3d4t0r requested a review from a team as a code owner January 27, 2024 14:39

itzpr3d4t0r requested review from MyreMylar and Starbuck5 January 28, 2024 10:41

itzpr3d4t0r added 3 commits January 29, 2024 13:39

optimized colorkey blit

7f9794a

format

9b1ae5e

refactors

c79ca8a

itzpr3d4t0r force-pushed the colorkey_blit_optimization branch from 4184342 to c79ca8a Compare January 29, 2024 12:39

MyreMylar approved these changes Feb 4, 2024

View reviewed changes

itzpr3d4t0r added 2 commits February 4, 2024 19:49

Merge branch 'main' into colorkey_blit_optimization

63f6888

fix

0437c07

itzpr3d4t0r added 4 commits May 21, 2024 12:17

Merge branch 'main' into colorkey_blit_optimization

f85650f

fixes

4dd13a5

fix

c7966d9

fix

5635378

itzpr3d4t0r closed this Jun 2, 2024

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Colorkey blit optimization #2696

Colorkey blit optimization #2696

itzpr3d4t0r commented Jan 27, 2024 •

edited

Starbuck5 commented Jan 30, 2024

itzpr3d4t0r commented Feb 4, 2024 •

edited

itzpr3d4t0r commented Feb 4, 2024 •

edited

itzpr3d4t0r commented Feb 4, 2024

MyreMylar left a comment

Starbuck5 commented Feb 26, 2024

itzpr3d4t0r commented Mar 28, 2024

Colorkey blit optimization #2696

Colorkey blit optimization #2696

Conversation

itzpr3d4t0r commented Jan 27, 2024 • edited

Starbuck5 commented Jan 30, 2024

itzpr3d4t0r commented Feb 4, 2024 • edited

itzpr3d4t0r commented Feb 4, 2024 • edited

itzpr3d4t0r commented Feb 4, 2024

MyreMylar left a comment

Choose a reason for hiding this comment

Starbuck5 commented Feb 26, 2024

itzpr3d4t0r commented Mar 28, 2024

itzpr3d4t0r commented Jan 27, 2024 •

edited

itzpr3d4t0r commented Feb 4, 2024 •

edited

itzpr3d4t0r commented Feb 4, 2024 •

edited