Skip to content

Commit 4f30afe

Browse files
jcchavezsjptosso
andauthored
chore: adds memoize implementation for regexes and ahocorasick (#836)
* chore: adds memoize implementation for regexes. Currently we create and allocate memory for every regex we compile, however there are cases where you compile the same regex over and over e.g. corazawaf/coraza-caddy#76. Here we implement the memoize pattern to be able to reuse the regex and reduce the memory consumption. * docs: adds comments to code. * chore: simplify the memoize package by using sync.Map. * feat: extends memoize to ahocorasick and allow impl for tinygo but not synced as no concurrency. * tests: covers memoize_builders in tinygo. * chore: fixes nosync for tinygo. * docs: updates docs. --------- Co-authored-by: Juan Pablo Tosso <[email protected]>
1 parent b3490b4 commit 4f30afe

File tree

21 files changed

+518
-26
lines changed

21 files changed

+518
-26
lines changed

.github/workflows/tinygo.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,3 +47,6 @@ jobs:
4747
4848
- name: Tests
4949
run: tinygo test ./...
50+
51+
- name: Tests memoize
52+
run: tinygo test -tags=memoize_builders ./...

README.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,9 @@ have compatibility guarantees across minor versions - use with care.
106106
the operator with `plugins.RegisterOperator` to reduce binary size / startup overhead.
107107
* `coraza.rule.multiphase_valuation` - enables evaluation of rule variables in the phases that they are ready, not
108108
only the phase the rule is defined for.
109+
* `memoize_builders` - enables memoization of builders for regex and aho-corasick
110+
dictionaries to reduce memory consumption in deployments that launch several coraza
111+
instances. For more context check [this issue](https://github.com/corazawaf/coraza-caddy/issues/76)
109112

110113
## E2E Testing
111114

go.mod

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ require (
2424
github.com/petar-dambovaliev/aho-corasick v0.0.0-20211021192214-5ab2d9280aa9
2525
github.com/tidwall/gjson v1.14.4
2626
golang.org/x/net v0.11.0
27+
golang.org/x/sync v0.1.0
2728
rsc.io/binaryregexp v0.2.0
2829
)
2930

go.sum

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ golang.org/x/net v0.11.0/go.mod h1:2L/ixqYpgIVXmeoSA/4Lu7BzTG4KIyPIryS4IsOd1oQ=
3737
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
3838
golang.org/x/sync v0.0.0-20210220032951-036812b2e83c/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
3939
golang.org/x/sync v0.1.0 h1:wsuoTGHzEhffawBOhz5CYhcrV4IdKZbEyZjBMuTp12o=
40+
golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
4041
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
4142
golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
4243
golang.org/x/sys v0.0.0-20190922100055-0a153f010e69/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=

internal/corazawaf/rule.go

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ import (
1515
"github.com/corazawaf/coraza/v3/experimental/plugins/macro"
1616
"github.com/corazawaf/coraza/v3/experimental/plugins/plugintypes"
1717
"github.com/corazawaf/coraza/v3/internal/corazarules"
18+
"github.com/corazawaf/coraza/v3/internal/memoize"
1819
"github.com/corazawaf/coraza/v3/types"
1920
"github.com/corazawaf/coraza/v3/types/variables"
2021
)
@@ -456,7 +457,12 @@ func (r *Rule) AddVariable(v variables.RuleVariable, key string, iscount bool) e
456457
var re *regexp.Regexp
457458
if len(key) > 2 && key[0] == '/' && key[len(key)-1] == '/' {
458459
key = key[1 : len(key)-1]
459-
re = regexp.MustCompile(key)
460+
461+
if vare, err := memoize.Do(key, func() (interface{}, error) { return regexp.Compile(key) }); err != nil {
462+
return err
463+
} else {
464+
re = vare.(*regexp.Regexp)
465+
}
460466
}
461467

462468
if multiphaseEvaluation {
@@ -521,7 +527,11 @@ func (r *Rule) AddVariableNegation(v variables.RuleVariable, key string) error {
521527
var re *regexp.Regexp
522528
if len(key) > 2 && key[0] == '/' && key[len(key)-1] == '/' {
523529
key = key[1 : len(key)-1]
524-
re = regexp.MustCompile(key)
530+
if vare, err := memoize.Do(key, func() (interface{}, error) { return regexp.Compile(key) }); err != nil {
531+
return err
532+
} else {
533+
re = vare.(*regexp.Regexp)
534+
}
525535
}
526536
// Prevent sigsev
527537
if r == nil {

internal/memoize/README.md

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
# Memoize
2+
3+
Memoize caches the results of certain expensive function calls so
4+
they can be reused. The main advantage in Coraza is to memoize
5+
the regexes and aho-corasick dictionaries when a connector
6+
spins up more than one WAF in the same process and hence the same
7+
regexes are being compiled over and over.
8+
9+
Currently it is opt-in under the `memoize_builders` build tag
10+
because misuse (e.g. calling it after build time) could lead
11+
to a memory leak, as the cache is currently global.
12+
13+
**Important:** Connectors with *live reload* functionality (e.g. Caddy)
14+
could leak memory, which might or might not be negligible in
15+
most cases, as config changes in a WAF usually affect only a few
16+
rules; that is, old objects will still be alive in memory until the program
17+
stops.

internal/memoize/noop.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
// Copyright 2023 Juan Pablo Tosso and the OWASP Coraza contributors
2+
// SPDX-License-Identifier: Apache-2.0
3+
4+
//go:build !memoize_builders
5+
6+
package memoize
7+
8+
// Do is the pass-through variant used when the memoize_builders build tag is
// not set: the key is ignored, nothing is cached, and fn is invoked on every
// call with its results returned unchanged.
func Do(_ string, fn func() (interface{}, error)) (interface{}, error) {
	result, err := fn()
	return result, err
}

internal/memoize/nosync.go

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
// Copyright 2023 Juan Pablo Tosso and the OWASP Coraza contributors
2+
// SPDX-License-Identifier: Apache-2.0
3+
4+
//go:build tinygo && memoize_builders
5+
6+
package memoize
7+
8+
import "sync"
9+
10+
var doer = makeDoer(new(sync.Map))
11+
12+
// Do executes and returns the results of the given function, unless there was a cached
13+
// value of the same key. Only one execution is in-flight for a given key at a time.
14+
// The boolean return value indicates whether v was previously stored.
15+
func Do(key string, fn func() (interface{}, error)) (interface{}, error) {
16+
value, err, _ := doer(key, fn)
17+
return value, err
18+
}
19+
20+
// makeDoer builds a memoizing function on top of the given cache.
//
// The returned function looks key up in cache first. On a hit it returns the
// stored value, a nil error and true. On a miss it calls fn, stores the result
// only when fn succeeded, and returns fn's results together with false.
func makeDoer(cache *sync.Map) func(string, func() (interface{}, error)) (interface{}, error, bool) {
	return func(key string, fn func() (interface{}, error)) (interface{}, error, bool) {
		if hit, ok := cache.Load(key); ok {
			return hit, nil, true
		}

		result, err := fn()
		if err != nil {
			// Failures are never memoized, so the next call retries fn.
			return result, err, false
		}

		cache.Store(key, result)
		return result, nil, false
	}
}

internal/memoize/nosync_test.go

Lines changed: 167 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,167 @@
1+
// Copyright 2023 Juan Pablo Tosso and the OWASP Coraza contributors
2+
// SPDX-License-Identifier: Apache-2.0
3+
4+
//go:build tinygo && memoize_builders
5+
6+
// https://github.com/kofalt/go-memoize/blob/master/memoize.go
7+
8+
package memoize
9+
10+
import (
11+
"errors"
12+
"sync"
13+
"testing"
14+
)
15+
16+
func TestDo(t *testing.T) {
17+
expensiveCalls := 0
18+
19+
// Function tracks how many times its been called
20+
expensive := func() (interface{}, error) {
21+
expensiveCalls++
22+
return expensiveCalls, nil
23+
}
24+
25+
// First call SHOULD NOT be cached
26+
result, err := Do("key1", expensive)
27+
if err != nil {
28+
t.Fatalf("unexpected error: %s", err.Error())
29+
}
30+
31+
if want, have := 1, result.(int); want != have {
32+
t.Fatalf("unexpected value, want %d, have %d", want, have)
33+
}
34+
35+
// Second call on same key SHOULD be cached
36+
result, err = Do("key1", expensive)
37+
if err != nil {
38+
t.Fatalf("unexpected error: %s", err.Error())
39+
}
40+
41+
if want, have := 1, result.(int); want != have {
42+
t.Fatalf("unexpected value, want %d, have %d", want, have)
43+
}
44+
45+
// First call on a new key SHOULD NOT be cached
46+
result, err = Do("key2", expensive)
47+
if err != nil {
48+
t.Fatalf("unexpected error: %s", err.Error())
49+
}
50+
51+
if want, have := 2, result.(int); want != have {
52+
t.Fatalf("unexpected value, want %d, have %d", want, have)
53+
}
54+
}
55+
56+
func TestSuccessCall(t *testing.T) {
57+
do := makeDoer(new(sync.Map))
58+
59+
expensiveCalls := 0
60+
61+
// Function tracks how many times its been called
62+
expensive := func() (interface{}, error) {
63+
expensiveCalls++
64+
return expensiveCalls, nil
65+
}
66+
67+
// First call SHOULD NOT be cached
68+
result, err, cached := do("key1", expensive)
69+
if err != nil {
70+
t.Fatalf("unexpected error: %s", err.Error())
71+
}
72+
73+
if want, have := 1, result.(int); want != have {
74+
t.Fatalf("unexpected value, want %d, have %d", want, have)
75+
}
76+
77+
if want, have := false, cached; want != have {
78+
t.Fatalf("unexpected caching, want %t, have %t", want, have)
79+
}
80+
81+
// Second call on same key SHOULD be cached
82+
result, err, cached = do("key1", expensive)
83+
if err != nil {
84+
t.Fatalf("unexpected error: %s", err.Error())
85+
}
86+
87+
if want, have := 1, result.(int); want != have {
88+
t.Fatalf("unexpected value, want %d, have %d", want, have)
89+
}
90+
91+
if want, have := true, cached; want != have {
92+
t.Fatalf("unexpected caching, want %t, have %t", want, have)
93+
}
94+
95+
// First call on a new key SHOULD NOT be cached
96+
result, err, cached = do("key2", expensive)
97+
if err != nil {
98+
t.Fatalf("unexpected error: %s", err.Error())
99+
}
100+
101+
if want, have := 2, result.(int); want != have {
102+
t.Fatalf("unexpected value, want %d, have %d", want, have)
103+
}
104+
105+
if want, have := false, cached; want != have {
106+
t.Fatalf("unexpected caching, want %t, have %t", want, have)
107+
}
108+
}
109+
110+
func TestFailedCall(t *testing.T) {
111+
do := makeDoer(new(sync.Map))
112+
113+
calls := 0
114+
115+
// This function will fail IFF it has not been called before.
116+
twoForTheMoney := func() (interface{}, error) {
117+
calls++
118+
119+
if calls == 1 {
120+
return calls, errors.New("Try again")
121+
} else {
122+
return calls, nil
123+
}
124+
}
125+
126+
// First call should fail, and not be cached
127+
result, err, cached := do("key1", twoForTheMoney)
128+
if err == nil {
129+
t.Fatalf("expected error")
130+
}
131+
132+
if want, have := 1, result.(int); want != have {
133+
t.Fatalf("unexpected value, want %d, have %d", want, have)
134+
}
135+
136+
if want, have := false, cached; want != have {
137+
t.Fatalf("unexpected caching, want %t, have %t", want, have)
138+
}
139+
140+
// Second call should succeed, and not be cached
141+
result, err, cached = do("key1", twoForTheMoney)
142+
if err != nil {
143+
t.Fatalf("unexpected error: %s", err.Error())
144+
}
145+
146+
if want, have := 2, result.(int); want != have {
147+
t.Fatalf("unexpected value, want %d, have %d", want, have)
148+
}
149+
150+
if want, have := false, cached; want != have {
151+
t.Fatalf("unexpected caching, want %t, have %t", want, have)
152+
}
153+
154+
// Third call should succeed, and be cached
155+
result, err, cached = do("key1", twoForTheMoney)
156+
if err != nil {
157+
t.Fatalf("unexpected error: %s", err.Error())
158+
}
159+
160+
if want, have := 2, result.(int); want != have {
161+
t.Fatalf("unexpected value, want %d, have %d", want, have)
162+
}
163+
164+
if want, have := true, cached; want != have {
165+
t.Fatalf("unexpected caching, want %t, have %t", want, have)
166+
}
167+
}

internal/memoize/sync.go

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
// Copyright 2023 Juan Pablo Tosso and the OWASP Coraza contributors
2+
// SPDX-License-Identifier: Apache-2.0
3+
4+
//go:build !tinygo && memoize_builders
5+
6+
// https://github.com/kofalt/go-memoize/blob/master/memoize.go
7+
8+
package memoize
9+
10+
import (
11+
"sync"
12+
13+
"golang.org/x/sync/singleflight"
14+
)
15+
16+
var doer = makeDoer(new(sync.Map), new(singleflight.Group))
17+
18+
// Do executes and returns the results of the given function, unless there was a cached
19+
// value of the same key. Only one execution is in-flight for a given key at a time.
20+
// The boolean return value indicates whether v was previously stored.
21+
func Do(key string, fn func() (interface{}, error)) (interface{}, error) {
22+
value, err, _ := doer(key, fn)
23+
return value, err
24+
}
25+
26+
// makeDoer returns a function that executes and returns the results of the given function
27+
func makeDoer(cache *sync.Map, group *singleflight.Group) func(string, func() (interface{}, error)) (interface{}, error, bool) {
28+
return func(key string, fn func() (interface{}, error)) (interface{}, error, bool) {
29+
// Check cache
30+
value, found := cache.Load(key)
31+
if found {
32+
return value, nil, true
33+
}
34+
35+
// Combine memoized function with a cache store
36+
value, err, _ := group.Do(key, func() (interface{}, error) {
37+
data, innerErr := fn()
38+
if innerErr == nil {
39+
cache.Store(key, data)
40+
}
41+
42+
return data, innerErr
43+
})
44+
45+
return value, err, false
46+
}
47+
}

0 commit comments

Comments
 (0)