forked from dgryski/go-boomphf
-
Notifications
You must be signed in to change notification settings - Fork 0
/
boomphf.go
181 lines (141 loc) · 3.28 KB
/
boomphf.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
// Package boomphf implements a fast perfect hash function for massive key sets.
/*
https://arxiv.org/abs/1702.03154
*/
package boomphf
// H is hash function data. It is built by New and read by Query.
type H struct {
	// b holds one bit vector per construction level, in level order. A set
	// bit marks a slot that exactly one key claimed at that level.
	b []bitvector
	// ranks holds, for each level, the running count of set bits sampled at
	// every 8th word of that level's bit vector. The count is cumulative
	// across all preceding levels (see computeRanks).
	ranks [][]uint64
}
// Gamma is a good default value for the gamma argument of New. New sizes each
// level's bit vector at roughly gamma bits per remaining key, so the value
// controls space vs. construction speed.
const Gamma = 2
// New constructs a perfect hash function for the keys. The gamma value
// controls the space used.
//
// Construction proceeds level by level. At each level every remaining key is
// hashed into a bit vector of gamma*len(remaining keys) bits, rounded up to a
// multiple of 64. Keys that land alone in a slot are recorded in that level's
// bit vector; keys that share a slot are carried over to the next level, which
// applies a different rotation to the upper half of the hash.
//
// Keys are expected to be distinct: equal keys map to the same slot at every
// level and so would never be placed.
func New(gamma float64, keys []uint64) *H {
	h := new(H)

	// bitsFor returns the bit-vector size for n keys: gamma*n rounded up to a
	// whole number of 64-bit words.
	bitsFor := func(n int) uint32 {
		bits := uint32(gamma * float64(n))
		return (bits + 63) &^ 63
	}

	size := bitsFor(len(keys))

	// Scratch vectors are allocated once for the first level and reused; later
	// levels only ever see a subset of the first level's keys.
	seen := newbv(size)
	contested := newbv(size)

	var retry []uint64
	for level := uint32(0); len(keys) > 0; level++ {
		// Pass 1: mark every slot that is hit by more than one key.
		for _, key := range keys {
			x := xorshiftMult64(key)
			slot := (uint32(x) ^ rotl(uint32(x>>32), level)) % size
			switch {
			case contested.get(slot) == 1:
				// Already known to be shared; nothing more to record.
			case seen.get(slot) == 1:
				contested.set(slot)
			default:
				seen.set(slot)
			}
		}

		// Pass 2: keys in uncontested slots are placed at this level, the
		// rest are queued for the next one.
		placed := newbv(size)
		for _, key := range keys {
			x := xorshiftMult64(key)
			slot := (uint32(x) ^ rotl(uint32(x>>32), level)) % size
			if contested.get(slot) == 1 {
				retry = append(retry, key)
			} else {
				placed.set(slot)
			}
		}
		h.b = append(h.b, placed)

		// The retry list becomes the next level's keys, and retry is truncated
		// onto the same backing array. This is safe because pass 2 reads each
		// key before it can append over that position.
		keys, retry = retry, retry[:0]

		size = bitsFor(len(keys))
		seen.reset()
		contested.reset()
	}

	h.computeRanks()
	return h
}
// computeRanks appends one rank table per level to h.ranks. Each table stores,
// for every 8th word of the level's bit vector, the number of set bits that
// precede that word. The count runs on across levels, so it includes all bits
// of earlier levels.
func (h *H) computeRanks() {
	running := uint64(0)
	for level := range h.b {
		words := h.b[level]
		samples := make([]uint64, 0, 1+len(words)/8)
		for w := 0; w < len(words); w++ {
			// Sample the running total at the start of each 8-word group.
			if w&7 == 0 {
				samples = append(samples, running)
			}
			running += popcnt(words[w])
		}
		h.ranks = append(h.ranks, samples)
	}
}
// Query returns the index of the key.
//
// The index is 1-based: it is one more than the number of set bits that precede
// the key's slot, counted across all levels. Query returns 0 when the key's
// slot is unset at every level. Keys are not stored, so a key that was not
// passed to New may still hit a set bit and receive a non-zero index.
func (h *H) Query(k uint64) uint64 {
	x := xorshiftMult64(k)
	lo, hi := uint32(x), uint32(x>>32)

	for level := range h.b {
		bits := h.b[level]
		slot := (lo ^ rotl(hi, uint32(level))) % uint32(len(bits)*64)
		if bits.get(slot) == 0 {
			continue
		}

		word := slot / 64
		// Start from the sampled count for this 8-word group...
		count := h.ranks[level][slot/512]
		// ...add the whole words between the sample point and the slot's word...
		for _, w := range bits[word&^7 : word] {
			count += popcnt(w)
		}
		// ...and finally the bits below the slot within its own word. When the
		// slot is bit 0 the shift is 64, which yields 0 in Go.
		count += popcnt(bits[word] << (64 - slot%64))
		return count + 1
	}
	return 0
}
// Size returns the size in bytes of the bit vectors and rank tables held by h,
// counting 8 bytes per 64-bit word.
func (h *H) Size() int {
	words := 0
	for i := range h.b {
		words += len(h.b[i])
	}
	for i := range h.ranks {
		words += len(h.ranks[i])
	}
	return words * 8
}
// rotl rotates the 32-bit value v left by r bits.
//
// The rotation count is taken modulo 32. Without that reduction, any r above
// 32 makes 32-r wrap around in uint32, both shifts produce 0, and the function
// returns 0 rather than a rotation of v.
func rotl(v uint32, r uint32) uint32 {
	r &= 31
	// For r == 0 the right shift is by 32, which yields 0 in Go, so the result
	// is v itself.
	return (v << r) | (v >> (32 - r))
}
// xorshiftMult64 scrambles x with three xorshift steps (right 12, left 25,
// right 27) followed by a multiplication. It is the 64-bit xorshift multiply
// rng from http://vigna.di.unimi.it/ftp/papers/xorshift.pdf, used here as a
// hash of the key.
func xorshiftMult64(x uint64) uint64 {
	const multiplier = 2685821657736338717

	x = x ^ (x >> 12) // a
	x = x ^ (x << 25) // b
	x = x ^ (x >> 27) // c
	return multiplier * x
}
// bitvector is a sequence of bits packed into 64-bit words: bit i lives in
// word i/64 at bit position i%64.
type bitvector []uint64

// newbv allocates a zeroed bitvector with enough words to hold size bits.
func newbv(size uint32) bitvector {
	words := uint(size+63) / 64
	return make(bitvector, words)
}

// get returns the value (0 or 1) of bit 'bit' in the bitvector b.
func (b bitvector) get(bit uint32) uint {
	return uint((b[bit/64] >> (bit % 64)) & 1)
}

// set turns on bit 'bit' in the bitvector b.
func (b bitvector) set(bit uint32) {
	word, mask := bit/64, uint64(1)<<(bit%64)
	b[word] |= mask
}

// reset clears every bit of b, keeping its length.
func (b bitvector) reset() {
	for i := 0; i < len(b); i++ {
		b[i] = 0
	}
}
// popcnt returns the number of set bits in x.
//
// It uses the parallel bit-count technique described at
// http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel:
// counts are accumulated in 2-bit, then 4-bit, then 8-bit fields, and a final
// multiplication sums the eight byte counts into the top byte.
func popcnt(x uint64) uint64 {
	const (
		pairs   = 0x5555555555555555
		nibbles = 0x3333333333333333
		bytes   = 0x0f0f0f0f0f0f0f0f
		ones    = 0x0101010101010101
	)
	x = x - ((x >> 1) & pairs)
	x = (x & nibbles) + ((x >> 2) & nibbles)
	x = (x + (x >> 4)) & bytes
	return (x * ones) >> 56
}