Skip to content

Commit a871d04

Browse files
committed
so/maps: extract memory swapping functions
1 parent 8594e6a commit a871d04

6 files changed

Lines changed: 101 additions & 62 deletions

File tree

bench/README.md

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -45,27 +45,27 @@ Apple M1 • Go 1.26.1 • [details](./bytes/README.md#buffer)
4545

4646
### Int keys
4747

48-
So lookups are 3.4x faster than Go, while modifications are 1.6x slower.
48+
For heap-allocated maps, So is faster than Go across all operations: roughly 1.1–1.6x for modifications and 3.5x for lookups.
4949

50-
So's built-in map is faster than Go's, but I wouldn't call it the winner because it's only useful in certain situations — it's fixed size and stack-allocated.
50+
So's built-in map is even faster for pre-sized sets (though its lookups are slower than the heap-allocated So maps), but it's only useful in certain situations — it's fixed size and stack-allocated.
5151

5252
| Benchmark | Go | So (mimalloc) | So (arena) | So (built-in) | Winner |
5353
| --------- | ------: | ------------: | ---------: | ------------: | ------------- |
54-
| Set | 35580ns | 56696ns | 57661ns | n/a | Go - 0.6x |
55-
| Set (pre) | 9608ns | 8821ns | 8767ns | 3242ns | ~same |
56-
| Get | 5573ns | 1638ns | 1583ns | 2733ns | **So** - 3.4x |
57-
| Delete | 23892ns | 38556ns | 38821ns | n/a | Go - 0.6x |
54+
| Set | 35645ns | 26333ns | 25515ns | n/a | **So** - 1.4x |
55+
| Set (pre) | 9676ns | 8813ns | 8704ns | 3109ns | **So** - 1.1x |
56+
| Get | 5594ns | 1581ns | 1537ns | 2577ns | **So** - 3.5x |
57+
| Delete | 23968ns | 14889ns | 14859ns | n/a | **So** - 1.6x |
5858

5959
### String keys
6060

61-
So lookups are on par with Go, while modifications are 1.5x slower.
61+
So modifications are ~1.4x faster than Go, while lookups are slightly slower.
6262

6363
| Benchmark | Go | So (mimalloc) | So (arena) | So (built-in) | Winner |
6464
| --------- | ------: | ------------: | ---------: | ------------: | ------------- |
65-
| Set | 48677ns | 71879ns | 63500ns | n/a | Go - 0.7x |
66-
| Set (pre) | 14620ns | 12313ns | 12115ns | 6970ns | **So** - 1.2x |
67-
| Get | 8990ns | 10206ns | 10083ns | 10735ns | Go - 0.9x |
68-
| Delete | 33878ns | 50111ns | 49507ns | n/a | Go - 0.7x |
65+
| Set | 47805ns | 31055ns | 30749ns | n/a | **So** - 1.5x |
66+
| Set (pre) | 14699ns | 12101ns | 12233ns | 6585ns | **So** - 1.2x |
67+
| Get | 9216ns | 10170ns | 9907ns | 10531ns | Go - 0.9x |
68+
| Delete | 33819ns | 24227ns | 24392ns | n/a | **So** - 1.4x |
6969

7070
Apple M1 • Go 1.26.1 • [details](./maps/README.md)
7171

bench/maps/README.md

Lines changed: 32 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -16,50 +16,51 @@ goarch: arm64
1616
pkg: solod.dev/bench/maps
1717
cpu: Apple M1
1818
19-
Benchmark_IntSet-8 31677 35580 ns/op 74264 B/op 20 allocs/op
20-
Benchmark_IntPre-8 124159 9608 ns/op 36944 B/op 5 allocs/op
21-
Benchmark_IntGet-8 218179 5573 ns/op 0 B/op 0 allocs/op
22-
Benchmark_IntDel-8 50260 23892 ns/op 36944 B/op 5 allocs/op
23-
24-
Benchmark_StrSet-8 24082 48677 ns/op 108760 B/op 20 allocs/op
25-
Benchmark_StrPre-8 80437 14620 ns/op 54608 B/op 5 allocs/op
26-
Benchmark_StrGet-8 134481 8990 ns/op 0 B/op 0 allocs/op
27-
Benchmark_StrDel-8 34094 33878 ns/op 54608 B/op 5 allocs/op
19+
Benchmark_IntSet-8 31490 35645 ns/op 74264 B/op 20 allocs/op
20+
Benchmark_IntPre-8 123108 9676 ns/op 36944 B/op 5 allocs/op
21+
Benchmark_IntGet-8 216240 5594 ns/op 0 B/op 0 allocs/op
22+
Benchmark_IntDel-8 49941 23968 ns/op 36944 B/op 5 allocs/op
23+
24+
Benchmark_StrSet-8 25107 47805 ns/op 108760 B/op 20 allocs/op
25+
Benchmark_StrPre-8 81638 14699 ns/op 54608 B/op 5 allocs/op
26+
Benchmark_StrGet-8 131050 9216 ns/op 0 B/op 0 allocs/op
27+
Benchmark_StrDel-8 35484 33819 ns/op 54608 B/op 5 allocs/op
2828
```
2929

3030
So (mimalloc):
3131

3232
```text
33-
Benchmark_IntSet 20305 56696 ns/op 98112 B/op 27 allocs/op
34-
Benchmark_IntPre 137866 8821 ns/op 49152 B/op 3 allocs/op
35-
Benchmark_IntGet 734978 1638 ns/op 0 B/op 0 allocs/op
36-
Benchmark_IntDel 30958 38556 ns/op 73728 B/op 6 allocs/op
37-
38-
Benchmark_StrSet 18986 71879 ns/op 130816 B/op 27 allocs/op
39-
Benchmark_StrPre 97383 12313 ns/op 65536 B/op 3 allocs/op
40-
Benchmark_StrGet 117218 10206 ns/op 0 B/op 0 allocs/op
41-
Benchmark_StrDel 23670 50111 ns/op 98304 B/op 6 allocs/op
33+
Benchmark_IntSet 41629 26333 ns/op 65472 B/op 15 allocs/op
34+
Benchmark_IntPre 137805 8813 ns/op 49152 B/op 3 allocs/op
35+
Benchmark_IntGet 780385 1581 ns/op 0 B/op 0 allocs/op
36+
Benchmark_IntDel 79515 14889 ns/op 49152 B/op 3 allocs/op
37+
38+
Benchmark_StrSet 38630 31055 ns/op 87296 B/op 15 allocs/op
39+
Benchmark_StrPre 99391 12101 ns/op 65536 B/op 3 allocs/op
40+
Benchmark_StrGet 117486 10170 ns/op 0 B/op 0 allocs/op
41+
Benchmark_StrDel 49550 24227 ns/op 65536 B/op 3 allocs/op
4242
```
4343

4444
So (arena):
4545

4646
```text
47-
Benchmark_IntSet 21157 57661 ns/op 98112 B/op 27 allocs/op
48-
Benchmark_IntPre 137257 8767 ns/op 49152 B/op 3 allocs/op
49-
Benchmark_IntGet 752787 1583 ns/op 0 B/op 0 allocs/op
50-
Benchmark_IntDel 30339 38821 ns/op 73728 B/op 6 allocs/op
51-
52-
Benchmark_StrSet 18884 63500 ns/op 130816 B/op 27 allocs/op
53-
Benchmark_StrPre 99547 12115 ns/op 65536 B/op 3 allocs/op
54-
Benchmark_StrGet 119041 10083 ns/op 0 B/op 0 allocs/op
55-
Benchmark_StrDel 24212 49507 ns/op 98304 B/op 6 allocs/op
47+
Benchmark_IntSet 47002 25515 ns/op 65472 B/op 15 allocs/op
48+
Benchmark_IntPre 137667 8704 ns/op 49152 B/op 3 allocs/op
49+
Benchmark_IntGet 780284 1537 ns/op 0 B/op 0 allocs/op
50+
Benchmark_IntDel 80742 14859 ns/op 49152 B/op 3 allocs/op
51+
52+
Benchmark_StrSet 39026 30749 ns/op 87296 B/op 15 allocs/op
53+
Benchmark_StrPre 98018 12233 ns/op 65536 B/op 3 allocs/op
54+
Benchmark_StrGet 120942 9907 ns/op 0 B/op 0 allocs/op
55+
Benchmark_StrDel 49689 24392 ns/op 65536 B/op 3 allocs/op
5656
```
5757

5858
So (built-in map):
5959

6060
```text
61-
Benchmark_IntSet 374192 3242 ns/op
62-
Benchmark_IntGet 446527 2733 ns/op
63-
Benchmark_StrSet 170769 6970 ns/op
64-
Benchmark_StrGet 114100 10735 ns/op
61+
Benchmark_IntSet 390433 3109 ns/op
62+
Benchmark_IntGet 464288 2577 ns/op
63+
64+
Benchmark_StrSet 180883 6585 ns/op
65+
Benchmark_StrGet 112964 10531 ns/op
6566
```

so/maps/bytemap.go

Lines changed: 4 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,7 @@ func rehash(dst, src *ByteMap) {
111111
n := len(src.hdib)
112112
for i := range n {
113113
hdI := c.PtrAt(hdib, i)
114+
c.Assert(hdI != nil, "maps: nil hdib pointer") // for gcc analyzer
114115
if *hdI&0xFFFF > 0 {
115116
insert(dst, int(*hdI>>16),
116117
c.PtrAdd(keys, i*ksize),
@@ -133,8 +134,6 @@ func insert(m *ByteMap, h int, key any, val any) {
133134
mem.Copy(ekey, key, ksize)
134135
mem.Copy(eval, val, vsize)
135136
i := h & m.mask
136-
tmpk := c.Alloca[byte](ksize)
137-
tmpv := c.Alloca[byte](vsize)
138137
for {
139138
hdI := c.PtrAt(hdib, i)
140139
if *hdI&0xFFFF == 0 {
@@ -145,15 +144,9 @@ func insert(m *ByteMap, h int, key any, val any) {
145144
return
146145
}
147146
if *hdI&0xFFFF < ehdib&0xFFFF {
148-
te := ehdib
149-
ehdib = *hdI
150-
*hdI = te
151-
mem.Copy(tmpk, ekey, ksize)
152-
mem.Copy(ekey, c.PtrAdd(keys, i*ksize), ksize)
153-
mem.Copy(c.PtrAdd(keys, i*ksize), tmpk, ksize)
154-
mem.Copy(tmpv, eval, vsize)
155-
mem.Copy(eval, c.PtrAdd(vals, i*vsize), vsize)
156-
mem.Copy(c.PtrAdd(vals, i*vsize), tmpv, vsize)
147+
mem.Swap(hdI, &ehdib)
148+
mem.SwapByte(c.PtrAdd(keys, i*ksize), ekey, ksize)
149+
mem.SwapByte(c.PtrAdd(vals, i*vsize), eval, vsize)
157150
}
158151
i = (i + 1) & m.mask
159152
ehdib++

so/maps/maps.go

Lines changed: 3 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -121,15 +121,9 @@ func (m *Map[K, V]) Set(key K, value V) {
121121
break
122122
}
123123
if *_hdi&0xFFFF < _ehdib&0xFFFF {
124-
_tmphdib := _ehdib
125-
_ehdib = *_hdi
126-
*_hdi = _tmphdib
127-
_tmpk := _ekey
128-
_ekey = *c.PtrAt(_keys, _i)
129-
*c.PtrAt(_keys, _i) = _tmpk
130-
_tmpv := _eval
131-
_eval = *c.PtrAt(_vals, _i)
132-
*c.PtrAt(_vals, _i) = _tmpv
124+
mem.Swap(_hdi, &_ehdib)
125+
mem.Swap(c.PtrAt(_keys, _i), &_ekey)
126+
mem.Swap(c.PtrAt(_vals, _i), &_eval)
133127
}
134128
_i = (_i + 1) & _m.mask
135129
_ehdib++

so/mem/mem.go

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,9 @@ import (
99
"solod.dev/so/math"
1010
)
1111

12+
//so:embed mem.h
13+
var mem_h string
14+
1215
// ErrOutOfMemory is returned when a memory allocation
1316
// fails due to insufficient memory.
1417
var ErrOutOfMemory = errors.New("out of memory")
@@ -218,6 +221,33 @@ func Move(dst any, src any, n int) any {
218221
return memmove(dst, src, uintptr(n))
219222
}
220223

224+
// Swap swaps the values pointed to by a and b.
225+
// Panics if either a or b is nil.
226+
//
227+
//so:inline
228+
func Swap[T any](a *T, b *T) {
229+
c.Assert(a != nil, "mem: nil pointer")
230+
c.Assert(b != nil, "mem: nil pointer")
231+
_tmp := *a
232+
*a = *b
233+
*b = _tmp
234+
}
235+
236+
// SwapByte swaps n bytes between a and b.
237+
// Panics if either a or b is nil.
238+
//
239+
// SwapByte temporarily allocates a buffer of size n
240+
// on the stack, so it's not suitable for large n.
241+
//
242+
//so:extern
243+
func SwapByte(a any, b any, n int) {
244+
// Has to be implemented as extern because it uses VLA.
245+
tmp := make([]byte, n)
246+
memcpy(tmp, b, uintptr(n))
247+
memcpy(b, a, uintptr(n))
248+
memcpy(a, tmp, uintptr(n))
249+
}
250+
221251
// void* memset(void *dest, int ch, size_t count);
222252
//
223253
//so:extern

so/mem/mem.h

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
#include <assert.h>
2+
#include <string.h>
3+
#include "so/builtin/builtin.h"
4+
5+
// SwapByte swaps n bytes between a and b.
6+
// Panics if either a or b is nil.
7+
//
8+
// SwapByte temporarily allocates a buffer of size n
9+
// on the stack, so it's not suitable for large n.
10+
static inline void mem_SwapByte(void* a, void* b, so_int n) {
11+
assert(a != NULL && "mem: nil pointer");
12+
assert(b != NULL && "mem: nil pointer");
13+
assert(n >= 0 && "mem: negative size");
14+
if (n == 0) return;
15+
16+
size_t size = (size_t)n;
17+
char tmp[size];
18+
memcpy(tmp, a, size);
19+
memcpy(a, b, size);
20+
memcpy(b, tmp, size);
21+
}

0 commit comments

Comments
 (0)