package consumer

import (
	"context"
	"fmt"
	"sync"
	"time"

	"github.com/aws/aws-sdk-go-v2/service/kinesis/types"
)

// NewAllGroup returns an initialized AllGroup for consuming
// all shards on a stream.
func NewAllGroup(ksis kinesisClient, store Store, streamName string, logger Logger) *AllGroup {
	return &AllGroup{
		ksis:         ksis,
		shards:       make(map[string]types.Shard),
		shardsClosed: make(map[string]chan struct{}),
		streamName:   streamName,
		logger:       logger,
		Store:        store,
	}
}
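
// A minimal usage sketch, for illustration only: it assumes client, store,
// and logger values satisfying this package's kinesisClient, Store, and
// Logger interfaces, plus a ctx supplied by the caller; "my-stream" is a
// placeholder name. Note that the group never closes shardC, so the receive
// loop below runs until the caller stops it.
//
//	group := NewAllGroup(client, store, "my-stream", logger)
//	shardC := make(chan types.Shard)
//	go func() {
//		if err := group.Start(ctx, shardC); err != nil {
//			logger.Log("[GROUP] error:", err)
//		}
//	}()
//	for shard := range shardC {
//		logger.Log("[GROUP] received shard:", *shard.ShardId)
//	}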

// AllGroup is used to consume all shards from a single consumer. It
// caches a local list of the shards we are already processing
// and routinely polls the stream looking for new shards to process.
type AllGroup struct {
	ksis       kinesisClient
	streamName string
	logger     Logger
	Store

	shardMu      sync.Mutex
	shards       map[string]types.Shard
	shardsClosed map[string]chan struct{}
}

// Start is a blocking operation which will loop and attempt to find new
// shards on a regular cadence.
func (g *AllGroup) Start(ctx context.Context, shardC chan types.Shard) error {
	// Note: while a ticker is a rather naive approach to this problem,
	// it actually simplifies a few things: if we miss a new shard
	// while AWS is resharding, we'll pick it up at most 30 seconds later.
	// It might be worth refactoring this flow to allow the consumer
	// to notify the broker when a shard is closed. However, shards don't
	// necessarily close at the same time, so we could potentially get a
	// thundering herd of notifications from the consumer.
	ticker := time.NewTicker(30 * time.Second)
	defer ticker.Stop()

	for {
		if err := g.findNewShards(ctx, shardC); err != nil {
			return err
		}

		select {
		case <-ctx.Done():
			return nil
		case <-ticker.C:
		}
	}
}
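
// CloseShard marks the given shard as fully processed by closing its close
// channel, which unblocks any child shards waiting on it in findNewShards.
// It returns an error if the shard ID is unknown.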
func (g *AllGroup) CloseShard(_ context.Context, shardID string) error {
	g.shardMu.Lock()
	defer g.shardMu.Unlock()

	c, ok := g.shardsClosed[shardID]
	if !ok {
		return fmt.Errorf("closing unknown shard ID %q", shardID)
	}

	// Close channel and remove from map to prevent double-close.
	delete(g.shardsClosed, shardID)
	close(c)
	return nil
}
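
// Flush flushes the underlying Store when it implements FlushableStore;
// otherwise it is a no-op.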
func (g *AllGroup) Flush() error {
	flushable, ok := g.Store.(FlushableStore)
	if !ok {
		return nil
	}
	return flushable.Flush()
}
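
// waitForCloseChannel blocks until the shard's close channel is closed or the
// context is cancelled. It reports whether the shard can be treated as fully
// processed; a nil channel is treated as already processed.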
func waitForCloseChannel(ctx context.Context, c <-chan struct{}) bool {
	if c == nil {
		// No channel means we haven't seen this shard in listShards, so it
		// probably fell off the TRIM_HORIZON, and we can assume it's fully processed.
		return true
	}
	select {
	case <-ctx.Done():
		return false
	case <-c:
		// The shard has been fully processed and its channel closed by the
		// consumer (CloseShard has been called).
		return true
	}
}

// findNewShards pulls the list of shards from the Kinesis API
// and uses a local cache to determine if we are already processing
// a particular shard.
func (g *AllGroup) findNewShards(ctx context.Context, shardC chan types.Shard) error {
	// Capture parent channels while holding the lock to avoid race conditions.
	// We must capture all references to g.shardsClosed before releasing the lock,
	// since concurrent calls to findNewShards() or CloseShard() may modify the map.
	type shardWithParents struct {
		shard          types.Shard
		parent         <-chan struct{}
		adjacentParent <-chan struct{}
	}

	shardsToProcess, err := func() ([]shardWithParents, error) {
		g.shardMu.Lock()
		defer g.shardMu.Unlock()

		g.logger.Log("[GROUP]", "fetching shards")

		shards, err := listShards(ctx, g.ksis, g.streamName)
		if err != nil {
			g.logger.Log("[GROUP] error:", err)
			return nil, err
		}

		completedAncestors, err := g.inferCompletedAncestors(shards)
		if err != nil {
			g.logger.Log("[GROUP] error inferring completed ancestors:", err)
			return nil, err
		}

		// We do two `for` loops, since we have to set up all the `shardsClosed`
		// channels before we start using any of them. It's highly probable
		// that Kinesis provides us the shards in dependency order (parents
		// before children), but it doesn't appear to be a guarantee.
		newShards := make(map[string]types.Shard)
		for _, shard := range shards {
			if _, ok := g.shards[*shard.ShardId]; ok {
				continue
			}
			g.shards[*shard.ShardId] = shard
			if _, ok := completedAncestors[*shard.ShardId]; ok {
				// A checkpoint on a descendant implies this shard was already fully
				// consumed before restart, so treat it as closed and do not re-emit it.
				continue
			}
			g.shardsClosed[*shard.ShardId] = make(chan struct{})
			newShards[*shard.ShardId] = shard
		}

		result := make([]shardWithParents, 0, len(newShards))
		// Only new shards need to be checked for parent dependencies.
		for _, shard := range newShards {
			var parent, adjacentParent <-chan struct{}
			if shard.ParentShardId != nil {
				parent = g.shardsClosed[*shard.ParentShardId]
			}
			if shard.AdjacentParentShardId != nil {
				adjacentParent = g.shardsClosed[*shard.AdjacentParentShardId]
			}
			result = append(result, shardWithParents{
				shard:          shard,
				parent:         parent,
				adjacentParent: adjacentParent,
			})
		}
		return result, nil
	}()
	if err != nil {
		return err
	}

	// Now spawn goroutines after releasing the lock, using the captured channel references.
	for _, sp := range shardsToProcess {
		sp := sp // shadow the loop variable for goroutine capture
		go func() {
			// Asynchronously wait for all parents of this shard to be processed
			// before providing it out to our client. Kinesis guarantees that a
			// given partition key's data will be provided to clients in order,
			// but when splits or joins happen, we need to process all parents prior
			// to processing children or that ordering guarantee is not maintained.
			if waitForCloseChannel(ctx, sp.parent) && waitForCloseChannel(ctx, sp.adjacentParent) {
				select {
				case <-ctx.Done():
				case shardC <- sp.shard:
				}
			}
		}()
	}
	return nil
}
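
// inferCompletedAncestors returns the set of shard IDs that are ancestors of
// any shard with a checkpoint in the Store. A checkpoint on a descendant
// implies its ancestors were already fully consumed (e.g. before a restart),
// so they can be treated as closed rather than re-emitted.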
func (g *AllGroup) inferCompletedAncestors(shards []types.Shard) (map[string]struct{}, error) {
	byShardID := make(map[string]types.Shard, len(shards))
	for _, shard := range shards {
		if shard.ShardId == nil {
			continue
		}
		byShardID[*shard.ShardId] = shard
	}

	completed := make(map[string]struct{})

	var markAncestors func(types.Shard)
	markAncestors = func(shard types.Shard) {
		if shard.ParentShardId != nil {
			parentShardID := *shard.ParentShardId
			if _, ok := completed[parentShardID]; !ok {
				completed[parentShardID] = struct{}{}
				if parent, exists := byShardID[parentShardID]; exists {
					markAncestors(parent)
				}
			}
		}
		if shard.AdjacentParentShardId != nil {
			parentShardID := *shard.AdjacentParentShardId
			if _, ok := completed[parentShardID]; !ok {
				completed[parentShardID] = struct{}{}
				if parent, exists := byShardID[parentShardID]; exists {
					markAncestors(parent)
				}
			}
		}
	}

	for _, shard := range shards {
		if shard.ShardId == nil {
			continue
		}
		checkpoint, err := g.Store.GetCheckpoint(g.streamName, *shard.ShardId)
		if err != nil {
			return nil, err
		}
		if checkpoint == "" {
			continue
		}
		markAncestors(shard)
	}
	return completed, nil
}