Skip to content

Commit 849ebab

Browse files
author
Ed Holland
committed
Implement DDB storage layer
1 parent 7e5b21c commit 849ebab

2 files changed

Lines changed: 119 additions & 4 deletions

File tree

datasketch/lsh.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,14 @@ def _integration(f, a, b):
1818
# For when no scipy installed
1919
integrate = _integration
2020

21-
21+
def _ensure_bytestring(bytes_or_str):
22+
if isinstance(bytes_or_str, str):
23+
return bytes_or_str.encode('utf-8')
24+
elif isinstance(bytes_or_str, bytes):
25+
return bytes_or_str
26+
else:
27+
raise ValueError("basename must be either bytes or string type")
28+
2229
def _false_positive_probability(threshold, b, r):
2330
_probability = lambda s : 1 - (1 - s**float(r))**float(b)
2431
a, err = integrate(_probability, 0.0, threshold)
@@ -115,7 +122,7 @@ def __init__(self, threshold=0.9, num_perm=128, weights=(0.5, 0.5),
115122

116123
self.prepickle = storage_config['type'] == 'redis' if prepickle is None else prepickle
117124

118-
basename = storage_config.get('basename', _random_name(11))
125+
basename = _ensure_bytestring(storage_config.get('basename', _random_name(11)))
119126
self.hashtables = [
120127
unordered_storage(storage_config, name=b''.join([basename, b'_bucket_', bytes([i])]))
121128
for i in range(self.b)]

datasketch/storage.py

Lines changed: 110 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,16 @@
99
except ImportError:
1010
redis = None
1111

12+
try:
13+
from pynamodb.models import Model
14+
from pynamodb.models import MetaModel
15+
from pynamodb.connection.util import pythonic
16+
from pynamodb.attributes import UnicodeAttribute, BinaryAttribute
17+
import ulid
18+
ddb = True
19+
except ImportError:
20+
ddb = None
21+
1222

1323
def ordered_storage(config, name=None):
1424
'''Return ordered storage system based on the specified config.
@@ -44,6 +54,8 @@ def ordered_storage(config, name=None):
4454
return DictListStorage(config)
4555
if tp == 'redis':
4656
return RedisListStorage(config, name=name)
57+
if tp == 'ddb':
58+
return DDBListStorage(config, name=name)
4759

4860

4961
def unordered_storage(config, name=None):
@@ -79,6 +91,8 @@ def unordered_storage(config, name=None):
7991
return DictSetStorage(config)
8092
if tp == 'redis':
8193
return RedisSetStorage(config, name=name)
94+
if tp == 'ddb':
95+
return DDBSetStorage(config, name=name)
8296

8397

8498
class Storage(ABC):
@@ -178,7 +192,7 @@ def get(self, key):
178192
def remove(self, *keys):
    '''Delete every given key (and whatever it maps to) from ``self._dict``.

    A missing key raises ``KeyError``; keys listed before it have already
    been deleted at that point.
    '''
    table = self._dict
    for stale_key in keys:
        del table[stale_key]
181-
195+
182196
def remove_val(self, key, val):
    '''Remove ``val`` from the container stored under ``key`` in ``self._dict``.'''
    bucket = self._dict[key]
    bucket.remove(val)
184198

@@ -198,7 +212,6 @@ def itemcounts(self, **kwargs):
198212
def has_key(self, key):
    '''Return True when ``key`` is present in ``self._dict``, else False.'''
    is_present = key in self._dict
    return is_present
200214

201-
202215
class DictSetStorage(UnorderedStorage, DictListStorage):
203216
'''This is a wrapper class around ``defaultdict(set)`` enabling
204217
it to support an API consistent with `Storage`
@@ -434,3 +447,98 @@ def _random_name(length):
434447
# For use with Redis, we return bytes
435448
return ''.join(random.choice(string.ascii_lowercase)
436449
for _ in range(length)).encode('utf8')
450+
451+
if ddb is not None:
452+
class ModelMeta(MetaModel):
    '''Metaclass that injects a ``Meta`` options class into the model.

    The class statement using this metaclass must pass two keyword
    arguments: ``table_name`` (run through ``make_safe_table_name``) and
    ``config`` (a mapping providing ``region``, ``read_capacity`` and
    ``write_capacity``).  A missing entry raises ``KeyError``.
    '''

    def __new__(cls, name, bases, d, **kwargs):
        # Look the values up in the same order as they are listed here so
        # the first missing key is the one reported.
        options = {}
        options['table_name'] = make_safe_table_name(kwargs['table_name'])
        settings = kwargs['config']
        options['region'] = settings['region']
        options['read_capacity_units'] = settings['read_capacity']
        options['write_capacity_units'] = settings['write_capacity']
        d['Meta'] = type('Meta', (), options)
        return MetaModel.__new__(cls, name, bases, d)

    def __init__(self, *args, **kwargs):
        # The two custom class keywords were consumed in __new__; drop them
        # before delegating to the parent initializer.
        for consumed in ('table_name', 'config'):
            del kwargs[consumed]
        return super().__init__(*args, **kwargs)
465+
466+
class DDBSet(Model):
    '''Item layout used for set storage: one item per (set_name, value) pair.'''

    # Hash key, stored as binary.
    set_name = BinaryAttribute(hash_key=True)
    # Range key, stored as unicode text.
    value = UnicodeAttribute(range_key=True)
469+
470+
class DDBList(Model):
    '''Item layout used for list storage: one item per inserted value.'''

    # Hash key, stored as unicode text (the set layout uses binary here).
    set_name = UnicodeAttribute(hash_key=True)
    # Range key, stored as unicode text.
    insert_order = UnicodeAttribute(range_key=True)
    # Payload, stored as binary.
    value = BinaryAttribute()
474+
475+
class DDBListStorage(OrderedStorage):
    '''Ordered storage that keeps its values in a DynamoDB table via pynamodb.

    Every inserted value becomes one item whose hash key is the storage key
    and whose range key is a freshly generated ULID string (presumably so
    that query order follows insertion order -- confirm).
    '''

    def __init__(self, config, name=None):
        '''Bind a model class to the table for ``name``, creating the table
        first if it does not exist yet.

        Args:
            config: mapping handed to ``ModelMeta``.
            name: storage name; ``ModelMeta`` turns it into the table name.
        '''
        self.name = name

        class ListModel(DDBList, metaclass=ModelMeta, table_name=name, config=config):
            pass

        self.model_class = ListModel
        if not self.model_class.exists():
            # NOTE(review): capacity is hard-coded to 10 here even though
            # ``config`` carries read/write capacity -- confirm intent.
            self.model_class.create_table(
                wait=True, read_capacity_units=10, write_capacity_units=10)

    def keys(self):
        '''Return the distinct keys in the table, in first-seen scan order.'''
        # scan() yields one item per stored value, so a key holding several
        # values would otherwise be listed several times.
        return list(dict.fromkeys(
            item.set_name for item in self.model_class.scan()))

    def get(self, key):
        '''Return the values stored under ``key`` as a list (empty if none).'''
        # NOTE(review): insert() decodes the key to str before saving but
        # the key is passed through unchanged here -- confirm callers.
        return [item.value
                for item in self.model_class.query(key, scan_index_forward=True)]

    def remove(self, *keys):
        '''Delete every item stored under each of the given keys.'''
        for key in keys:
            for item in self.model_class.query(key):
                item.delete()

    def remove_val(self, key, val):
        '''Delete the items under ``key`` whose ``value`` equals ``val``.'''
        # NOTE(review): the condition is passed positionally; in pynamodb's
        # query() that slot appears to be the range-key condition, and
        # ``value`` is not the range key of DDBList -- confirm.
        for item in self.model_class.query(key, self.model_class.value == val):
            item.delete()

    def insert(self, key, *vals, **kwargs):
        '''Append ``vals`` under ``key`` in one batch write.

        ``key`` must support ``.decode('utf-8')``; ``kwargs`` is ignored.
        '''
        with self.model_class.batch_write() as batch:
            for val in vals:
                ulid_str = ulid.new().str
                batch.save(self.model_class(key.decode('utf-8'), ulid_str, value=val))

    def size(self):
        '''Return ``model_class.count()`` for the whole table.'''
        # NOTE(review): this looks like an item count rather than a count of
        # distinct keys -- confirm against the other storage backends.
        return self.model_class.count()

    def itemcounts(self, **kwargs):
        '''Return a dict mapping each key to the item count stored under it.

        ``kwargs`` is ignored.
        '''
        # Count once per distinct key.  The original looked up an undefined
        # name here and raised NameError.
        return {key: self.model_class.count(key) for key in self.keys()}

    def has_key(self, key):
        '''Return True when at least one value is stored under ``key``.'''
        # Truthiness works for the list returned here and for the set
        # returned by subclasses; comparing with ``[]`` never matched a set.
        return bool(self.get(key))
525+
526+
def make_safe_table_name(name):
    '''Turn a storage name into text usable as a table name.

    ``bytes`` (and ``bytearray``) names are rendered through their ``repr``
    with the ``b'``...``'`` wrapper sliced off, so non-printable bytes
    appear as escape sequences.  Other values are converted with ``str``.
    All backslashes are then removed.

    NOTE(review): dropping the backslash makes some distinct byte names map
    to the same text (``b'\\n'`` and ``b'n'`` both give ``'n'``) -- confirm
    this cannot make two storages share a table.

    Args:
        name: storage name, normally ``bytes``.

    Returns:
        str: the derived name.
    '''
    if isinstance(name, (bytes, bytearray)):
        # repr(bytes) looks like b'...': strip the two-character prefix and
        # the closing quote.  Only valid for a bytes repr, hence the check.
        text = repr(bytes(name))[2:-1]
    else:
        # A str name is already text; slicing it would cut real characters.
        text = str(name)
    return text.replace('\\', '')
528+
529+
class DDBSetStorage(UnorderedStorage, DDBListStorage):
    '''Unordered storage kept in a DynamoDB table via pynamodb.

    Each (key, value) pair is one item: the key is the hash key and the
    UTF-8 decoded value is the range key.  Methods not defined here come
    from ``DDBListStorage``.
    '''

    def __init__(self, config, name=None):
        '''Bind a model class to the table for ``name``, creating the table
        first if it does not exist yet.
        '''
        self.name = name

        class SetModel(DDBSet, metaclass=ModelMeta, table_name=name, config=config):
            pass

        self.model_class = model = SetModel
        if model.exists():
            return
        model.create_table(wait=True, read_capacity_units=10, write_capacity_units=10)

    def get(self, key):
        '''Return the values stored under ``key`` as a set (empty if none).'''
        return {item.value for item in self.model_class.query(key)}

    def insert(self, key, *vals, **kwargs):
        '''Add ``vals`` under ``key`` in one batch write.

        Each value must support ``.decode('utf-8')``; ``kwargs`` is ignored.
        '''
        with self.model_class.batch_write() as writer:
            for raw in vals:
                member = raw.decode('utf-8')
                writer.save(self.model_class(key, member))

0 commit comments

Comments
 (0)