Skip to content

Commit 5b02967

Browse files
authored
Merge pull request #3 from WISPR-lab/device_normalize
Device normalize, still need attributes merge and ungroup
2 parents be67f23 + 99e1eed commit 5b02967

42 files changed

Lines changed: 3191 additions & 1088 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.
Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
# HARD MERGE
2+
# merge devices if they have the same deterministic, unique identifier
3+
# this is done by the system, user cannot unmerge
4+
5+
import re
6+
from utils.redaction_utils import compare_redacted_vals, get_unredacted_val
7+
from collections import defaultdict
8+
import uuid
9+
import json
10+
11+
# Attribute keys treated as deterministic, unique device identifiers.
HARD_KEYS = {
    "device_id",  # anything in this family
    "device_serial_number",
    "device_imei",
    "device_meid",
}


def IS_HARD_KEY(k: str) -> bool:
    """Return True when attribute key ``k`` belongs to a hard-key family.

    A key qualifies when it equals, or starts with, any entry of
    ``HARD_KEYS`` (so ``"device_id_foo"`` counts as part of the
    ``"device_id"`` family).

    Args:
        k: Attribute key to classify.

    Returns:
        True if ``k`` equals or is prefixed by a ``HARD_KEYS`` entry.
    """
    # str.startswith accepts a tuple of prefixes and already covers exact
    # equality. The tuple is built per call so that later changes to
    # HARD_KEYS are still honoured.
    return k.startswith(tuple(HARD_KEYS))
19+
20+
21+
def hard_match(attrs_a: dict, attrs_b: dict) -> bool:
    """Report whether two attribute dicts agree on at least one hard key.

    Only keys that are present in both dicts and accepted by
    ``IS_HARD_KEY`` are considered; their values are compared with
    ``compare_redacted_vals``.

    Args:
        attrs_a: Attributes of the first record.
        attrs_b: Attributes of the second record.

    Returns:
        True if any shared hard key compares as matching, else False.
    """
    shared_hard_keys = [
        key for key in attrs_a if key in attrs_b and IS_HARD_KEY(key)
    ]
    return any(
        compare_redacted_vals(attrs_a[key], attrs_b[key])
        for key in shared_hard_keys
    )
28+
29+
30+
def _merge_attrs_pairwise(attrs_a: dict, attrs_b: dict) -> dict:
31+
merged = {}
32+
for k in set(attrs_a) | set(attrs_b):
33+
v_a, v_b = attrs_a.get(k), attrs_b.get(k)
34+
if k in HARD_KEYS:
35+
merged[k] = get_unredacted_val(v_a, v_b)[0] or v_a or v_b
36+
else:
37+
merged[k] = v_a if (v_a and v_a != '') else v_b
38+
# TODO add more granularity
39+
return merged
40+
41+
def merge_attrs(attrs_list: list[dict]) -> dict:
    """Fold a list of attribute dicts into one, left to right.

    The first dict is copied and every following dict is combined into
    the running result with ``_merge_attrs_pairwise``.

    Args:
        attrs_list: Attribute dicts in precedence order.

    Returns:
        The merged dict, or ``{}`` when ``attrs_list`` is empty.
    """
    if not attrs_list:
        return {}
    first, *remaining = attrs_list
    combined = first.copy()
    for current in remaining:
        combined = _merge_attrs_pairwise(combined, current)
    return combined
48+
49+
50+
def _find(parent: dict, x: str) -> str:
51+
while parent[x] != x:
52+
parent[x] = parent[parent[x]]
53+
x = parent[x]
54+
return x
55+
56+
57+
def hard_merge(records: list[dict]) -> list[dict]:
    """Group records that hard-match each other and emit one row per group.

    Every pair of records is compared with ``hard_match``; matching pairs
    are unioned, so matches are transitive. Each resulting group becomes
    one output row.

    Args:
        records: Dicts read through the keys ``"id"``, ``"attributes"``,
            ``"upload_id"`` and ``"file_id"``.

    Returns:
        One dict per group with the keys:

        * ``'id'``: a fresh ``uuid4`` string;
        * ``'upload_ids'`` / ``'file_ids'``: JSON arrays taken from the
          member records;
        * ``'auth_devices_initial_ids'``: JSON array of the sorted member
          ids;
        * ``'attributes'``: JSON object produced by ``merge_attrs``.

        The three JSON arrays are index-aligned: position ``i`` of each
        one refers to the same member record.
    """
    parent = {r.get("id"): r.get("id") for r in records}

    # Union every pair of records that share a hard identifier.
    for i, dct_a in enumerate(records):
        id_a, attrs_a = dct_a.get('id'), dct_a.get('attributes', {})
        for dct_b in records[i + 1:]:
            id_b, attrs_b = dct_b.get('id'), dct_b.get('attributes', {})
            # TODO have some kind of guardrail here to make sure not merging records
            # with obviously different attributes (i.e. different OS types)
            if hard_match(attrs_a, attrs_b):
                parent[_find(parent, id_a)] = _find(parent, id_b)

    children = defaultdict(list)
    for r in records:
        children[_find(parent, r.get("id"))].append(r.get("id"))
    # children[parent] = [list, of, child, ids]

    record_map = {r.get("id"): r for r in records}
    rows = []
    for parent_id, child_id_list in children.items():
        id_list = sorted({parent_id, *child_id_list})
        # Walk the sorted, de-duplicated ids (not an unordered set) so the
        # upload/file id arrays line up with auth_devices_initial_ids and
        # the attribute merge precedence is reproducible.
        child_records = [record_map[record_id] for record_id in id_list]
        attrs = merge_attrs([r.get("attributes", {}) for r in child_records])

        rows.append({
            'id': str(uuid.uuid4()),
            'upload_ids': json.dumps([r.get("upload_id") for r in child_records]),
            'file_ids': json.dumps([r.get("file_id") for r in child_records]),
            'auth_devices_initial_ids': json.dumps(id_list),
            'attributes': json.dumps(attrs),
        })

    return rows
Lines changed: 132 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,132 @@
1+
# SOFT MERGE
2+
# Merge devices on manufacturer + model match, accounting for specificity levels.
3+
# Model/manufacturer sourced from device_* or user_agent_device_* fields (picks most specific).
4+
# Specificity: 0=generic (brand only), 1=specific (has version/variant).
5+
# Rules: both specific→exact match; generic+specific→merge except Apple (UA masking); generic+generic→merge except Apple.
6+
7+
import re
8+
from utils.device_lookup import VARIANT_SUFFIXES
9+
from collections import defaultdict
10+
import uuid
11+
import json
12+
13+
14+
# Model names that are too generic to identify a model; looked up with the
# lower-cased name.
GENERIC = {
    '',
    'other',
    'unknown',
    'phone',
    'smartphone',
    'tablet',
    'android',
    'iphone',
    'ipad',
}

# Lower-cased manufacturers that are never merged through the generic-model
# fallback (user-agent masking).
UA_MASKING_BLACKLIST = {'apple'}
16+
17+
18+
def _specificity(name: str) -> int:
19+
if not name:
20+
return -1
21+
name = name.strip()
22+
if not name or name.lower() in GENERIC:
23+
return 0
24+
words = {w.lower().rstrip('.,') for w in name.split()}
25+
if words & VARIANT_SUFFIXES:
26+
return 1
27+
if any(c.isdigit() for c in name):
28+
return 1
29+
return 0
30+
31+
32+
def _best_value(values: list[str]) -> str:
33+
candidates = [v.strip() for v in values if v and v.strip()]
34+
if not candidates:
35+
return ''
36+
return max(candidates, key=_specificity)
37+
38+
39+
def _soft_match(attrs_a: dict, attrs_b: dict) -> bool:
    """Decide whether two attribute dicts describe the same device model.

    Model and manufacturer are each taken as the ``_best_value`` of the
    ``device_*`` field and its ``user_agent_device_*`` counterpart.

    Rules, applied in order:

    1. Any missing model or manufacturer -> no match.
    2. Manufacturers differ (case-insensitive) -> no match.
    3. Both models have specificity >= 1 -> match only when the models
       are equal (case-insensitive).
    4. Otherwise (at least one generic model) -> match, unless the
       manufacturer is in ``UA_MASKING_BLACKLIST``.
    """
    def pick(attrs: dict, primary: str, fallback: str) -> str:
        # Best of the direct field and its user-agent-derived counterpart.
        return _best_value([attrs.get(primary, ''), attrs.get(fallback, '')])

    model_a = pick(attrs_a, 'device_model_name', 'user_agent_device_model')
    model_b = pick(attrs_b, 'device_model_name', 'user_agent_device_model')
    mfr_a = pick(attrs_a, 'device_manufacturer', 'user_agent_device_manufacturer')
    mfr_b = pick(attrs_b, 'device_manufacturer', 'user_agent_device_manufacturer')

    if not (model_a and model_b and mfr_a and mfr_b):
        return False

    mfr_a_lower, mfr_b_lower = mfr_a.lower(), mfr_b.lower()
    if mfr_a_lower != mfr_b_lower:
        return False

    spec_a, spec_b = _specificity(model_a), _specificity(model_b)
    if spec_a >= 1 and spec_b >= 1:
        # Both specific: require the exact same model.
        return model_a.lower() == model_b.lower()

    # At least one side is generic: merge unless the manufacturer is blacklisted.
    blacklisted = (
        mfr_a_lower in UA_MASKING_BLACKLIST
        or mfr_b_lower in UA_MASKING_BLACKLIST
    )
    return not blacklisted
81+
82+
83+
def _merge_attrs_pairwise(attrs_a: dict, attrs_b: dict) -> dict:
84+
merged = {}
85+
for k in set(attrs_a) | set(attrs_b):
86+
v_a, v_b = attrs_a.get(k), attrs_b.get(k)
87+
merged[k] = v_a if (v_a and v_a != '') else v_b
88+
return merged
89+
90+
91+
def merge_attrs(attrs_list: list[dict]) -> dict:
    """Combine attribute dicts left to right into a single dict.

    Starts from a copy of the first dict and merges each later dict into
    the running result via ``_merge_attrs_pairwise``.

    Returns:
        The merged dict, or ``{}`` for an empty ``attrs_list``.
    """
    if not attrs_list:
        return {}
    head, *tail = attrs_list
    result = head.copy()
    for following in tail:
        result = _merge_attrs_pairwise(result, following)
    return result
98+
99+
100+
def _find(parent: dict, x: str) -> str:
101+
while parent[x] != x:
102+
parent[x] = parent[parent[x]]
103+
x = parent[x]
104+
return x
105+
106+
107+
108+
109+
def soft_merge(records: list[dict]) -> list[dict]:
    """Group records whose attributes soft-match and emit one row per group.

    Every pair of records is tested with ``_soft_match``; matching pairs
    are unioned, so matches are transitive.

    Args:
        records: Dicts read through the keys ``"id"`` and ``"attributes"``.

    Returns:
        One dict per group with the keys:

        * ``'id'``: a fresh ``uuid4`` string;
        * ``'auth_devices_ids'``: JSON array of the sorted member ids;
        * ``'initial_soft_merge'``: 1 when the group has more than one
          member, otherwise 0.
    """
    parent = {}
    for rec in records:
        rec_id = rec.get("id")
        parent[rec_id] = rec_id

    # Union every soft-matching pair.
    for pos, left in enumerate(records):
        left_id = left.get('id')
        left_attrs = left.get('attributes', {})
        for right in records[pos + 1:]:
            right_id = right.get('id')
            right_attrs = right.get('attributes', {})
            if not _soft_match(left_attrs, right_attrs):
                continue
            parent[_find(parent, left_id)] = _find(parent, right_id)

    # Bucket the record ids under their root id.
    groups = defaultdict(list)
    for rec in records:
        rec_id = rec.get("id")
        groups[_find(parent, rec_id)].append(rec_id)

    rows = []
    for root_id, member_ids in groups.items():
        id_list = sorted({root_id, *member_ids})
        was_merged = len(id_list) > 1
        rows.append({
            'id': str(uuid.uuid4()),
            'auth_devices_ids': json.dumps(id_list),
            'initial_soft_merge': 1 if was_merged else 0,
        })

    return rows

0 commit comments

Comments
 (0)