|
| 1 | +# SOFT MERGE |
| 2 | +# Merge devices on manufacturer + model match, accounting for specificity levels. |
| 3 | +# Model/manufacturer sourced from device_* or user_agent_device_* fields (picks most specific). |
| 4 | +# Specificity: 0=generic (brand only), 1=specific (has version/variant). |
| 5 | +# Rules: both specific→exact match; generic+specific→merge except Apple (UA masking); generic+generic→merge except Apple. |
| 6 | + |
| 7 | +import re |
| 8 | +from utils.device_lookup import VARIANT_SUFFIXES |
| 9 | +from collections import defaultdict |
| 10 | +import uuid |
| 11 | +import json |
| 12 | + |
| 13 | + |
# Model names treated as generic: _specificity() rates any of these as 0.
# The empty string is included so a blank name is never rated as specific.
GENERIC = {
    '',
    'android',
    'ipad',
    'iphone',
    'other',
    'phone',
    'smartphone',
    'tablet',
    'unknown',
}

# Lower-cased manufacturers for which _soft_match() refuses to merge unless
# both models are specific (user-agent masking, per the module header).
UA_MASKING_BLACKLIST = {
    'apple',
}
| 16 | + |
| 17 | + |
| 18 | +def _specificity(name: str) -> int: |
| 19 | + if not name: |
| 20 | + return -1 |
| 21 | + name = name.strip() |
| 22 | + if not name or name.lower() in GENERIC: |
| 23 | + return 0 |
| 24 | + words = {w.lower().rstrip('.,') for w in name.split()} |
| 25 | + if words & VARIANT_SUFFIXES: |
| 26 | + return 1 |
| 27 | + if any(c.isdigit() for c in name): |
| 28 | + return 1 |
| 29 | + return 0 |
| 30 | + |
| 31 | + |
| 32 | +def _best_value(values: list[str]) -> str: |
| 33 | + candidates = [v.strip() for v in values if v and v.strip()] |
| 34 | + if not candidates: |
| 35 | + return '' |
| 36 | + return max(candidates, key=_specificity) |
| 37 | + |
| 38 | + |
def _soft_match(attrs_a: dict, attrs_b: dict) -> bool:
    """Decide whether two attribute dicts describe the same device model.

    Model and manufacturer are each taken from the ``device_*`` field or its
    ``user_agent_device_*`` counterpart, whichever ``_best_value`` prefers.

    Returns:
        ``False`` when any of the four values is empty or the manufacturers
        differ (case-insensitively). When both models are specific, the
        result is whether the models are equal (case-insensitively).
        Otherwise at least one model is generic: the result is ``False``
        for manufacturers in ``UA_MASKING_BLACKLIST`` and ``True`` for all
        others.
    """
    def pick(attrs, *keys):
        # Most specific non-blank value among the given fields.
        return _best_value([attrs.get(key, '') for key in keys])

    model_keys = ('device_model_name', 'user_agent_device_model')
    mfr_keys = ('device_manufacturer', 'user_agent_device_manufacturer')

    model_a, model_b = pick(attrs_a, *model_keys), pick(attrs_b, *model_keys)
    mfr_a, mfr_b = pick(attrs_a, *mfr_keys), pick(attrs_b, *mfr_keys)

    if not (model_a and model_b and mfr_a and mfr_b):
        return False

    brand_a, brand_b = mfr_a.lower(), mfr_b.lower()
    if brand_a != brand_b:
        return False

    if _specificity(model_a) >= 1 and _specificity(model_b) >= 1:
        return model_a.lower() == model_b.lower()

    # The brands are equal at this point, so checking one against the
    # blacklist is the same as checking either.
    return brand_a not in UA_MASKING_BLACKLIST
| 81 | + |
| 82 | + |
| 83 | +def _merge_attrs_pairwise(attrs_a: dict, attrs_b: dict) -> dict: |
| 84 | + merged = {} |
| 85 | + for k in set(attrs_a) | set(attrs_b): |
| 86 | + v_a, v_b = attrs_a.get(k), attrs_b.get(k) |
| 87 | + merged[k] = v_a if (v_a and v_a != '') else v_b |
| 88 | + return merged |
| 89 | + |
| 90 | + |
def merge_attrs(attrs_list: list[dict]) -> dict:
    """Fold a list of attribute dicts into one, left to right.

    Earlier dicts take precedence, as decided by ``_merge_attrs_pairwise``.
    An empty list yields ``{}``; a single dict yields a shallow copy of it.
    """
    if not attrs_list:
        return {}

    first, *rest = attrs_list
    merged = first.copy()
    for attrs in rest:
        merged = _merge_attrs_pairwise(merged, attrs)
    return merged
| 98 | + |
| 99 | + |
| 100 | +def _find(parent: dict, x: str) -> str: |
| 101 | + while parent[x] != x: |
| 102 | + parent[x] = parent[parent[x]] |
| 103 | + x = parent[x] |
| 104 | + return x |
| 105 | + |
| 106 | + |
| 107 | + |
| 108 | + |
def soft_merge(records: list[dict]) -> list[dict]:
    """Group records whose attributes soft-match and emit one row per group.

    Every pair of records is compared with ``_soft_match``; matching pairs
    are joined in a union-find structure keyed by record ``id``.

    NOTE(review): grouping is transitive. Two records that do not match each
    other directly still land in the same group when both match a third
    record (e.g. a generic model of the same manufacturer). Confirm this is
    intended.

    Args:
        records: Dicts read via ``'id'`` and ``'attributes'``. Missing or
            ``None`` attributes are treated as an empty dict.

    Returns:
        One dict per group, in order of first appearance, with keys:
        ``'id'`` (a fresh UUID4 string), ``'auth_devices_ids'`` (JSON list
        of the group's sorted, de-duplicated record ids) and
        ``'initial_soft_merge'`` (1 if the group has more than one id,
        else 0).
    """
    ids = [record.get('id') for record in records]
    # `or {}` also covers an explicit "attributes": None, which a plain
    # .get('attributes', {}) would pass through and crash _soft_match.
    attrs = [record.get('attributes') or {} for record in records]

    parent = {record_id: record_id for record_id in ids}

    count = len(records)
    for i in range(count):
        for j in range(i + 1, count):
            if _soft_match(attrs[i], attrs[j]):
                parent[_find(parent, ids[i])] = _find(parent, ids[j])

    # A set per root de-duplicates ids that occur in several records. The
    # root itself is always a member, so it needs no separate handling.
    groups = defaultdict(set)
    for record_id in ids:
        groups[_find(parent, record_id)].add(record_id)

    rows = []
    for members in groups.values():
        id_list = sorted(members)
        rows.append({
            'id': str(uuid.uuid4()),
            'auth_devices_ids': json.dumps(id_list),
            'initial_soft_merge': 1 if len(id_list) > 1 else 0,
        })

    return rows
0 commit comments