@@ -18,6 +18,17 @@ def normalize_name(value: str) -> str:
1818 return re .sub (r"\s+" , " " , value ).strip ()
1919
2020
def strip_numeric_tokens(value: str) -> str:
    """Return *value* with every run of digits removed.

    Each digit run is first replaced by a single space, then all
    whitespace runs are collapsed to one space and the ends are trimmed.
    Digits embedded inside a token therefore split it: "a1b" -> "a b".
    """
    without_digits = re.sub(r"\d+", " ", value)
    collapsed = re.sub(r"\s+", " ", without_digits)
    return collapsed.strip()
25+
26+
def numeric_tokens(value: str):
    """Return the digit runs found in *value* as a tuple of strings.

    Runs are reported left to right in order of appearance; the tuple is
    empty when *value* contains no digits.
    """
    return tuple(match.group(0) for match in re.finditer(r"\d+", value))
30+
31+
2132def load_threat_actor_names (cluster_path : Path ):
2233 """Return a map: normalized name -> map of canonical threat actor name -> UUID."""
2334 data = json .loads (cluster_path .read_text (encoding = "utf-8" ))
@@ -46,7 +57,11 @@ def load_threat_actor_names(cluster_path: Path):
4657
4758
4859def find_similar_name_pairs (names_to_actors , min_similarity = 0.88 , max_results = 200 ):
49- """Compute similar name pairs using difflib.SequenceMatcher."""
60+ """Compute similar name pairs using difflib.SequenceMatcher.
61+
62+ Numeric-heavy variants (same non-numeric stem, different numeric tokens) are
63+ intentionally skipped to reduce noisy matches like "apt 28" vs "apt 29".
64+ """
5065 names = sorted (names_to_actors )
5166 results = []
5267
@@ -63,6 +78,23 @@ def find_similar_name_pairs(names_to_actors, min_similarity=0.88, max_results=20
6378 if left_actors == right_actors :
6479 continue
6580
81+ left_non_numeric = strip_numeric_tokens (left )
82+ right_non_numeric = strip_numeric_tokens (right )
83+ left_numbers = numeric_tokens (left )
84+ right_numbers = numeric_tokens (right )
85+
86+ # Threat-actor names often reuse a textual stem with a different numeric id.
87+ # Treat these as distinct identifiers to avoid over-reporting false positives.
88+ if (
89+ left_numbers
90+ and right_numbers
91+ and left_non_numeric
92+ and right_non_numeric
93+ and left_non_numeric == right_non_numeric
94+ and left_numbers != right_numbers
95+ ):
96+ continue
97+
6698 longer = max (len (left ), len (right ))
6799 shorter = min (len (left ), len (right ))
68100 if shorter == 0 :
0 commit comments