Skip to content

Commit 30e4b8c

Browse files
authored
Merge pull request #1172 from MISP/codex/update-threat_actor_similarity_report.py-for-name-handling
Reduce numeric-identifier noise in threat actor similarity report
2 parents 56da703 + ade4272 commit 30e4b8c

1 file changed

Lines changed: 33 additions & 1 deletion

File tree

tools/threat_actor_similarity_report.py

Lines changed: 33 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -18,6 +18,17 @@ def normalize_name(value: str) -> str:
1818
return re.sub(r"\s+", " ", value).strip()
1919

2020

21+
def strip_numeric_tokens(value: str) -> str:
    """Return *value* with every run of digits removed.

    Each digit run is replaced by a single space so the text on either side
    of it stays separated; whitespace runs are then collapsed to one space
    and the ends are trimmed. For example, ``"apt28 group"`` becomes
    ``"apt group"`` and a purely numeric name becomes ``""``.
    """
    # Swap digits for a space rather than deleting them outright, so that
    # "foo123bar" yields two tokens ("foo bar") instead of gluing them.
    without_digits = re.sub(r"\d+", " ", value)
    collapsed = re.sub(r"\s+", " ", without_digits)
    return collapsed.strip()
25+
26+
27+
def numeric_tokens(value: str):
    """Return the digit runs found in *value*, in order, as a tuple of strings.

    Each maximal run of consecutive digits is one element; the digits are
    kept as text (``"028"`` stays ``"028"``). A value without digits yields
    an empty tuple.
    """
    digit_runs = re.finditer(r"\d+", value)
    return tuple(match.group() for match in digit_runs)
30+
31+
2132
def load_threat_actor_names(cluster_path: Path):
2233
"""Return a map: normalized name -> map of canonical threat actor name -> UUID."""
2334
data = json.loads(cluster_path.read_text(encoding="utf-8"))
@@ -46,7 +57,11 @@ def load_threat_actor_names(cluster_path: Path):
4657

4758

4859
def find_similar_name_pairs(names_to_actors, min_similarity=0.88, max_results=200):
49-
"""Compute similar name pairs using difflib.SequenceMatcher."""
60+
"""Compute similar name pairs using difflib.SequenceMatcher.
61+
62+
Numeric-heavy variants (same non-numeric stem, different numeric tokens) are
63+
intentionally skipped to reduce noisy matches like "apt 28" vs "apt 29".
64+
"""
5065
names = sorted(names_to_actors)
5166
results = []
5267

@@ -63,6 +78,23 @@ def find_similar_name_pairs(names_to_actors, min_similarity=0.88, max_results=20
6378
if left_actors == right_actors:
6479
continue
6580

81+
left_non_numeric = strip_numeric_tokens(left)
82+
right_non_numeric = strip_numeric_tokens(right)
83+
left_numbers = numeric_tokens(left)
84+
right_numbers = numeric_tokens(right)
85+
86+
# Threat-actor names often reuse a textual stem with a different numeric id.
87+
# Treat these as distinct identifiers to avoid over-reporting false positives.
88+
if (
89+
left_numbers
90+
and right_numbers
91+
and left_non_numeric
92+
and right_non_numeric
93+
and left_non_numeric == right_non_numeric
94+
and left_numbers != right_numbers
95+
):
96+
continue
97+
6698
longer = max(len(left), len(right))
6799
shorter = min(len(left), len(right))
68100
if shorter == 0:

0 commit comments

Comments
 (0)