Skip to content

Commit 26c7eb2

Browse files
authored
Merge pull request #1173 from MISP/codex/update-threat_actor_similarity_report.py-with-algorithms
Add multiple name-similarity algorithms to threat actor report tool
2 parents 30e4b8c + b3f1017 commit 26c7eb2

1 file changed

Lines changed: 204 additions & 47 deletions

File tree

tools/threat_actor_similarity_report.py

Lines changed: 204 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -5,12 +5,18 @@
55
import argparse
66
import itertools
77
import json
8+
import math
89
import re
10+
import zlib
11+
from collections import Counter
912
from datetime import datetime, timezone
1013
from difflib import SequenceMatcher
1114
from pathlib import Path
1215

1316

17+
ALGORITHMS = ("sequence", "levenshtein", "compression", "vector")
18+
19+
1420
def normalize_name(value: str) -> str:
1521
"""Normalize actor/alias names for comparison."""
1622
value = value.lower().strip()
@@ -56,62 +62,160 @@ def load_threat_actor_names(cluster_path: Path):
5662
return names_to_actors
5763

5864

59-
def find_similar_name_pairs(names_to_actors, min_similarity=0.88, max_results=200):
60-
"""Compute similar name pairs using difflib.SequenceMatcher.
61-
62-
Numeric-heavy variants (same non-numeric stem, different numeric tokens) are
63-
intentionally skipped to reduce noisy matches like "apt 28" vs "apt 29".
64-
"""
65-
names = sorted(names_to_actors)
66-
results = []
65+
def should_compare_pair(left: str, right: str, min_similarity: float) -> bool:
    """Fast filters to skip noisy/clearly impossible candidates.

    Parameters
    ----------
    left, right:
        Normalized names to compare.
    min_similarity:
        Threshold the downstream similarity score must reach; used to derive
        a cheap length-based impossibility bound.

    Returns ``False`` for pairs that should not be scored at all.
    """
    if left == right:
        return False

    left_non_numeric = strip_numeric_tokens(left)
    right_non_numeric = strip_numeric_tokens(right)
    left_numbers = numeric_tokens(left)
    right_numbers = numeric_tokens(right)

    # Threat-actor names often reuse a textual stem with a different numeric id.
    # Treat these as distinct identifiers to avoid over-reporting false positives.
    if (
        left_numbers
        and right_numbers
        and left_non_numeric
        and right_non_numeric
        and left_non_numeric == right_non_numeric
        and left_numbers != right_numbers
    ):
        return False

    longer = max(len(left), len(right))
    shorter = min(len(left), len(right))
    if shorter == 0:
        return False

    # A cheap pre-filter: similarity cannot pass the threshold if string
    # lengths differ too much.  SequenceMatcher.ratio() is bounded above by
    # 2 * shorter / (shorter + longer), which rearranges to
    # (longer - shorter) / shorter <= 2 * (1 - s) / s.  The previous bound of
    # (1 - s) / s lacked the factor of 2 and could wrongly discard borderline
    # matches (e.g. lengths 8 vs 10 at threshold 0.88, where the maximum
    # achievable ratio is 16/18 ~= 0.889).  The Levenshtein bound is tighter
    # ((1 - s) / s), so this looser filter stays conservative for every
    # configured algorithm.
    max_len_ratio_delta = 2.0 * (1.0 - min_similarity) / max(min_similarity, 1e-9)
    if (longer - shorter) / shorter > max_len_ratio_delta:
        return False

    return True
97+
98+
99+
def sequence_similarity(left: str, right: str) -> float:
    """Return the difflib.SequenceMatcher ratio for the two strings."""
    return SequenceMatcher(None, left, right).ratio()
103+
104+
105+
def levenshtein_similarity(left: str, right: str) -> float:
    """Normalized Levenshtein similarity: ``1 - distance / max_len``.

    Uses the classic two-row dynamic program, so memory stays linear in the
    shorter string.
    """
    if left == right:
        return 1.0
    if not left or not right:
        return 0.0

    # Iterate over the longer string so each row is as small as possible.
    if len(left) < len(right):
        left, right = right, left

    prev = list(range(len(right) + 1))
    for row_idx in range(1, len(left) + 1):
        ch_outer = left[row_idx - 1]
        cur = [row_idx]
        for col_idx in range(1, len(right) + 1):
            cost = 0 if ch_outer == right[col_idx - 1] else 1
            cur.append(
                min(
                    prev[col_idx] + 1,          # insertion
                    cur[col_idx - 1] + 1,       # deletion
                    prev[col_idx - 1] + cost,   # substitution / match
                )
            )
        prev = cur

    return 1.0 - (prev[-1] / max(len(left), len(right)))
127+
128+
129+
def compression_similarity(left: str, right: str) -> float:
    """Compression-based similarity derived from Normalized Compression Distance."""
    sizes = [
        len(zlib.compress(text.encode("utf-8")))
        for text in (left, right, f"{left}|{right}")
    ]
    c_left, c_right, c_joined = sizes
    ncd = (c_joined - min(c_left, c_right)) / max(c_left, c_right)
    # zlib header overhead on short strings can push NCD outside [0, 1];
    # clamp so the similarity stays a valid score.
    return min(1.0, max(0.0, 1.0 - ncd))
140+
141+
142+
def char_ngram_counter(text: str, n: int = 3) -> Counter:
    """Build a character n-gram bag (Counter) for vector comparison.

    The text is padded with one space on each side so leading/trailing
    characters contribute boundary n-grams.
    """
    padded = " " + text + " "
    # Too short to yield even one n-gram: fall back to the padded text itself.
    if len(padded) < n:
        return Counter({padded: 1})
    grams = (padded[start : start + n] for start in range(len(padded) - n + 1))
    return Counter(grams)
148+
149+
150+
def vector_similarity(left: str, right: str) -> float:
    """Cosine similarity over character n-gram count vectors."""
    bag_left = char_ngram_counter(left)
    bag_right = char_ngram_counter(right)

    # Dot product over shared n-grams only; disjoint bags score zero.
    dot = sum(
        count * bag_right[gram]
        for gram, count in bag_left.items()
        if gram in bag_right
    )
    if dot == 0:
        return 0.0

    magnitude_left = math.sqrt(sum(c * c for c in bag_left.values()))
    magnitude_right = math.sqrt(sum(c * c for c in bag_right.values()))
    if not magnitude_left or not magnitude_right:
        return 0.0

    return dot / (magnitude_left * magnitude_right)
166+
167+
168+
def get_similarity(algorithm: str, left: str, right: str) -> float:
    """Dispatch to the scorer registered for *algorithm*.

    Raises ``ValueError`` for names outside the supported set.
    """
    scorers = {
        "sequence": sequence_similarity,
        "levenshtein": levenshtein_similarity,
        "compression": compression_similarity,
        "vector": vector_similarity,
    }
    try:
        scorer = scorers[algorithm]
    except KeyError:
        raise ValueError(f"Unknown algorithm: {algorithm}") from None
    return scorer(left, right)
179+
180+
181+
def find_similar_name_pairs(
182+
names_to_actors,
183+
algorithms,
184+
min_similarity=0.88,
185+
max_results=200,
186+
combine_mode="union",
187+
):
188+
"""Compute similar name pairs using one or more similarity algorithms."""
189+
names = sorted(names_to_actors)
190+
results = []
70191

71192
for left, right in itertools.combinations(names, 2):
72-
# Skip identical forms and aliases of exactly the same actor only.
73-
if left == right:
74-
continue
75-
76193
left_actors = names_to_actors[left]
77194
right_actors = names_to_actors[right]
78195
if left_actors == right_actors:
79196
continue
80-
81-
left_non_numeric = strip_numeric_tokens(left)
82-
right_non_numeric = strip_numeric_tokens(right)
83-
left_numbers = numeric_tokens(left)
84-
right_numbers = numeric_tokens(right)
85-
86-
# Threat-actor names often reuse a textual stem with a different numeric id.
87-
# Treat these as distinct identifiers to avoid over-reporting false positives.
88-
if (
89-
left_numbers
90-
and right_numbers
91-
and left_non_numeric
92-
and right_non_numeric
93-
and left_non_numeric == right_non_numeric
94-
and left_numbers != right_numbers
95-
):
197+
if not should_compare_pair(left, right, min_similarity):
96198
continue
97199

98-
longer = max(len(left), len(right))
99-
shorter = min(len(left), len(right))
100-
if shorter == 0:
101-
continue
102-
if (longer - shorter) / shorter > max_len_ratio_delta:
103-
continue
200+
algorithm_scores = {}
201+
for algorithm in algorithms:
202+
score = get_similarity(algorithm, left, right)
203+
if score >= min_similarity:
204+
algorithm_scores[algorithm] = round(score, 4)
104205

105-
matcher = SequenceMatcher(None, left, right)
106-
if matcher.quick_ratio() < min_similarity:
107-
continue
108-
score = matcher.ratio()
109-
if score < min_similarity:
206+
if combine_mode == "intersection":
207+
include = len(algorithm_scores) == len(algorithms)
208+
else:
209+
include = bool(algorithm_scores)
210+
211+
if not include:
110212
continue
111213

214+
aggregate_score = sum(algorithm_scores.values()) / len(algorithm_scores)
112215
results.append(
113216
{
114-
"score": round(score, 4),
217+
"score": round(aggregate_score, 4),
218+
"algorithm_scores": algorithm_scores,
115219
"name_1": left,
116220
"actors_1": sorted(left_actors.items()),
117221
"name_2": right,
@@ -123,9 +227,17 @@ def find_similar_name_pairs(names_to_actors, min_similarity=0.88, max_results=20
123227
return results[:max_results]
124228

125229

126-
def build_markdown_report(results, source_path: Path, min_similarity: float, max_results: int):
230+
def build_markdown_report(
231+
results,
232+
source_path: Path,
233+
min_similarity: float,
234+
max_results: int,
235+
algorithms,
236+
combine_mode,
237+
):
127238
"""Build a markdown report containing potential similar threat-actor names."""
128239
generated_at = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S UTC")
240+
129241
def format_actors(actors):
130242
return ", ".join(f"`{name}` ({uuid or 'N/A'})" for name, uuid in actors)
131243

@@ -134,27 +246,30 @@ def format_actors(actors):
134246
"",
135247
f"- Generated: {generated_at}",
136248
f"- Source cluster: `{source_path}`",
137-
f"- Similarity method: `difflib.SequenceMatcher.ratio()`",
249+
f"- Similarity algorithms: `{', '.join(algorithms)}`",
250+
f"- Combine mode: `{combine_mode}`",
138251
f"- Threshold: `{min_similarity}`",
139252
f"- Max results: `{max_results}`",
140253
f"- Matches returned: `{len(results)}`",
141254
"",
142255
]
143256

144257
if not results:
145-
lines.append("No potential similar names found with current threshold.")
258+
lines.append("No potential similar names found with current threshold/settings.")
146259
return "\n".join(lines) + "\n"
147260

148261
lines.extend(
149262
[
150-
"| score | name_1 | actor(s)_1 | name_2 | actor(s)_2 |",
151-
"|---:|---|---|---|---|",
263+
"| score | algorithm_scores | name_1 | actor(s)_1 | name_2 | actor(s)_2 |",
264+
"|---:|---|---|---|---|---|",
152265
]
153266
)
154267
for item in results:
268+
score_parts = ", ".join(f"{k}:{v:.4f}" for k, v in sorted(item["algorithm_scores"].items()))
155269
lines.append(
156-
"| {score:.4f} | `{name_1}` | {actors_1} | `{name_2}` | {actors_2} |".format(
270+
"| {score:.4f} | `{scores}` | `{name_1}` | {actors_1} | `{name_2}` | {actors_2} |".format(
157271
score=item["score"],
272+
scores=score_parts,
158273
name_1=item["name_1"],
159274
actors_1=format_actors(item["actors_1"]),
160275
name_2=item["name_2"],
@@ -165,11 +280,28 @@ def format_actors(actors):
165280
return "\n".join(lines) + "\n"
166281

167282

283+
def parse_algorithms(value: str):
    """Parse comma-separated algorithms; special keyword: all."""
    selected = []
    for chunk in value.split(","):
        token = chunk.strip().lower()
        if token:
            selected.append(token)

    if not selected:
        raise ValueError("--algorithms cannot be empty")
    if "all" in selected:
        return list(ALGORITHMS)

    invalid = [token for token in selected if token not in ALGORITHMS]
    if invalid:
        allowed = ", ".join(ALGORITHMS) + ", all"
        raise ValueError(f"Unknown algorithm(s): {', '.join(invalid)}. Allowed: {allowed}")

    # Preserve user order while removing duplicates.
    seen = set()
    ordered = []
    for token in selected:
        if token not in seen:
            seen.add(token)
            ordered.append(token)
    return ordered
298+
299+
168300
def main():
169301
parser = argparse.ArgumentParser(
170302
description=(
171303
"Find potential similar threat-actor names and aliases in a MISP galaxy "
172-
"cluster using SequenceMatcher."
304+
"cluster using configurable similarity algorithms."
173305
)
174306
)
175307
parser.add_argument(
@@ -189,6 +321,23 @@ def main():
189321
default=200,
190322
help="Maximum number of similar pairs to report (default: 200)",
191323
)
324+
parser.add_argument(
325+
"--algorithms",
326+
default="sequence",
327+
help=(
328+
"Comma-separated algorithms to use: sequence, levenshtein, compression, "
329+
"vector, or all (default: sequence)"
330+
),
331+
)
332+
parser.add_argument(
333+
"--combine-mode",
334+
choices=("union", "intersection"),
335+
default="union",
336+
help=(
337+
"How to combine multi-algorithm results: union (any algorithm passes) or "
338+
"intersection (all selected algorithms must pass). Default: union"
339+
),
340+
)
192341
parser.add_argument(
193342
"--markdown-output",
194343
default="threat_actor_similarity_report.md",
@@ -204,19 +353,25 @@ def main():
204353
if not 0.0 <= args.threshold <= 1.0:
205354
raise ValueError("--threshold must be in [0.0, 1.0]")
206355

356+
algorithms = parse_algorithms(args.algorithms)
357+
207358
cluster_path = Path(args.cluster)
208359
names_to_actors = load_threat_actor_names(cluster_path)
209360
results = find_similar_name_pairs(
210361
names_to_actors,
362+
algorithms=algorithms,
211363
min_similarity=args.threshold,
212364
max_results=args.max_results,
365+
combine_mode=args.combine_mode,
213366
)
214367

215368
markdown = build_markdown_report(
216369
results,
217370
source_path=cluster_path,
218371
min_similarity=args.threshold,
219372
max_results=args.max_results,
373+
algorithms=algorithms,
374+
combine_mode=args.combine_mode,
220375
)
221376
output_path = Path(args.markdown_output)
222377
output_path.write_text(markdown, encoding="utf-8")
@@ -227,6 +382,8 @@ def main():
227382

228383
print(f"Analyzed normalized names: {len(names_to_actors)}")
229384
print(f"Potential similar name pairs: {len(results)}")
385+
print(f"Algorithms: {', '.join(algorithms)}")
386+
print(f"Combine mode: {args.combine_mode}")
230387
print(f"Markdown report written to: {output_path}")
231388
if args.json_output:
232389
print(f"JSON report written to: {args.json_output}")

0 commit comments

Comments
 (0)