55import argparse
66import itertools
77import json
8+ import math
89import re
10+ import zlib
11+ from collections import Counter
912from datetime import datetime , timezone
1013from difflib import SequenceMatcher
1114from pathlib import Path
1215
1316
# Canonical identifiers for the supported similarity scorers; get_similarity()
# dispatches on exactly these names, and "all" on the CLI expands to this tuple.
ALGORITHMS = ("sequence", "levenshtein", "compression", "vector")
18+
19+
1420def normalize_name (value : str ) -> str :
1521 """Normalize actor/alias names for comparison."""
1622 value = value .lower ().strip ()
@@ -56,62 +62,160 @@ def load_threat_actor_names(cluster_path: Path):
5662 return names_to_actors
5763
5864
59- def find_similar_name_pairs (names_to_actors , min_similarity = 0.88 , max_results = 200 ):
60- """Compute similar name pairs using difflib.SequenceMatcher.
61-
62- Numeric-heavy variants (same non-numeric stem, different numeric tokens) are
63- intentionally skipped to reduce noisy matches like "apt 28" vs "apt 29".
64- """
65- names = sorted (names_to_actors )
66- results = []
def should_compare_pair(left: str, right: str, min_similarity: float) -> bool:
    """Fast filters to skip noisy/clearly impossible candidate pairs.

    Returns False for identical strings, for names that share a textual stem
    but differ only in numeric tokens (e.g. "apt 28" vs "apt 29"), and for
    pairs whose lengths differ so much that the scorers cannot plausibly
    reach ``min_similarity``.
    """
    if left == right:
        return False

    left_non_numeric = strip_numeric_tokens(left)
    right_non_numeric = strip_numeric_tokens(right)
    left_numbers = numeric_tokens(left)
    right_numbers = numeric_tokens(right)

    # Threat-actor names often reuse a textual stem with a different numeric id.
    # Treat these as distinct identifiers to avoid over-reporting false positives.
    if (
        left_numbers
        and right_numbers
        and left_non_numeric
        and right_non_numeric
        and left_non_numeric == right_non_numeric
        and left_numbers != right_numbers
    ):
        return False

    # Cheap length pre-filter. SequenceMatcher.ratio() is bounded above by
    # 2 * shorter / (shorter + longer), so a pair can still reach threshold s
    # while (longer - shorter) / shorter <= 2 * (1 - s) / s. The old bound of
    # (1 - s) / s was tight only for normalized Levenshtein and wrongly
    # discarded pairs SequenceMatcher could still score above threshold.
    # (For the compression/vector scorers this remains a heuristic.)
    max_len_ratio_delta = 2.0 * (1.0 - min_similarity) / max(min_similarity, 1e-9)
    longer = max(len(left), len(right))
    shorter = min(len(left), len(right))
    if shorter == 0:
        return False
    if (longer - shorter) / shorter > max_len_ratio_delta:
        return False

    return True
97+
98+
def sequence_similarity(left: str, right: str) -> float:
    """Return the difflib.SequenceMatcher ratio for the two strings."""
    return SequenceMatcher(None, left, right).ratio()
103+
104+
def levenshtein_similarity(left: str, right: str) -> float:
    """Normalized Levenshtein similarity: 1 - distance / max_len."""
    if left == right:
        return 1.0
    if not left or not right:
        return 0.0

    # Iterate over the longer string and keep DP rows sized by the shorter
    # one, bounding memory at O(min(len(left), len(right))).
    if len(left) < len(right):
        left, right = right, left

    prev_row = list(range(len(right) + 1))
    for row_idx, ch_a in enumerate(left, start=1):
        curr_row = [row_idx]
        for col_idx, ch_b in enumerate(right, start=1):
            substitution = prev_row[col_idx - 1] + (ch_a != ch_b)
            insertion = prev_row[col_idx] + 1
            deletion = curr_row[-1] + 1
            curr_row.append(min(substitution, insertion, deletion))
        prev_row = curr_row

    # After the swap, len(left) is the maximum of the two lengths.
    return 1.0 - prev_row[-1] / len(left)
127+
128+
def compression_similarity(left: str, right: str) -> float:
    """Compression-based similarity derived from Normalized Compression Distance."""
    size_left = len(zlib.compress(left.encode("utf-8")))
    size_right = len(zlib.compress(right.encode("utf-8")))
    size_joined = len(zlib.compress(f"{left}|{right}".encode("utf-8")))

    ncd = (size_joined - min(size_left, size_right)) / max(size_left, size_right)
    # Flip the distance into a similarity and clamp into [0, 1].
    return min(1.0, max(0.0, 1.0 - ncd))
140+
141+
def char_ngram_counter(text: str, n: int = 3) -> Counter:
    """Bag (multiset) of character n-grams used for vector comparison."""
    # Pad with one space on each side so edge characters contribute n-grams.
    padded = f" {text} "
    window_count = len(padded) - n + 1
    if window_count < 1:
        # Too short to slide a window: treat the whole padded string as one gram.
        return Counter({padded: 1})
    return Counter(padded[offset:offset + n] for offset in range(window_count))
148+
149+
def vector_similarity(left: str, right: str) -> float:
    """Cosine similarity over character n-gram count vectors."""
    bag_left = char_ngram_counter(left)
    bag_right = char_ngram_counter(right)

    shared = set(bag_left) & set(bag_right)
    dot_product = sum(bag_left[gram] * bag_right[gram] for gram in shared)
    if dot_product == 0:
        return 0.0

    norm_left = math.sqrt(sum(count * count for count in bag_left.values()))
    norm_right = math.sqrt(sum(count * count for count in bag_right.values()))
    if not norm_left or not norm_right:
        return 0.0

    return dot_product / (norm_left * norm_right)
166+
167+
def get_similarity(algorithm: str, left: str, right: str) -> float:
    """Dispatch to the scorer implementing *algorithm*; raise ValueError if unknown."""
    # Lambdas keep the scorer lookup lazy: nothing is evaluated until called.
    scorers = {
        "sequence": lambda: sequence_similarity(left, right),
        "levenshtein": lambda: levenshtein_similarity(left, right),
        "compression": lambda: compression_similarity(left, right),
        "vector": lambda: vector_similarity(left, right),
    }
    if algorithm not in scorers:
        raise ValueError(f"Unknown algorithm: {algorithm}")
    return scorers[algorithm]()
179+
180+
181+ def find_similar_name_pairs (
182+ names_to_actors ,
183+ algorithms ,
184+ min_similarity = 0.88 ,
185+ max_results = 200 ,
186+ combine_mode = "union" ,
187+ ):
188+ """Compute similar name pairs using one or more similarity algorithms."""
189+ names = sorted (names_to_actors )
190+ results = []
70191
71192 for left , right in itertools .combinations (names , 2 ):
72- # Skip identical forms and aliases of exactly the same actor only.
73- if left == right :
74- continue
75-
76193 left_actors = names_to_actors [left ]
77194 right_actors = names_to_actors [right ]
78195 if left_actors == right_actors :
79196 continue
80-
81- left_non_numeric = strip_numeric_tokens (left )
82- right_non_numeric = strip_numeric_tokens (right )
83- left_numbers = numeric_tokens (left )
84- right_numbers = numeric_tokens (right )
85-
86- # Threat-actor names often reuse a textual stem with a different numeric id.
87- # Treat these as distinct identifiers to avoid over-reporting false positives.
88- if (
89- left_numbers
90- and right_numbers
91- and left_non_numeric
92- and right_non_numeric
93- and left_non_numeric == right_non_numeric
94- and left_numbers != right_numbers
95- ):
197+ if not should_compare_pair (left , right , min_similarity ):
96198 continue
97199
98- longer = max (len (left ), len (right ))
99- shorter = min (len (left ), len (right ))
100- if shorter == 0 :
101- continue
102- if (longer - shorter ) / shorter > max_len_ratio_delta :
103- continue
200+ algorithm_scores = {}
201+ for algorithm in algorithms :
202+ score = get_similarity (algorithm , left , right )
203+ if score >= min_similarity :
204+ algorithm_scores [algorithm ] = round (score , 4 )
104205
105- matcher = SequenceMatcher (None , left , right )
106- if matcher .quick_ratio () < min_similarity :
107- continue
108- score = matcher .ratio ()
109- if score < min_similarity :
206+ if combine_mode == "intersection" :
207+ include = len (algorithm_scores ) == len (algorithms )
208+ else :
209+ include = bool (algorithm_scores )
210+
211+ if not include :
110212 continue
111213
214+ aggregate_score = sum (algorithm_scores .values ()) / len (algorithm_scores )
112215 results .append (
113216 {
114- "score" : round (score , 4 ),
217+ "score" : round (aggregate_score , 4 ),
218+ "algorithm_scores" : algorithm_scores ,
115219 "name_1" : left ,
116220 "actors_1" : sorted (left_actors .items ()),
117221 "name_2" : right ,
@@ -123,9 +227,17 @@ def find_similar_name_pairs(names_to_actors, min_similarity=0.88, max_results=20
123227 return results [:max_results ]
124228
125229
126- def build_markdown_report (results , source_path : Path , min_similarity : float , max_results : int ):
230+ def build_markdown_report (
231+ results ,
232+ source_path : Path ,
233+ min_similarity : float ,
234+ max_results : int ,
235+ algorithms ,
236+ combine_mode ,
237+ ):
127238 """Build a markdown report containing potential similar threat-actor names."""
128239 generated_at = datetime .now (timezone .utc ).strftime ("%Y-%m-%d %H:%M:%S UTC" )
240+
129241 def format_actors (actors ):
130242 return ", " .join (f"`{ name } ` ({ uuid or 'N/A' } )" for name , uuid in actors )
131243
@@ -134,27 +246,30 @@ def format_actors(actors):
134246 "" ,
135247 f"- Generated: { generated_at } " ,
136248 f"- Source cluster: `{ source_path } `" ,
137- f"- Similarity method: `difflib.SequenceMatcher.ratio()`" ,
249+ f"- Similarity algorithms: `{ ', ' .join (algorithms )} `" ,
250+ f"- Combine mode: `{ combine_mode } `" ,
138251 f"- Threshold: `{ min_similarity } `" ,
139252 f"- Max results: `{ max_results } `" ,
140253 f"- Matches returned: `{ len (results )} `" ,
141254 "" ,
142255 ]
143256
144257 if not results :
145- lines .append ("No potential similar names found with current threshold." )
258+ lines .append ("No potential similar names found with current threshold/settings ." )
146259 return "\n " .join (lines ) + "\n "
147260
148261 lines .extend (
149262 [
150- "| score | name_1 | actor(s)_1 | name_2 | actor(s)_2 |" ,
151- "|---:|---|---|---|---|" ,
263+ "| score | algorithm_scores | name_1 | actor(s)_1 | name_2 | actor(s)_2 |" ,
264+ "|---:|---|---|---|---|---| " ,
152265 ]
153266 )
154267 for item in results :
268+ score_parts = ", " .join (f"{ k } :{ v :.4f} " for k , v in sorted (item ["algorithm_scores" ].items ()))
155269 lines .append (
156- "| {score:.4f} | `{name_1}` | {actors_1} | `{name_2}` | {actors_2} |" .format (
270+ "| {score:.4f} | `{scores}` | `{ name_1}` | {actors_1} | `{name_2}` | {actors_2} |" .format (
157271 score = item ["score" ],
272+ scores = score_parts ,
158273 name_1 = item ["name_1" ],
159274 actors_1 = format_actors (item ["actors_1" ]),
160275 name_2 = item ["name_2" ],
@@ -165,11 +280,28 @@ def format_actors(actors):
165280 return "\n " .join (lines ) + "\n "
166281
167282
def parse_algorithms(value: str):
    """Parse comma-separated algorithm names; the keyword "all" selects every one."""
    requested = [token.strip().lower() for token in value.split(",") if token.strip()]
    if not requested:
        raise ValueError("--algorithms cannot be empty")
    if "all" in requested:
        return list(ALGORITHMS)

    unknown = [token for token in requested if token not in ALGORITHMS]
    if unknown:
        allowed = ", ".join(ALGORITHMS) + ", all"
        raise ValueError(f"Unknown algorithm(s): {', '.join(unknown)}. Allowed: {allowed}")

    # Preserve user order while removing duplicates.
    seen = set()
    ordered = []
    for token in requested:
        if token not in seen:
            seen.add(token)
            ordered.append(token)
    return ordered
298+
299+
168300def main ():
169301 parser = argparse .ArgumentParser (
170302 description = (
171303 "Find potential similar threat-actor names and aliases in a MISP galaxy "
172- "cluster using SequenceMatcher ."
304+ "cluster using configurable similarity algorithms ."
173305 )
174306 )
175307 parser .add_argument (
@@ -189,6 +321,23 @@ def main():
189321 default = 200 ,
190322 help = "Maximum number of similar pairs to report (default: 200)" ,
191323 )
324+ parser .add_argument (
325+ "--algorithms" ,
326+ default = "sequence" ,
327+ help = (
328+ "Comma-separated algorithms to use: sequence, levenshtein, compression, "
329+ "vector, or all (default: sequence)"
330+ ),
331+ )
332+ parser .add_argument (
333+ "--combine-mode" ,
334+ choices = ("union" , "intersection" ),
335+ default = "union" ,
336+ help = (
337+ "How to combine multi-algorithm results: union (any algorithm passes) or "
338+ "intersection (all selected algorithms must pass). Default: union"
339+ ),
340+ )
192341 parser .add_argument (
193342 "--markdown-output" ,
194343 default = "threat_actor_similarity_report.md" ,
@@ -204,19 +353,25 @@ def main():
204353 if not 0.0 <= args .threshold <= 1.0 :
205354 raise ValueError ("--threshold must be in [0.0, 1.0]" )
206355
356+ algorithms = parse_algorithms (args .algorithms )
357+
207358 cluster_path = Path (args .cluster )
208359 names_to_actors = load_threat_actor_names (cluster_path )
209360 results = find_similar_name_pairs (
210361 names_to_actors ,
362+ algorithms = algorithms ,
211363 min_similarity = args .threshold ,
212364 max_results = args .max_results ,
365+ combine_mode = args .combine_mode ,
213366 )
214367
215368 markdown = build_markdown_report (
216369 results ,
217370 source_path = cluster_path ,
218371 min_similarity = args .threshold ,
219372 max_results = args .max_results ,
373+ algorithms = algorithms ,
374+ combine_mode = args .combine_mode ,
220375 )
221376 output_path = Path (args .markdown_output )
222377 output_path .write_text (markdown , encoding = "utf-8" )
@@ -227,6 +382,8 @@ def main():
227382
228383 print (f"Analyzed normalized names: { len (names_to_actors )} " )
229384 print (f"Potential similar name pairs: { len (results )} " )
385+ print (f"Algorithms: { ', ' .join (algorithms )} " )
386+ print (f"Combine mode: { args .combine_mode } " )
230387 print (f"Markdown report written to: { output_path } " )
231388 if args .json_output :
232389 print (f"JSON report written to: { args .json_output } " )
0 commit comments