"""Blocklist update module for downloading and processing IP blocklists."""
import ipaddress
import json
import logging
import os
import re
import string
import time
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path
import requests
# Setup logger
logging.basicConfig(
level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)
# Constants and configuration
CONFIG_FILE: Path = Path("./blocklists_config.json")
BLOCKLISTS_DIR: Path = Path("./blocklists")
BLOCKLISTS_SPLIT_DIR: Path = Path("./blocklists_split")
BLOCKLISTS_MERGED_FILE_PATH: Path = Path("./blocklists/all_blocklists_merged.txt")
MAX_LINES_PER_FILE: int = 130000
MAX_RETRIES: int = 10
RETRY_DELAY: int = 2 # Seconds between each attempt
REQUEST_TIMEOUT: int = 5

# Ensure necessary directories exist
BLOCKLISTS_DIR.mkdir(parents=True, exist_ok=True)
BLOCKLISTS_SPLIT_DIR.mkdir(parents=True, exist_ok=True)
logging.info("Necessary directories checked or created")


def clear_directory(directory: Path) -> None:
    """Remove all contents of the specified directory.

    Args:
        directory: Path object pointing to the directory to clear.
    """
    for file_path in directory.iterdir():
        try:
            if file_path.is_file():
                file_path.unlink()
            elif file_path.is_dir():
                # rmdir() only removes empty subdirectories; non-empty ones
                # raise OSError and are logged below.
                file_path.rmdir()
        except OSError as e:
            logging.error(f"Error removing {file_path}: {e}")


def remove_empty_files(directory: Path) -> None:
    """Remove all empty files from the specified directory.

    Args:
        directory: Path object pointing to the directory to clean.
    """
    for file_path in directory.iterdir():
        if file_path.is_file() and file_path.stat().st_size == 0:
            try:
                file_path.unlink()
                logging.debug(f"Removed empty file: {file_path}")
            except OSError as e:
                logging.error(f"Error removing empty file {file_path}: {e}")


GITHUB_TREE_URL_RE = re.compile(
    r"^https?://github\.com/(?P<owner>[^/]+)/(?P<repo>[^/]+)"
    r"(?:/tree/(?P<branch>[^/]+)(?:/(?P<path>.*))?)?/?$"
)


def parse_github_tree_url(github_url: str) -> dict[str, str]:
    """Parse a GitHub tree URL into owner/repo/branch/path components.

    Args:
        github_url: URL like https://github.com/{owner}/{repo}/tree/{branch}/{path}.

    Returns:
        Dict with keys owner, repo, branch, path. Branch defaults to "main",
        path to "".

    Raises:
        ValueError: If the URL doesn't match the expected pattern.
    """
    match = GITHUB_TREE_URL_RE.match(github_url)
    if not match:
        raise ValueError(f"Invalid GitHub URL: {github_url}")
    return {
        "owner": match.group("owner"),
        "repo": match.group("repo"),
        "branch": match.group("branch") or "main",
        "path": match.group("path") or "",
    }
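

# Illustrative example (hypothetical URL): a tree URL decomposes as
#
#     parse_github_tree_url("https://github.com/example/lists/tree/main/ips")
#     -> {"owner": "example", "repo": "lists", "branch": "main", "path": "ips"}
#
# A bare repository URL such as "https://github.com/example/lists" also
# matches, with branch falling back to "main" and path to "".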


def get_fragment_list(github_url: str) -> list[str]:
    """Retrieve the list of .txt fragment files via GitHub Contents API.

    Args:
        github_url: GitHub tree URL pointing to a directory.

    Returns:
        List of raw download URLs for .txt files in the directory.
    """
    try:
        parts = parse_github_tree_url(github_url)
    except ValueError as e:
        logging.error(e)
        return []
    api_url = (
        f"https://api.github.com/repos/{parts['owner']}/{parts['repo']}"
        f"/contents/{parts['path']}"
    )
    headers = {
        "Accept": "application/vnd.github+json",
        # "2022-11-28" is the documented REST API version; the previous
        # value here ("2026-03-10") is not a valid version string and
        # would be rejected by the API.
        "X-GitHub-Api-Version": "2022-11-28",
    }
    params = {"ref": parts["branch"]}
    for attempt in range(1, MAX_RETRIES + 1):
        try:
            response = requests.get(
                api_url, headers=headers, params=params, timeout=REQUEST_TIMEOUT
            )
            response.raise_for_status()
            items = response.json()
            if not isinstance(items, list):
                logging.error(f"Unexpected API response (not a directory): {api_url}")
                return []
            return [
                item["download_url"]
                for item in items
                if item.get("type") == "file"
                and item.get("name", "").endswith(".txt")
                and item.get("download_url")
            ]
        except (requests.RequestException, ValueError, KeyError) as e:
            logging.error(f"Attempt {attempt}: GitHub API error - {e}")
            if attempt < MAX_RETRIES:
                time.sleep(RETRY_DELAY * attempt)
    logging.error("Failed to retrieve the file list after several attempts")
    return []
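

# For reference, each entry returned by the Contents API is an object of
# roughly this shape (illustrative values, trimmed to the fields used above):
#
#     {
#         "name": "blocklist-aa.txt",
#         "type": "file",
#         "download_url": "https://raw.githubusercontent.com/..."
#     }
#
# Entries with type "dir", names not ending in ".txt", or a null
# download_url (e.g. submodules) are filtered out by the comprehension above.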


def download_file(url: str, filename: str) -> None:
    """Download a file from a given URL into BLOCKLISTS_DIR.

    Args:
        url: URL to download the file from.
        filename: Name to save the file as.
    """
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
        file_path = BLOCKLISTS_DIR / filename
        file_path.write_bytes(response.content)
        logging.info(f"Downloaded {filename}")
    except requests.RequestException as e:
        logging.error(f"Error downloading {filename} from {url}: {e}")


def download_all_files(file_urls: list[str]) -> None:
    """Download all files using multithreading.

    Args:
        file_urls: List of URLs to download.
    """
    max_workers = min(os.cpu_count() or 4, len(file_urls))
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        for url in sorted(file_urls):
            filename = url.split("/")[-1]
            executor.submit(download_file, url, filename)
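

# Sizing note (illustrative): with 8 CPUs and 3 URLs, min(8, 3) caps the pool
# at 3 workers so no thread sits idle. Leaving the `with` block waits for all
# submitted downloads to finish, so files are complete before merging starts;
# per-file failures are caught and logged inside download_file, not raised here.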


def merge_fragments(prefix: str, blocklist_name: str) -> Path:
    """Merge fragments of files with the same prefix into a single file.

    Args:
        prefix: Prefix to use for the merged file name.
        blocklist_name: Base name of the blocklist.

    Returns:
        Path to the merged file.
    """
    merged_file_path = BLOCKLISTS_DIR / f"{prefix}_{blocklist_name}.txt"
    with merged_file_path.open("w", encoding="utf-8", errors="ignore") as merged_file:
        for filename in sorted(BLOCKLISTS_DIR.glob(f"{blocklist_name}-a*.txt")):
            with filename.open("r", encoding="utf-8", errors="ignore") as fragment:
                merged_file.write(fragment.read())
    return merged_file_path
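

# Illustrative example (hypothetical names; real ones come from the JSON
# configuration): with fragments "badips-aa.txt" and "badips-ab.txt" in
# BLOCKLISTS_DIR, merge_fragments("ipsum", "badips") concatenates them in
# alphabetical order into "./blocklists/ipsum_badips.txt".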


def merge_and_clean_fragments(merge_prefix: str) -> None:
    """Extract, merge, and sort IPs in files with specified prefixes, then delete fragments.

    Args:
        merge_prefix: Prefix to use for merged files.
    """
    blocklists_names = sorted(
        {
            re.sub(r"-(a[a-z])\.txt$", "", f.name)
            for f in BLOCKLISTS_DIR.glob("*-a*.txt")
        }
    )
    for name in blocklists_names:
        merged_file_path = merge_fragments(merge_prefix, name)
        extract_and_sort_ipv4(merged_file_path)
        # Clean up fragment files
        for fragment in BLOCKLISTS_DIR.glob(f"{name}-a*.txt"):
            try:
                fragment.unlink()
            except OSError as e:
                logging.error(f"Error removing fragment {fragment}: {e}")


def is_valid_ip(ip: str) -> bool:
    """Return True if the given IP address is valid, otherwise False.

    Args:
        ip: IP address string to validate.

    Returns:
        True if IP is valid, False otherwise.
    """
    try:
        ipaddress.ip_address(ip)
        return True
    except ValueError:
        return False
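

# Validation matters because the IPv4 extraction regex below matches octets
# above 255. Illustrative values:
#
#     is_valid_ip("203.0.113.7")  -> True
#     is_valid_ip("999.1.1.1")    -> False (ipaddress.ip_address raises ValueError)
#
# ipaddress.ip_address also accepts IPv6 strings, but only IPv4-shaped
# candidates ever reach this helper via extract_and_sort_ipv4.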


def extract_and_sort_ipv4(file_path: Path) -> None:
    """Extract all IPv4 addresses from a file, validate, sort, and overwrite the file.

    Args:
        file_path: Path to the file to process.
    """
    try:
        with file_path.open("r", encoding="utf-8", errors="ignore") as f:
            content = f.read()
        ips = re.findall(r"\b(?:[0-9]{1,3}\.){3}[0-9]{1,3}\b", content)
        valid_unique_ips = {ip for ip in ips if is_valid_ip(ip)}
        sorted_ips = sorted(valid_unique_ips, key=ipaddress.ip_address)
        with file_path.open("w", encoding="utf-8", errors="ignore") as f:
            f.write("\n".join(sorted_ips))
        logging.debug(f"Processed {len(sorted_ips)} unique IPs in {file_path}")
    except OSError as e:
        logging.error(f"Error processing file {file_path}: {e}")


def load_config(file_path: Path) -> list[dict[str, str]]:
    """Load configuration from a JSON file.

    Args:
        file_path: Path to the JSON configuration file.

    Returns:
        List of configuration dictionaries.

    Raises:
        FileNotFoundError: If the configuration file doesn't exist.
        json.JSONDecodeError: If the JSON is invalid.
    """
    try:
        with file_path.open("r", encoding="utf-8") as f:
            return json.load(f)
    except FileNotFoundError:
        logging.error(f"Configuration file not found: {file_path}")
        raise
    except json.JSONDecodeError as e:
        logging.error(f"Invalid JSON in configuration file {file_path}: {e}")
        raise
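

# The configuration schema is inferred from handle_resource below; a minimal
# blocklists_config.json could look like this (illustrative URLs and names,
# not taken from a real config):
#
#     [
#         {
#             "github_url": "https://github.com/example/lists/tree/main/ips",
#             "merge_prefix": "merged"
#         },
#         {
#             "url": "https://example.com/blocklist.txt",
#             "filename": "example_blocklist.txt"
#         }
#     ]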


def handle_resource(resource: dict[str, str]) -> None:
    """Download and process a resource based on the configuration.

    Args:
        resource: Dictionary containing resource configuration.
    """
    if "github_url" in resource:
        file_urls = get_fragment_list(resource["github_url"])
        if file_urls:
            download_all_files(file_urls)
            remove_empty_files(BLOCKLISTS_DIR)
            merge_and_clean_fragments(resource["merge_prefix"])
        else:
            logging.warning(f"No files found for resource: {resource}")
    elif "url" in resource and "filename" in resource:
        download_file(resource["url"], resource["filename"])
        remove_empty_files(BLOCKLISTS_DIR)
        file_path = BLOCKLISTS_DIR / resource["filename"]
        if file_path.exists():
            extract_and_sort_ipv4(file_path)
        else:
            logging.warning(f"Downloaded file not found: {file_path}")
    else:
        logging.error(f"Invalid resource configuration: {resource}")


def process_all_resources(config_file: Path) -> None:
    """Load resources configuration and process each resource.

    Args:
        config_file: Path to the JSON configuration file.
    """
    try:
        resources = load_config(config_file)
        logging.info(f"Processing {len(resources)} resources")
        for i, resource in enumerate(resources, 1):
            logging.info(f"Processing resource {i}/{len(resources)}")
            handle_resource(resource)
    except (FileNotFoundError, json.JSONDecodeError) as e:
        logging.error(f"Failed to process resources: {e}")
        raise


def split_large_blocklists(
    input_directory: Path, output_directory: Path, max_lines: int
) -> None:
    """Split large blocklist files if they exceed a given number of lines.

    Output files are named according to the pattern `name-aa.txt`,
    `name-ab.txt`, etc.; the source files are left untouched.

    Args:
        input_directory: Directory containing source files.
        output_directory: Directory to write split files to.
        max_lines: Maximum number of lines per output file.
    """
    processed_files = 0
    for file_path in input_directory.iterdir():
        if not (file_path.is_file() and file_path.suffix == ".txt"):
            continue
        try:
            with file_path.open("r", encoding="utf-8", errors="ignore") as f:
                lines = f.readlines()
            line_count = len(lines)
            base_name = file_path.stem
            if line_count <= max_lines:
                # Small file: copy whole content under the -aa suffix
                new_file_name = f"{base_name}-aa.txt"
                output_file_path = output_directory / new_file_name
                with output_file_path.open("w", encoding="utf-8") as output_file:
                    output_file.writelines(lines)
            else:
                # Split into multiple files
                alphabet = string.ascii_lowercase
                part_index = 0
                start_line = 0
                while start_line < line_count:
                    end_line = min(start_line + max_lines, line_count)
                    # Generate suffix (aa, ab, ..., zz); supports up to 676 parts
                    part_suffix = (
                        f"{alphabet[part_index // 26]}{alphabet[part_index % 26]}"
                    )
                    part_file_name = f"{base_name}-{part_suffix}.txt"
                    part_file_path = output_directory / part_file_name
                    with part_file_path.open("w", encoding="utf-8") as part_file:
                        part_file.writelines(lines[start_line:end_line])
                    start_line += max_lines
                    part_index += 1
            processed_files += 1
        except OSError as e:
            logging.error(f"Error processing file {file_path}: {e}")
    logging.info(
        f"Processed {processed_files} files, split into chunks of {max_lines} lines"
    )
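

# Worked example (hypothetical file name): a 300,000-line
# "merged_badips.txt" with max_lines=130,000 produces three output files:
#
#     merged_badips-aa.txt  (lines 1-130,000)
#     merged_badips-ab.txt  (lines 130,001-260,000)
#     merged_badips-ac.txt  (lines 260,001-300,000)
#
# The -aa/-ab suffix scheme mirrors the fragment naming that
# merge_and_clean_fragments expects on the download side.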


def merge_all_blocklists(blocklist_directory: Path, output_file: Path) -> None:
    """Merge all blocklist files into a single file, ensuring all IPs are unique.

    Args:
        blocklist_directory: Directory containing blocklist files.
        output_file: Path for the merged output file.
    """
    ip_set = set()
    processed_files = 0
    # Collect all IPs from the blocklist files
    for file_path in blocklist_directory.iterdir():
        if not (file_path.is_file() and file_path.suffix == ".txt"):
            continue
        try:
            with file_path.open("r", encoding="utf-8", errors="ignore") as f:
                for line in f:
                    ip = line.strip()
                    if ip and is_valid_ip(ip):
                        ip_set.add(ip)
            processed_files += 1
        except OSError as e:
            logging.error(f"Error reading file {file_path}: {e}")
    # Write the unique IPs to the output file
    try:
        with output_file.open("w", encoding="utf-8") as f:
            for ip in sorted(ip_set, key=ipaddress.ip_address):
                f.write(f"{ip}\n")
        logging.info(
            f"Merged {processed_files} files with {len(ip_set)} unique IPs into {output_file}"
        )
    except OSError as e:
        logging.error(f"Error writing merged file {output_file}: {e}")
        raise


def main() -> None:
    """Main function to orchestrate the blocklist update process."""
    logging.info("Starting blocklist update process")
    try:
        clear_directory(BLOCKLISTS_DIR)
        process_all_resources(CONFIG_FILE)
        split_large_blocklists(BLOCKLISTS_DIR, BLOCKLISTS_SPLIT_DIR, MAX_LINES_PER_FILE)
        merge_all_blocklists(BLOCKLISTS_DIR, BLOCKLISTS_MERGED_FILE_PATH)
        logging.info("Blocklist update process completed successfully")
    except Exception as e:
        logging.error(f"Blocklist update process failed: {e}")
        raise


if __name__ == "__main__":
    main()