Skip to content

Commit 0217bb2

Browse files
committed
Add HTML parsing for Azure Artifacts version retrieval
This commit introduces a new SimpleIndexParser class that parses the PEP 503 simple index HTML returned by Azure Artifacts and extracts package versions from the links in the response. The _query_azure_artifacts_version function is updated to use this parser, so it now returns the latest published version instead of always falling back to git tags. Tests are added to verify the parsing: successful version extraction, 404 responses, and HTML responses that list no versions.
1 parent cb564ab commit 0217bb2

File tree

2 files changed

+151
-11
lines changed

2 files changed

+151
-11
lines changed

src/python_package_folder/version_calculator.py

Lines changed: 97 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
import logging
1414
import re
1515
import subprocess
16+
from html.parser import HTMLParser
1617
from pathlib import Path
1718

1819
import requests
@@ -110,15 +111,77 @@ def _query_pypi_version(package_name: str, registry: str) -> str | None:
110111
return None
111112

112113

114+
class SimpleIndexParser(HTMLParser):
115+
"""Parser for PEP 503 simple index HTML to extract package versions."""
116+
117+
def __init__(self, package_name: str):
118+
super().__init__()
119+
self.package_name = package_name
120+
self.versions: set[str] = set()
121+
self.in_anchor = False
122+
self.current_href = ""
123+
124+
def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
125+
if tag == "a":
126+
self.in_anchor = True
127+
# Extract href attribute
128+
for attr_name, attr_value in attrs:
129+
if attr_name == "href" and attr_value:
130+
self.current_href = attr_value
131+
break
132+
133+
def handle_data(self, data: str) -> None:
134+
if self.in_anchor:
135+
# Extract version from link text or href
136+
# Format: package-name-version-... or package-name-version.tar.gz
137+
version = self._extract_version_from_filename(data.strip())
138+
if version:
139+
self.versions.add(version)
140+
# Also check href if it contains version info
141+
if self.current_href:
142+
version = self._extract_version_from_filename(self.current_href)
143+
if version:
144+
self.versions.add(version)
145+
146+
def handle_endtag(self, tag: str) -> None:
147+
if tag == "a":
148+
self.in_anchor = False
149+
self.current_href = ""
150+
151+
def _extract_version_from_filename(self, filename: str) -> str | None:
152+
"""Extract version number from package filename."""
153+
# Pattern: package-name-version-... or package-name-version.tar.gz
154+
# Examples: data-0.1.0-py3-none-any.whl, data-0.1.0.tar.gz
155+
# The version is between the package name and the next separator
156+
157+
# Normalize package name (replace - with _ for matching)
158+
normalized_package = self.package_name.replace("-", "_").replace(".", "_")
159+
160+
# Try to match: package-name-version- or package-name-version.
161+
# Version format: X.Y.Z (semantic versioning)
162+
pattern = rf"{re.escape(self.package_name)}-(\d+\.\d+\.\d+(?:\.\d+)?(?:[a-zA-Z0-9]+)?)"
163+
match = re.search(pattern, filename, re.IGNORECASE)
164+
if match:
165+
return match.group(1)
166+
167+
# Fallback: try with normalized package name
168+
pattern = rf"{re.escape(normalized_package)}-(\d+\.\d+\.\d+(?:\.\d+)?(?:[a-zA-Z0-9]+)?)"
169+
match = re.search(pattern, filename, re.IGNORECASE)
170+
if match:
171+
return match.group(1)
172+
173+
return None
174+
175+
113176
def _query_azure_artifacts_version(
114177
package_name: str,
115178
repository_url: str,
116179
) -> str | None:
117180
"""
118181
Query Azure Artifacts for the latest version.
119182
120-
Azure Artifacts uses a simple index format (HTML) which is more complex to parse.
121-
For now, we'll attempt to query but fall back gracefully if it fails.
183+
Azure Artifacts uses a simple index format (HTML) following PEP 503.
184+
Parses the HTML to extract version numbers from package filenames.
122185
123186
Args:
124187
package_name: Package name to query
@@ -141,24 +204,49 @@ def _query_azure_artifacts_version(
141204
return None
142205

143206
try:
144-
response = requests.get(simple_index_url, timeout=5)
207+
response = requests.get(simple_index_url, timeout=10)
145208
logger.debug(f"Azure Artifacts response status: {response.status_code}")
146209

147210
if response.status_code == 401:
148211
logger.warning(f"Authentication required for Azure Artifacts (401). Package '{package_name}' may require authentication to query.")
212+
return None
149213
elif response.status_code == 403:
150214
logger.warning(f"Access forbidden for Azure Artifacts (403). Package '{package_name}' may not be accessible or requires different permissions.")
215+
return None
151216
elif response.status_code == 404:
152217
logger.debug(f"Package '{package_name}' not found on Azure Artifacts (404) - first release")
218+
return None
153219
elif response.status_code != 200:
154220
logger.warning(f"Unexpected status code {response.status_code} from Azure Artifacts for '{package_name}'")
221+
return None
155222

156-
# Azure Artifacts simple index returns HTML, not JSON
157-
# Parsing HTML is complex and may require authentication
158-
# For now, we'll return None to fall back to git tags
159-
# This can be enhanced later with proper HTML parsing or API endpoint discovery
160-
logger.info(f"Azure Artifacts version query not fully implemented (HTML parsing required). Falling back to git tags.")
161-
return None
223+
# Parse HTML to extract versions
224+
parser = SimpleIndexParser(package_name)
225+
try:
226+
parser.feed(response.text)
227+
except Exception as e:
228+
logger.warning(f"Error parsing Azure Artifacts HTML for '{package_name}': {e}")
229+
return None
230+
231+
if not parser.versions:
232+
logger.debug(f"No versions found in Azure Artifacts HTML for '{package_name}'")
233+
return None
234+
235+
# Find the latest version
236+
versions = list(parser.versions)
237+
logger.debug(f"Found {len(versions)} versions in Azure Artifacts: {versions}")
238+
239+
# Sort versions to find the latest
240+
try:
241+
sorted_versions = sorted(versions, key=_parse_version_for_sort, reverse=True)
242+
latest_version = sorted_versions[0]
243+
logger.info(f"Found latest version {latest_version} on Azure Artifacts for '{package_name}'")
244+
return latest_version
245+
except Exception as e:
246+
logger.warning(f"Error sorting versions for '{package_name}': {e}")
247+
# Fallback: return the first version found
248+
return versions[0]
249+
162250
except requests.RequestException as e:
163251
logger.warning(f"Network error querying Azure Artifacts for '{package_name}': {e}")
164252
return None

tests/test_version_calculator.py

Lines changed: 54 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -86,17 +86,69 @@ def test_query_pypi_version_fallback_to_releases(self, mock_get: MagicMock) -> N
8686

8787
@patch("python_package_folder.version_calculator.requests.get")
def test_query_azure_artifacts_version(self, mock_get: MagicMock) -> None:
    """Test querying Azure Artifacts with HTML parsing."""
    # PEP 503 simple index page listing two wheels and one sdist; the entries
    # are deliberately not in version order.
    index_html = """<!DOCTYPE html>
<html>
<head>
<title>Links for test-package</title>
</head>
<body>
<h1>Links for test-package</h1>
<a href="test-package-0.1.0-py3-none-any.whl">test-package-0.1.0-py3-none-any.whl</a>
<a href="test-package-0.2.0-py3-none-any.whl">test-package-0.2.0-py3-none-any.whl</a>
<a href="test-package-0.1.5.tar.gz">test-package-0.1.5.tar.gz</a>
</body>
</html>"""
    mock_get.return_value = Mock(status_code=200, text=index_html)
    feed_url = "https://pkgs.dev.azure.com/ORG/PROJECT/_packaging/FEED/pypi/upload"

    latest = query_registry_version("test-package", "azure", repository_url=feed_url)

    # The highest listed version is expected, regardless of listing order.
    assert latest == "0.2.0"
114+
115+
@patch("python_package_folder.version_calculator.requests.get")
def test_query_azure_artifacts_version_not_found(self, mock_get: MagicMock) -> None:
    """Test querying Azure Artifacts when package doesn't exist (404)."""
    # Only the status code is configured on the fake response.
    mock_get.return_value = Mock(status_code=404)
    feed_url = "https://pkgs.dev.azure.com/ORG/PROJECT/_packaging/FEED/pypi/upload"

    result = query_registry_version("test-package", "azure", repository_url=feed_url)

    # A 404 means nothing has been published yet, so no version is reported.
    assert result is None
129+
130+
@patch("python_package_folder.version_calculator.requests.get")
def test_query_azure_artifacts_version_empty_html(self, mock_get: MagicMock) -> None:
    """Test querying Azure Artifacts with empty HTML (no versions)."""
    # A well-formed index page that contains a heading but no file links.
    index_html = """<!DOCTYPE html>
<html>
<head>
<title>Links for test-package</title>
</head>
<body>
<h1>Links for test-package</h1>
</body>
</html>"""
    mock_get.return_value = Mock(status_code=200, text=index_html)
    feed_url = "https://pkgs.dev.azure.com/ORG/PROJECT/_packaging/FEED/pypi/upload"

    result = query_registry_version("test-package", "azure", repository_url=feed_url)

    # With no links to parse there is no version to report.
    assert result is None
101153

102154
@patch("python_package_folder.version_calculator.requests.get")

0 commit comments

Comments
 (0)