1313import logging
1414import re
1515import subprocess
16+ from html .parser import HTMLParser
1617from pathlib import Path
1718
1819import requests
@@ -110,15 +111,77 @@ def _query_pypi_version(package_name: str, registry: str) -> str | None:
110111 return None
111112
112113
114+ class SimpleIndexParser (HTMLParser ):
115+ """Parser for PEP 503 simple index HTML to extract package versions."""
116+
117+ def __init__ (self , package_name : str ):
118+ super ().__init__ ()
119+ self .package_name = package_name
120+ self .versions : set [str ] = set ()
121+ self .in_anchor = False
122+ self .current_href = ""
123+
124+ def handle_starttag (self , tag : str , attrs : list [tuple [str , str | None ]]) -> None :
125+ if tag == "a" :
126+ self .in_anchor = True
127+ # Extract href attribute
128+ for attr_name , attr_value in attrs :
129+ if attr_name == "href" and attr_value :
130+ self .current_href = attr_value
131+ break
132+
133+ def handle_data (self , data : str ) -> None :
134+ if self .in_anchor :
135+ # Extract version from link text or href
136+ # Format: package-name-version-... or package-name-version.tar.gz
137+ version = self ._extract_version_from_filename (data .strip ())
138+ if version :
139+ self .versions .add (version )
140+ # Also check href if it contains version info
141+ if self .current_href :
142+ version = self ._extract_version_from_filename (self .current_href )
143+ if version :
144+ self .versions .add (version )
145+
146+ def handle_endtag (self , tag : str ) -> None :
147+ if tag == "a" :
148+ self .in_anchor = False
149+ self .current_href = ""
150+
151+ def _extract_version_from_filename (self , filename : str ) -> str | None :
152+ """Extract version number from package filename."""
153+ # Pattern: package-name-version-... or package-name-version.tar.gz
154+ # Examples: data-0.1.0-py3-none-any.whl, data-0.1.0.tar.gz
155+ # The version is between the package name and the next separator
156+
157+ # Normalize package name (replace - with _ for matching)
158+ normalized_package = self .package_name .replace ("-" , "_" ).replace ("." , "_" )
159+
160+ # Try to match: package-name-version- or package-name-version.
161+ # Version format: X.Y.Z (semantic versioning)
162+ pattern = rf"{ re .escape (self .package_name )} -(\d+\.\d+\.\d+(?:\.\d+)?(?:[a-zA-Z0-9]+)?)"
163+ match = re .search (pattern , filename , re .IGNORECASE )
164+ if match :
165+ return match .group (1 )
166+
167+ # Fallback: try with normalized package name
168+ pattern = rf"{ re .escape (normalized_package )} -(\d+\.\d+\.\d+(?:\.\d+)?(?:[a-zA-Z0-9]+)?)"
169+ match = re .search (pattern , filename , re .IGNORECASE )
170+ if match :
171+ return match .group (1 )
172+
173+ return None
174+
175+
113176def _query_azure_artifacts_version (
114177 package_name : str ,
115178 repository_url : str ,
116179) -> str | None :
117180 """
118181 Query Azure Artifacts for the latest version.
119182
120- Azure Artifacts uses a simple index format (HTML) which is more complex to parse .
121- For now, we'll attempt to query but fall back gracefully if it fails .
183+ Azure Artifacts uses a simple index format (HTML) following PEP 503 .
184+ Parses the HTML to extract version numbers from package filenames .
122185
123186 Args:
124187 package_name: Package name to query
@@ -141,24 +204,49 @@ def _query_azure_artifacts_version(
141204 return None
142205
143206 try :
144- response = requests .get (simple_index_url , timeout = 5 )
207+ response = requests .get (simple_index_url , timeout = 10 )
145208 logger .debug (f"Azure Artifacts response status: { response .status_code } " )
146209
147210 if response .status_code == 401 :
148211 logger .warning (f"Authentication required for Azure Artifacts (401). Package '{ package_name } ' may require authentication to query." )
212+ return None
149213 elif response .status_code == 403 :
150214 logger .warning (f"Access forbidden for Azure Artifacts (403). Package '{ package_name } ' may not be accessible or requires different permissions." )
215+ return None
151216 elif response .status_code == 404 :
152217 logger .debug (f"Package '{ package_name } ' not found on Azure Artifacts (404) - first release" )
218+ return None
153219 elif response .status_code != 200 :
154220 logger .warning (f"Unexpected status code { response .status_code } from Azure Artifacts for '{ package_name } '" )
221+ return None
155222
156- # Azure Artifacts simple index returns HTML, not JSON
157- # Parsing HTML is complex and may require authentication
158- # For now, we'll return None to fall back to git tags
159- # This can be enhanced later with proper HTML parsing or API endpoint discovery
160- logger .info (f"Azure Artifacts version query not fully implemented (HTML parsing required). Falling back to git tags." )
161- return None
223+ # Parse HTML to extract versions
224+ parser = SimpleIndexParser (package_name )
225+ try :
226+ parser .feed (response .text )
227+ except Exception as e :
228+ logger .warning (f"Error parsing Azure Artifacts HTML for '{ package_name } ': { e } " )
229+ return None
230+
231+ if not parser .versions :
232+ logger .debug (f"No versions found in Azure Artifacts HTML for '{ package_name } '" )
233+ return None
234+
235+ # Find the latest version
236+ versions = list (parser .versions )
237+ logger .debug (f"Found { len (versions )} versions in Azure Artifacts: { versions } " )
238+
239+ # Sort versions to find the latest
240+ try :
241+ sorted_versions = sorted (versions , key = _parse_version_for_sort , reverse = True )
242+ latest_version = sorted_versions [0 ]
243+ logger .info (f"Found latest version { latest_version } on Azure Artifacts for '{ package_name } '" )
244+ return latest_version
245+ except Exception as e :
246+ logger .warning (f"Error sorting versions for '{ package_name } ': { e } " )
247+ # Fallback: return the first version found
248+ return versions [0 ]
249+
162250 except requests .RequestException as e :
163251 logger .warning (f"Network error querying Azure Artifacts for '{ package_name } ': { e } " )
164252 return None
0 commit comments