fix: extract and validate links within HTML blocks in Markdown files (#3688)

* fix: extract and validate links within HTML blocks in Markdown files

---------

Co-authored-by: Samuele Perticarari <sperticarari@google.com>
2026-01-30 10:42:08 +01:00
parent dc2ccfe518
commit f9b94955a9
1 changed file with 10 additions and 4 deletions
--- a/tools/check_links.py
+++ b/tools/check_links.py
@@ -23,6 +23,7 @@ import collections
 import pathlib
 import requests
 import urllib.parse
+import re

 import click
 import marko
@@ -35,21 +36,21 @@ LINK = collections.namedtuple('LINK', 'dest valid')
 def check_link(link, readme_path, external):
  'Checks if a link element has a valid destination.'
  link_valid = None
-  url = urllib.parse.urlparse(link.dest)
+  url = urllib.parse.urlparse(link)
  # If the link is public, say the link is anyway valid
  # if --external is not set; check the link otherwise
  if url.scheme:
    link_valid = True
    if external:
      try:
-        response = requests.get(link.dest)
+        response = requests.get(link)
        link_valid = response.ok
      except requests.exceptions.RequestException:
        link_valid = False
  # The link is private
  else:
    link_valid = (readme_path.parent / url.path).exists()
-  return LINK(link.dest, link_valid)
+  return LINK(link, link_valid)


 def check_docs(dir_name, external=False):
@@ -67,7 +68,12 @@ def check_docs(dir_name, external=False):
    while elements:
      el = elements.popleft()
      if isinstance(el, marko.inline.Link):
-        links.append(check_link(el, readme_path, external))
+        links.append(check_link(el.dest, readme_path, external))
+      elif isinstance(el, marko.block.HTMLBlock):
+        pattern = r'(?:href|src)=([\'"])(.*?)\1'
+        extracted_links = [match[1] for match in re.findall(pattern, el.body)]
+        for link in extracted_links:
+          links.append(check_link(link, readme_path, external))
      elif hasattr(el, 'children'):
        elements.extend(el.children)