check_readme_links.py

#!/usr/bin/env python
#
# Checks that all links in the readme markdown files are valid
#
# SPDX-FileCopyrightText: 2020-2022 Espressif Systems (Shanghai) CO LTD
# SPDX-License-Identifier: Apache-2.0
#
import argparse
import concurrent.futures
import os
import os.path
import re
import sys
import urllib.error
import urllib.request
from collections import defaultdict, namedtuple
from pathlib import Path
from typing import List

# The Apple App Store links are not accessible from the company network for some reason
EXCLUDE_URL_LIST = ['https://apps.apple.com/in/app/esp-ble-provisioning/id1473590141', 'https://apps.apple.com/in/app/esp-softap-provisioning/id1474040630']
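
# A Link pairs the markdown file it was found in (a pathlib.Path) with the raw link target.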
Link = namedtuple('Link', ['file', 'url'])
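

# Exception hierarchy: ReadmeLinkError is the common base; RelativeLinkError and
# UrlLinkError specialize the message for broken relative links and dead URLs.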
class ReadmeLinkError(Exception):
    def __init__(self, file: str, url: str) -> None:
        self.file = file
        self.url = url


class RelativeLinkError(ReadmeLinkError):
    def __str__(self) -> str:
        return 'Relative link error, file - {} not found, linked from {}'.format(self.url, self.file)


class UrlLinkError(ReadmeLinkError):
    def __init__(self, file: str, url: str, error_code: str):
        self.error_code = error_code
        super().__init__(file, url)

    def __str__(self) -> str:
        files = [str(f) for f in self.file]
        return 'URL error, url - {} in files - {} is not accessible, request returned {}'.format(self.url, ', '.join(files), self.error_code)


# We do not want the test to fail just because of bad network conditions; for non-404 errors we simply print a warning.
def check_url(url: str, files: List, timeout: float) -> None:
    try:
        with urllib.request.urlopen(url, timeout=timeout):
            return
    except urllib.error.HTTPError as e:
        if e.code == 404:
            raise UrlLinkError(files, url, str(e))
        else:
            print('Unable to access {}, err = {}'.format(url, str(e)))
    except Exception as e:
        print('Unable to access {}, err = {}'.format(url, str(e)))
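

# Each unique URL is fetched only once, however many READMEs reference it;
# up to 10 URLs are checked concurrently, each with a 30-second timeout.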
def check_web_links(web_links: defaultdict) -> List:
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        errors = []
        future_to_url = {executor.submit(check_url, url, files, timeout=30): (url, files) for url, files in web_links.items()}
        for future in concurrent.futures.as_completed(future_to_url):
            try:
                future.result()
            except UrlLinkError as e:
                errors.append(e)
        return errors
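

# Relative links are resolved against the directory of the README that contains them.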
def check_file_links(file_links: List) -> List:
    errors = []
    for link in file_links:
        link_path = link.file.parent / link.url
        if not link_path.exists():
            errors.append(RelativeLinkError(link.file, link.url))

    print('Found {} errors with relative links'.format(len(errors)))
    return errors


def get_md_links(folder: str) -> List:
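    # Matches markdown inline links '[text](target#fragment)': group 1 captures the
    # link target, group 2 the optional '#fragment' (dropped when the link is stored).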
    MD_LINK_RE = r'\[.+?\]\((.+?)(#.+)?\)'

    idf_path_str = os.getenv('IDF_PATH')
    if idf_path_str is None:
        raise RuntimeError("Environment variable 'IDF_PATH' wasn't set.")
    idf_path = Path(idf_path_str)
    links = []

    for path in (idf_path / folder).rglob('*.md'):
        with path.open(encoding='utf8') as f:
            content = f.read()

        for url in re.findall(MD_LINK_RE, content):
            link = Link(path, url[0].lstrip())
            # Ignore "local" links
            if not link.url.startswith('#'):
                links.append(link)

    return links


def check_readme_links(args: argparse.Namespace) -> int:
    links = get_md_links('examples')
    print('Found {} links'.format(len(links)))

    errors = []

    web_links = defaultdict(list)
    file_links = []

    # Sort links into file and web links
    for link in links:
        if link.url.startswith('http'):
            web_links[link.url].append(link.file)
        else:
            file_links.append(link)

    for url in EXCLUDE_URL_LIST:
        # pop() rather than del, so an excluded URL that no README references cannot raise KeyError
        web_links.pop(url, None)

    errors.extend(check_file_links(file_links))

    if not args.skip_weburl:
        errors.extend(check_web_links(web_links))

    print('Found {} errors:'.format(len(errors)))
    for e in errors:
        print(e)

    return 1 if len(errors) > 0 else 0
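

# Example usage (IDF_PATH must be set; the script scans $IDF_PATH/examples):
#   python check_readme_links.py                # check relative links and web URLs
#   python check_readme_links.py --skip-weburl  # check only links to local files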
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='check_readme_links.py: Checks for dead links in example READMEs', prog='check_readme_links.py')
    parser.add_argument('--skip-weburl', '-w', action='store_true', help='Skip checking of web URLs, only check links to local files')
    args = parser.parse_args()

    sys.exit(check_readme_links(args))