parse_html5_entities.py 3.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105
  1. #!/usr/bin/env python3
  2. """
  3. Utility for parsing HTML5 entity definitions available from:
  4. http://dev.w3.org/html5/spec/entities.json
  5. Written by Ezio Melotti and Iuliia Proskurnia.
  6. """
  7. import os
  8. import sys
  9. import json
  10. from urllib.request import urlopen
  11. from html.entities import html5
# Location of the HTML5 entity definitions (JSON); this is the URL handed
# to get_json() when the script is run as __main__.
entities_url = 'http://dev.w3.org/html5/spec/entities.json'
  13. def get_json(url):
  14. """Download the json file from the url and returns a decoded object."""
  15. with urlopen(url) as f:
  16. data = f.read().decode('utf-8')
  17. return json.loads(data)
  18. def create_dict(entities):
  19. """Create the html5 dict from the decoded json object."""
  20. new_html5 = {}
  21. for name, value in entities.items():
  22. new_html5[name.lstrip('&')] = value['characters']
  23. return new_html5
  24. def compare_dicts(old, new):
  25. """Compare the old and new dicts and print the differences."""
  26. added = new.keys() - old.keys()
  27. if added:
  28. print('{} entitie(s) have been added:'.format(len(added)))
  29. for name in sorted(added):
  30. print(' {!r}: {!r}'.format(name, new[name]))
  31. removed = old.keys() - new.keys()
  32. if removed:
  33. print('{} entitie(s) have been removed:'.format(len(removed)))
  34. for name in sorted(removed):
  35. print(' {!r}: {!r}'.format(name, old[name]))
  36. changed = set()
  37. for name in (old.keys() & new.keys()):
  38. if old[name] != new[name]:
  39. changed.add((name, old[name], new[name]))
  40. if changed:
  41. print('{} entitie(s) have been modified:'.format(len(changed)))
  42. for item in sorted(changed):
  43. print(' {!r}: {!r} -> {!r}'.format(*item))
  44. def write_items(entities, file=sys.stdout):
  45. """Write the items of the dictionary in the specified file."""
  46. # The keys in the generated dictionary should be sorted
  47. # in a case-insensitive way, however, when two keys are equal,
  48. # the uppercase version should come first so that the result
  49. # looks like: ['Aacute', 'aacute', 'Aacute;', 'aacute;', ...]
  50. # To do this we first sort in a case-sensitive way (so all the
  51. # uppercase chars come first) and then sort with key=str.lower.
  52. # Since the sorting is stable the uppercase keys will eventually
  53. # be before their equivalent lowercase version.
  54. keys = sorted(entities.keys())
  55. keys = sorted(keys, key=str.lower)
  56. print('html5 = {', file=file)
  57. for name in keys:
  58. print(' {!r}: {!a},'.format(name, entities[name]), file=file)
  59. print('}', file=file)
  60. if __name__ == '__main__':
  61. # without args print a diff between html.entities.html5 and new_html5
  62. # with --create print the new html5 dict
  63. # with --patch patch the Lib/html/entities.py file
  64. new_html5 = create_dict(get_json(entities_url))
  65. if '--create' in sys.argv:
  66. print('# map the HTML5 named character references to the '
  67. 'equivalent Unicode character(s)')
  68. print('# Generated by {}. Do not edit manually.'.format(__file__))
  69. write_items(new_html5)
  70. elif '--patch' in sys.argv:
  71. fname = 'Lib/html/entities.py'
  72. temp_fname = fname + '.temp'
  73. with open(fname) as f1, open(temp_fname, 'w') as f2:
  74. skip = False
  75. for line in f1:
  76. if line.startswith('html5 = {'):
  77. write_items(new_html5, file=f2)
  78. skip = True
  79. continue
  80. if skip:
  81. # skip the old items until the }
  82. if line.startswith('}'):
  83. skip = False
  84. continue
  85. f2.write(line)
  86. os.remove(fname)
  87. os.rename(temp_fname, fname)
  88. else:
  89. if html5 == new_html5:
  90. print('The current dictionary is updated.')
  91. else:
  92. compare_dicts(html5, new_html5)
  93. print('Run "./python {0} --patch" to update Lib/html/entities.html '
  94. 'or "./python {0} --create" to see the generated ' 'dictionary.'.format(__file__))