parseentities.py 1.7 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162
  1. #!/usr/bin/env python3
  2. """ Utility for parsing HTML entity definitions available from:
  3. http://www.w3.org/ as e.g.
  4. http://www.w3.org/TR/REC-html40/HTMLlat1.ent
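  5. Input is read from the file named by the first command-line argument (or
  6. stdin if omitted); output is written to the file named by the second argument
  7. (or stdout if omitted) as a Python snippet defining a dictionary "entitydefs"
  7. mapping each literal entity name to a character or numeric entity.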
  8. Marc-Andre Lemburg, mal@lemburg.com, 1999.
  9. Use as you like. NO WARRANTIES.
  10. """
  11. import re,sys
  12. entityRE = re.compile(r'<!ENTITY +(\w+) +CDATA +"([^"]+)" +-- +((?:.|\n)+?) *-->')
  13. def parse(text,pos=0,endpos=None):
  14. pos = 0
  15. if endpos is None:
  16. endpos = len(text)
  17. d = {}
  18. while 1:
  19. m = entityRE.search(text,pos,endpos)
  20. if not m:
  21. break
  22. name,charcode,comment = m.groups()
  23. d[name] = charcode,comment
  24. pos = m.end()
  25. return d
  26. def writefile(f,defs):
  27. f.write("entitydefs = {\n")
  28. items = sorted(defs.items())
  29. for name, (charcode,comment) in items:
  30. if charcode[:2] == '&#':
  31. code = int(charcode[2:-1])
  32. if code < 256:
  33. charcode = r"'\%o'" % code
  34. else:
  35. charcode = repr(charcode)
  36. else:
  37. charcode = repr(charcode)
  38. comment = ' '.join(comment.split())
  39. f.write(" '%s':\t%s, \t# %s\n" % (name,charcode,comment))
  40. f.write('\n}\n')
  41. if __name__ == '__main__':
  42. if len(sys.argv) > 1:
  43. infile = open(sys.argv[1])
  44. else:
  45. infile = sys.stdin
  46. if len(sys.argv) > 2:
  47. outfile = open(sys.argv[2],'w')
  48. else:
  49. outfile = sys.stdout
  50. text = infile.read()
  51. defs = parse(text)
  52. writefile(outfile,defs)