# makecompresseddata.py (6.6 KB)
from __future__ import print_function
import collections
import re
import sys
import gzip
import zlib

# Marker byte prefixed to compressed replacement strings in main().  All
# message bytes are verified to be < 0x80 (see check_non_ascii), so 0xFF can
# never occur in uncompressed text.
_COMPRESSED_MARKER = 0xFF
  8. def check_non_ascii(msg):
  9. for c in msg:
  10. if ord(c) >= 0x80:
  11. print(
  12. 'Unable to generate compressed data: message "{}" contains a non-ascii character "{}".'.format(
  13. msg, c
  14. ),
  15. file=sys.stderr,
  16. )
  17. sys.exit(1)
  18. # Replace <char><space> with <char | 0x80>.
  19. # Trival scheme to demo/test.
  20. def space_compression(error_strings):
  21. for line in error_strings:
  22. check_non_ascii(line)
  23. result = ""
  24. for i in range(len(line)):
  25. if i > 0 and line[i] == " ":
  26. result = result[:-1]
  27. result += "\\{:03o}".format(ord(line[i - 1]))
  28. else:
  29. result += line[i]
  30. error_strings[line] = result
  31. return None
  32. # Replace common words with <0x80 | index>.
  33. # Index is into a table of words stored as aaaaa<0x80|a>bbb<0x80|b>...
  34. # Replaced words are assumed to have spaces either side to avoid having to store the spaces in the compressed strings.
  35. def word_compression(error_strings):
  36. topn = collections.Counter()
  37. for line in error_strings.keys():
  38. check_non_ascii(line)
  39. for word in line.split(" "):
  40. topn[word] += 1
  41. # Order not just by frequency, but by expected saving. i.e. prefer a longer string that is used less frequently.
  42. # Use the word itself for ties so that compression is deterministic.
  43. def bytes_saved(item):
  44. w, n = item
  45. return -((len(w) + 1) * (n - 1)), w
  46. top128 = sorted(topn.items(), key=bytes_saved)[:128]
  47. index = [w for w, _ in top128]
  48. index_lookup = {w: i for i, w in enumerate(index)}
  49. for line in error_strings.keys():
  50. result = ""
  51. need_space = False
  52. for word in line.split(" "):
  53. if word in index_lookup:
  54. result += "\\{:03o}".format(0b10000000 | index_lookup[word])
  55. need_space = False
  56. else:
  57. if need_space:
  58. result += " "
  59. need_space = True
  60. result += word
  61. error_strings[line] = result.strip()
  62. return "".join(w[:-1] + "\\{:03o}".format(0b10000000 | ord(w[-1])) for w in index)
  63. # Replace chars in text with variable length bit sequence.
  64. # For comparison only (the table is not emitted).
  65. def huffman_compression(error_strings):
  66. # https://github.com/tannewt/huffman
  67. import huffman
  68. all_strings = "".join(error_strings)
  69. cb = huffman.codebook(collections.Counter(all_strings).items())
  70. for line in error_strings:
  71. b = "1"
  72. for c in line:
  73. b += cb[c]
  74. n = len(b)
  75. if n % 8 != 0:
  76. n += 8 - (n % 8)
  77. result = ""
  78. for i in range(0, n, 8):
  79. result += "\\{:03o}".format(int(b[i : i + 8], 2))
  80. if len(result) > len(line) * 4:
  81. result = line
  82. error_strings[line] = result
  83. # TODO: This would be the prefix lengths and the table ordering.
  84. return "_" * (10 + len(cb))
  85. # Replace common N-letter sequences with <0x80 | index>, where
  86. # the common sequences are stored in a separate table.
  87. # This isn't very useful, need a smarter way to find top-ngrams.
  88. def ngram_compression(error_strings):
  89. topn = collections.Counter()
  90. N = 2
  91. for line in error_strings.keys():
  92. check_non_ascii(line)
  93. if len(line) < N:
  94. continue
  95. for i in range(0, len(line) - N, N):
  96. topn[line[i : i + N]] += 1
  97. def bytes_saved(item):
  98. w, n = item
  99. return -(len(w) * (n - 1))
  100. top128 = sorted(topn.items(), key=bytes_saved)[:128]
  101. index = [w for w, _ in top128]
  102. index_lookup = {w: i for i, w in enumerate(index)}
  103. for line in error_strings.keys():
  104. result = ""
  105. for i in range(0, len(line) - N + 1, N):
  106. word = line[i : i + N]
  107. if word in index_lookup:
  108. result += "\\{:03o}".format(0b10000000 | index_lookup[word])
  109. else:
  110. result += word
  111. if len(line) % N != 0:
  112. result += line[len(line) - len(line) % N :]
  113. error_strings[line] = result.strip()
  114. return "".join(index)
  115. def main(collected_path, fn):
  116. error_strings = collections.OrderedDict()
  117. max_uncompressed_len = 0
  118. num_uses = 0
  119. # Read in all MP_ERROR_TEXT strings.
  120. with open(collected_path, "r") as f:
  121. for line in f:
  122. line = line.strip()
  123. if not line:
  124. continue
  125. num_uses += 1
  126. error_strings[line] = None
  127. max_uncompressed_len = max(max_uncompressed_len, len(line))
  128. # So that objexcept.c can figure out how big the buffer needs to be.
  129. print("#define MP_MAX_UNCOMPRESSED_TEXT_LEN ({})".format(max_uncompressed_len))
  130. # Run the compression.
  131. compressed_data = fn(error_strings)
  132. # Print the data table.
  133. print('MP_COMPRESSED_DATA("{}")'.format(compressed_data))
  134. # Print the replacements.
  135. for uncomp, comp in error_strings.items():
  136. if uncomp == comp:
  137. prefix = ""
  138. else:
  139. prefix = "\\{:03o}".format(_COMPRESSED_MARKER)
  140. print('MP_MATCH_COMPRESSED("{}", "{}{}")'.format(uncomp, prefix, comp))
  141. # Used to calculate the "true" length of the (escaped) compressed strings.
  142. def unescape(s):
  143. return re.sub(r"\\\d\d\d", "!", s)
  144. # Stats. Note this doesn't include the cost of the decompressor code.
  145. uncomp_len = sum(len(s) + 1 for s in error_strings.keys())
  146. comp_len = sum(1 + len(unescape(s)) + 1 for s in error_strings.values())
  147. data_len = len(compressed_data) + 1 if compressed_data else 0
  148. print("// Total input length: {}".format(uncomp_len))
  149. print("// Total compressed length: {}".format(comp_len))
  150. print("// Total data length: {}".format(data_len))
  151. print("// Predicted saving: {}".format(uncomp_len - comp_len - data_len))
  152. # Somewhat meaningless comparison to zlib/gzip.
  153. all_input_bytes = "\\0".join(error_strings.keys()).encode()
  154. print()
  155. if hasattr(gzip, "compress"):
  156. gzip_len = len(gzip.compress(all_input_bytes)) + num_uses * 4
  157. print("// gzip length: {}".format(gzip_len))
  158. print("// Percentage of gzip: {:.1f}%".format(100 * (comp_len + data_len) / gzip_len))
  159. if hasattr(zlib, "compress"):
  160. zlib_len = len(zlib.compress(all_input_bytes)) + num_uses * 4
  161. print("// zlib length: {}".format(zlib_len))
  162. print("// Percentage of zlib: {:.1f}%".format(100 * (comp_len + data_len) / zlib_len))
# Entry point: read the collected error-string file given on the command line
# and emit the word-compression macros to stdout.
if __name__ == "__main__":
    main(sys.argv[1], word_compression)