zishrink.awk 8.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356
# Convert tzdata source into a smaller version of itself.

# Contributed by Paul Eggert.  This file is in the public domain.

# This is not a general-purpose converter; it is designed for current tzdata.
# 'zic' should treat this script's output as if it were identical to
# this script's input.

  6. # Record a hash N for the new name NAME, checking for collisions.
  7. function record_hash(n, name)
  8. {
  9. if (used_hashes[n]) {
  10. printf "# ! collision: %s %s\n", used_hashes[n], name
  11. exit 1
  12. }
  13. used_hashes[n] = name
  14. }
  15. # Return a shortened rule name representing NAME,
  16. # and record this relationship to the hash table.
  17. function gen_rule_name(name, \
  18. n)
  19. {
  20. # Use a simple mnemonic: the first two letters.
  21. n = substr(name, 1, 2)
  22. record_hash(n, name)
  23. # printf "# %s = %s\n", n, name
  24. return n
  25. }
  26. function prehash_rule_names( \
  27. name)
  28. {
  29. # Rule names are not part of the tzdb API, so substitute shorter
  30. # ones. Shortening them consistently from one release to the next
  31. # simplifies comparison of the output. That being said, the
  32. # 1-letter names below are not standardized in any way, and can
  33. # change arbitrarily from one release to the next, as the main goal
  34. # here is compression not comparison.
  35. # Abbreviating these rules names to one letter saved the most space
  36. # circa 2018e.
  37. rule["Arg"] = "A"
  38. rule["Brazil"] = "B"
  39. rule["Canada"] = "C"
  40. rule["Denmark"] = "D"
  41. rule["EU"] = "E"
  42. rule["France"] = "F"
  43. rule["GB-Eire"] = "G"
  44. rule["Halifax"] = "H"
  45. rule["Italy"] = "I"
  46. rule["Jordan"] = "J"
  47. rule["Egypt"] = "K" # "Kemet" in ancient Egyptian
  48. rule["Libya"] = "L"
  49. rule["Morocco"] = "M"
  50. rule["Neth"] = "N"
  51. rule["Poland"] = "O" # arbitrary
  52. rule["Palestine"] = "P"
  53. rule["Cuba"] = "Q" # Its start sounds like "Q".
  54. rule["Russia"] = "R"
  55. rule["Syria"] = "S"
  56. rule["Turkey"] = "T"
  57. rule["Uruguay"] = "U"
  58. rule["Vincennes"] = "V"
  59. rule["Winn"] = "W"
  60. rule["Mongol"] = "X" # arbitrary
  61. rule["NT_YK"] = "Y"
  62. rule["Zion"] = "Z"
  63. rule["Austria"] = "a"
  64. rule["Belgium"] = "b"
  65. rule["C-Eur"] = "c"
  66. rule["Algeria"] = "d" # country code DZ
  67. rule["E-Eur"] = "e"
  68. rule["Taiwan"] = "f" # Formosa
  69. rule["Greece"] = "g"
  70. rule["Hungary"] = "h"
  71. rule["Iran"] = "i"
  72. rule["StJohns"] = "j"
  73. rule["Chatham"] = "k" # arbitrary
  74. rule["Lebanon"] = "l"
  75. rule["Mexico"] = "m"
  76. rule["Tunisia"] = "n" # country code TN
  77. rule["Moncton"] = "o" # arbitrary
  78. rule["Port"] = "p"
  79. rule["Albania"] = "q" # arbitrary
  80. rule["Regina"] = "r"
  81. rule["Spain"] = "s"
  82. rule["Toronto"] = "t"
  83. rule["US"] = "u"
  84. rule["Louisville"] = "v" # ville
  85. rule["Iceland"] = "w" # arbitrary
  86. rule["Chile"] = "x" # arbitrary
  87. rule["Para"] = "y" # country code PY
  88. rule["Romania"] = "z" # arbitrary
  89. rule["Macau"] = "_" # arbitrary
  90. # Use ISO 3166 alpha-2 country codes for remaining names that are countries.
  91. # This is more systematic, and avoids collisions (e.g., Malta and Moldova).
  92. rule["Armenia"] = "AM"
  93. rule["Aus"] = "AU"
  94. rule["Azer"] = "AZ"
  95. rule["Barb"] = "BB"
  96. rule["Dhaka"] = "BD"
  97. rule["Bulg"] = "BG"
  98. rule["Bahamas"] = "BS"
  99. rule["Belize"] = "BZ"
  100. rule["Swiss"] = "CH"
  101. rule["Cook"] = "CK"
  102. rule["PRC"] = "CN"
  103. rule["Cyprus"] = "CY"
  104. rule["Czech"] = "CZ"
  105. rule["Germany"] = "DE"
  106. rule["DR"] = "DO"
  107. rule["Ecuador"] = "EC"
  108. rule["Finland"] = "FI"
  109. rule["Fiji"] = "FJ"
  110. rule["Falk"] = "FK"
  111. rule["Ghana"] = "GH"
  112. rule["Guat"] = "GT"
  113. rule["Hond"] = "HN"
  114. rule["Haiti"] = "HT"
  115. rule["Eire"] = "IE"
  116. rule["Iraq"] = "IQ"
  117. rule["Japan"] = "JP"
  118. rule["Kyrgyz"] = "KG"
  119. rule["ROK"] = "KR"
  120. rule["Latvia"] = "LV"
  121. rule["Lux"] = "LX"
  122. rule["Moldova"] = "MD"
  123. rule["Malta"] = "MT"
  124. rule["Mauritius"] = "MU"
  125. rule["Namibia"] = "NA"
  126. rule["Nic"] = "NI"
  127. rule["Norway"] = "NO"
  128. rule["Peru"] = "PE"
  129. rule["Phil"] = "PH"
  130. rule["Pakistan"] = "PK"
  131. rule["Sudan"] = "SD"
  132. rule["Salv"] = "SV"
  133. rule["Tonga"] = "TO"
  134. rule["Vanuatu"] = "VU"
  135. # Avoid collisions.
  136. rule["Detroit"] = "Dt" # De = Denver
  137. for (name in rule) {
  138. record_hash(rule[name], name)
  139. }
  140. }
  141. function make_line(n, field, \
  142. f, r)
  143. {
  144. r = field[1]
  145. for (f = 2; f <= n; f++)
  146. r = r " " field[f]
  147. return r
  148. }
  149. # Process the input line LINE and save it for later output.
  150. function process_input_line(line, \
  151. f, field, end, i, n, r, startdef, \
  152. linkline, ruleline, zoneline)
  153. {
  154. # Remove comments, normalize spaces, and append a space to each line.
  155. sub(/#.*/, "", line)
  156. line = line " "
  157. gsub(/[\t ]+/, " ", line)
  158. # Abbreviate keywords and determine line type.
  159. linkline = sub(/^Link /, "L ", line)
  160. ruleline = sub(/^Rule /, "R ", line)
  161. zoneline = sub(/^Zone /, "Z ", line)
  162. # Replace FooAsia rules with the same rules without "Asia", as they
  163. # are duplicates.
  164. if (match(line, /[^ ]Asia /)) {
  165. if (ruleline) return
  166. line = substr(line, 1, RSTART) substr(line, RSTART + 5)
  167. }
  168. # Abbreviate times.
  169. while (match(line, /[: ]0+[0-9]/))
  170. line = substr(line, 1, RSTART) substr(line, RSTART + RLENGTH - 1)
  171. while (match(line, /:0[^:]/))
  172. line = substr(line, 1, RSTART - 1) substr(line, RSTART + 2)
  173. # Abbreviate weekday names.
  174. while (match(line, / (last)?(Mon|Wed|Fri)[ <>]/)) {
  175. end = RSTART + RLENGTH
  176. line = substr(line, 1, end - 4) substr(line, end - 1)
  177. }
  178. while (match(line, / (last)?(Sun|Tue|Thu|Sat)[ <>]/)) {
  179. end = RSTART + RLENGTH
  180. line = substr(line, 1, end - 3) substr(line, end - 1)
  181. }
  182. # Abbreviate "max", "min", "only" and month names.
  183. gsub(/ max /, " ma ", line)
  184. gsub(/ min /, " mi ", line)
  185. gsub(/ only /, " o ", line)
  186. gsub(/ Jan /, " Ja ", line)
  187. gsub(/ Feb /, " F ", line)
  188. gsub(/ Apr /, " Ap ", line)
  189. gsub(/ Aug /, " Au ", line)
  190. gsub(/ Sep /, " S ", line)
  191. gsub(/ Oct /, " O ", line)
  192. gsub(/ Nov /, " N ", line)
  193. gsub(/ Dec /, " D ", line)
  194. # Strip leading and trailing space.
  195. sub(/^ /, "", line)
  196. sub(/ $/, "", line)
  197. # Remove unnecessary trailing zero fields.
  198. sub(/ 0+$/, "", line)
  199. # Remove unnecessary trailing days-of-month "1".
  200. if (match(line, /[A-Za-z] 1$/))
  201. line = substr(line, 1, RSTART)
  202. # Remove unnecessary trailing " Ja" (for January).
  203. sub(/ Ja$/, "", line)
  204. n = split(line, field)
  205. # Record which rule names are used, and generate their abbreviations.
  206. f = zoneline ? 4 : linkline || ruleline ? 0 : 2
  207. r = field[f]
  208. if (r ~ /^[^-+0-9]/) {
  209. rule_used[r] = 1
  210. }
  211. # If this zone supersedes an earlier one, delete the earlier one
  212. # from the saved output lines.
  213. startdef = ""
  214. if (zoneline)
  215. zonename = startdef = field[2]
  216. else if (linkline)
  217. zonename = startdef = field[3]
  218. else if (ruleline)
  219. zonename = ""
  220. if (startdef) {
  221. i = zonedef[startdef]
  222. if (i) {
  223. do
  224. output_line[i - 1] = ""
  225. while (output_line[i++] ~ /^[-+0-9]/);
  226. }
  227. }
  228. zonedef[zonename] = nout + 1
  229. # Save the line for later output.
  230. output_line[nout++] = make_line(n, field)
  231. }
  232. function omit_unused_rules( \
  233. i, field)
  234. {
  235. for (i = 0; i < nout; i++) {
  236. split(output_line[i], field)
  237. if (field[1] == "R" && !rule_used[field[2]]) {
  238. output_line[i] = ""
  239. }
  240. }
  241. }
  242. function abbreviate_rule_names( \
  243. abbr, f, field, i, n, r)
  244. {
  245. for (i = 0; i < nout; i++) {
  246. n = split(output_line[i], field)
  247. if (n) {
  248. f = field[1] == "Z" ? 4 : field[1] == "L" ? 0 : 2
  249. r = field[f]
  250. if (r ~ /^[^-+0-9]/) {
  251. abbr = rule[r]
  252. if (!abbr) {
  253. rule[r] = abbr = gen_rule_name(r)
  254. }
  255. field[f] = abbr
  256. output_line[i] = make_line(n, field)
  257. }
  258. }
  259. }
  260. }
  261. function output_saved_lines( \
  262. i)
  263. {
  264. for (i = 0; i < nout; i++)
  265. if (output_line[i])
  266. print output_line[i]
  267. }
  268. BEGIN {
  269. # Files that the output normally depends on.
  270. default_dep["africa"] = 1
  271. default_dep["antarctica"] = 1
  272. default_dep["asia"] = 1
  273. default_dep["australasia"] = 1
  274. default_dep["backward"] = 1
  275. default_dep["etcetera"] = 1
  276. default_dep["europe"] = 1
  277. default_dep["factory"] = 1
  278. default_dep["northamerica"] = 1
  279. default_dep["southamerica"] = 1
  280. default_dep["ziguard.awk"] = 1
  281. default_dep["zishrink.awk"] = 1
  282. # Output a version string from 'version' and related configuration variables
  283. # supported by tzdb's Makefile. If you change the makefile or any other files
  284. # that affect the output of this script, you should append '-SOMETHING'
  285. # to the contents of 'version', where SOMETHING identifies what was changed.
  286. ndeps = split(deps, dep)
  287. ddeps = ""
  288. for (i = 1; i <= ndeps; i++) {
  289. if (default_dep[dep[i]]) {
  290. default_dep[dep[i]]++
  291. } else {
  292. ddeps = ddeps " " dep[i]
  293. }
  294. }
  295. for (d in default_dep) {
  296. if (default_dep[d] == 1) {
  297. ddeps = ddeps " !" d
  298. }
  299. }
  300. print "# version", version
  301. if (dataform != "main") {
  302. print "# dataform", dataform
  303. }
  304. if (redo != "posix_right") {
  305. print "# redo " redo
  306. }
  307. if (ddeps) {
  308. print "# ddeps" ddeps
  309. }
  310. print "# This zic input file is in the public domain."
  311. prehash_rule_names()
  312. }
  313. /^[\t ]*[^#\t ]/ {
  314. process_input_line($0)
  315. }
  316. END {
  317. omit_unused_rules()
  318. abbreviate_rule_names()
  319. output_saved_lines()
  320. }