瀏覽代碼

Shrink tzdata.zi somewhat

Without this change, tzdata source's 669742 bytes (180832 bytes
compressed) shrank to tzdata.zi's 123627 bytes (22247 bytes compressed).
With this change, tzdata.zi is 106273 bytes (21203 bytes compressed).
That is, the change's data compression ratio is about 1.16 (1.05 for
compressed data), and the total data compression ratio of tzdata.zi is
now about 6.3 (8.5 for compressed data).  These figures assume
lzip -9 compression.
* Makefile (tzdata.zi): Do not set the Awk PACKRATDATA var,
as zishrink.awk now handles duplicates directly.
* Makefile (zonenames, $(TZS_NEW)):
* checklinks.awk:
Work even when line codes are abbreviated.
* zishrink.awk (paw_through_packratdata): Remove; no longer needed.
Caller removed.
(gen_rule_name, output_saved_lines): New functions.
(process_input_line): Use gen_rule_name to abbreviate rule names.
Abbreviate line codes and "max" too.  Save output lines
instead of printing them immediately, so that later output
lines can supersede earlier ones.
(END): Output saved lines.
Paul Eggert 8 年之前
父節點
當前提交
ebaf3a0f8f
共有 3 個文件被更改,包括 85 次插入34 次删除
  1. 4 6
      Makefile
  2. 2 2
      checklinks.awk
  3. 79 26
      zishrink.awk

+ 4 - 6
Makefile

@@ -478,9 +478,7 @@ version:	$(VERSION_DEPS)
 
 
 # This file can be tailored by setting BACKWARD, PACKRATDATA, etc.
 # This file can be tailored by setting BACKWARD, PACKRATDATA, etc.
 tzdata.zi:	$(TZDATA_ZI_DEPS)
 tzdata.zi:	$(TZDATA_ZI_DEPS)
-		LC_ALL=C $(AWK) -v PACKRATDATA='$(PACKRATDATA)' \
-		  -f zishrink.awk \
-		  $(TDATA) $(PACKRATDATA) >$@.out
+		LC_ALL=C $(AWK) -f zishrink.awk $(TDATA) $(PACKRATDATA) >$@.out
 		mv $@.out $@
 		mv $@.out $@
 
 
 version.h:	version
 version.h:	version
@@ -558,11 +556,11 @@ zones:		$(REDO)
 $(TZS_NEW):	tzdata.zi zdump zic
 $(TZS_NEW):	tzdata.zi zdump zic
 		mkdir -p tzs.dir
 		mkdir -p tzs.dir
 		$(zic) -d tzs.dir tzdata.zi
 		$(zic) -d tzs.dir tzdata.zi
-		$(AWK) '/^Link/{print $$1 "\t" $$2 "\t" $$3}' \
+		$(AWK) '/^L/{print "Link\t" $$2 "\t" $$3}' \
 		   tzdata.zi | LC_ALL=C sort >$@.out
 		   tzdata.zi | LC_ALL=C sort >$@.out
 		wd=`pwd` && \
 		wd=`pwd` && \
 		zones=`$(AWK) -v wd="$$wd" \
 		zones=`$(AWK) -v wd="$$wd" \
-				'/^Zone/{print wd "/tzs.dir/" $$2}' tzdata.zi \
+				'/^Z/{print wd "/tzs.dir/" $$2}' tzdata.zi \
 			 | LC_ALL=C sort` && \
 			 | LC_ALL=C sort` && \
 		./zdump -i -c $(TZS_YEAR) $$zones >>$@.out
 		./zdump -i -c $(TZS_YEAR) $$zones >>$@.out
 		sed 's,^TZ=".*tzs\.dir/,TZ=",' $@.out >$@.sed.out
 		sed 's,^TZ=".*tzs\.dir/,TZ=",' $@.out >$@.sed.out
@@ -826,7 +824,7 @@ typecheck:
 		done
 		done
 
 
 zonenames:	tzdata.zi
 zonenames:	tzdata.zi
-		@$(AWK) '/^Zone/ { print $$2 } /^Link/ { print $$3 }' tzdata.zi
+		@$(AWK) '/^Z/ { print $$2 } /^L/ { print $$3 }' tzdata.zi
 
 
 asctime.o:	private.h tzfile.h
 asctime.o:	private.h tzfile.h
 date.o:		private.h
 date.o:		private.h

+ 2 - 2
checklinks.awk

@@ -9,7 +9,7 @@ BEGIN {
     Zone = "\n"
     Zone = "\n"
 }
 }
 
 
-/^Zone/ {
+/^Z/ {
     if (defined[$2]) {
     if (defined[$2]) {
 	if (defined[$2] == Zone) {
 	if (defined[$2] == Zone) {
 	    printf "%s: Zone has duplicate definition\n", $2
 	    printf "%s: Zone has duplicate definition\n", $2
@@ -21,7 +21,7 @@ BEGIN {
     defined[$2] = Zone
     defined[$2] = Zone
 }
 }
 
 
-/^Link/ {
+/^L/ {
     if (defined[$3]) {
     if (defined[$3]) {
 	if (defined[$3] == Zone) {
 	if (defined[$3] == Zone) {
 	    printf "%s: Link with same name as Zone\n", $3
 	    printf "%s: Link with same name as Zone\n", $3

+ 79 - 26
zishrink.awk

@@ -6,33 +6,52 @@
 # 'zic' should treat this script's output as if it were identical to
 # 'zic' should treat this script's output as if it were identical to
 # this script's input.
 # this script's input.
 
 
-function paw_through_packratdata(line)
+
+# Return a new rule name.
+# N_RULE_NAMES keeps track of how many rule names have been generated.
+
+function gen_rule_name(alphabet, base, rule_name, n, digit)
 {
 {
-  if (PACKRATDATA) {
-    while (0 < (getline line <PACKRATDATA)) {
-      if (split(line, field)) {
-	if (field[1] == "Zone") packrat_zone[field[2]] = 1
-	if (field[1] == "Link") packrat_zone[field[3]] = 1
-      }
-    }
-    close(PACKRATDATA)
-  }
+  alphabet = ""
+  alphabet = alphabet "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+  alphabet = alphabet "abcdefghijklmnopqrstuvwxyz"
+  alphabet = alphabet "!$%&'()*+,./:;<=>?@[\\]^_`{|}~"
+  base = length(alphabet)
+  rule_name = ""
+  n = n_rule_names++
+
+  do {
+    n -= rule_name && n <= base
+    digit = n % base
+    rule_name = substr(alphabet, digit + 1, 1) rule_name
+    n = (n - digit) / base
+  } while (n);
+
+  return rule_name
 }
 }
 
 
-function process_input_line(line, field, end)
+# Process an input line and save it for later output.
+
+function process_input_line(line, field, end, i, n, startdef)
 {
 {
   # Remove comments, normalize spaces, and append a space to each line.
   # Remove comments, normalize spaces, and append a space to each line.
   sub(/#.*/, "", line)
   sub(/#.*/, "", line)
   line = line " "
   line = line " "
   gsub(/[[:space:]]+/, " ", line)
   gsub(/[[:space:]]+/, " ", line)
 
 
+  # Abbreviate keywords.  Do not abbreviate "Link" to just "L",
+  # as pre-2017c zic erroneously diagnoses "Li" as ambiguous.
+  sub(/^Link /, "Li ", line)
+  sub(/^Rule /, "R ", line)
+  sub(/^Zone /, "Z ", line)
+
   # SystemV rules are not needed.
   # SystemV rules are not needed.
-  if (line ~ /^Rule SystemV /) next
+  if (line ~ /^R SystemV /) next
 
 
   # Replace FooAsia rules with the same rules without "Asia", as they
   # Replace FooAsia rules with the same rules without "Asia", as they
   # are duplicates.
   # are duplicates.
   if (match(line, /[^ ]Asia /)) {
   if (match(line, /[^ ]Asia /)) {
-    if (line ~ /^Rule /) next
+    if (line ~ /^R /) next
     line = substr(line, 1, RSTART) substr(line, RSTART + 5)
     line = substr(line, 1, RSTART) substr(line, RSTART + 5)
   }
   }
 
 
@@ -53,7 +72,10 @@ function process_input_line(line, field, end)
     line = substr(line, 1, end - 3) substr(line, end - 1)
     line = substr(line, 1, end - 3) substr(line, end - 1)
   }
   }
 
 
-  # Abbreviate "only" and month names.
+  # Abbreviate "max", "only" and month names.
+  # Do not abbreviate "min", as pre-2017c zic erroneously diagnoses "mi"
+  # as ambiguous.
+  gsub(/ max /, " ma ", line)
   gsub(/ only /, " o ", line)
   gsub(/ only /, " o ", line)
   gsub(/ Jan /, " Ja ", line)
   gsub(/ Jan /, " Ja ", line)
   gsub(/ Feb /, " F ", line)
   gsub(/ Feb /, " F ", line)
@@ -78,26 +100,57 @@ function process_input_line(line, field, end)
   # Remove unnecessary trailing " Ja" (for January).
   # Remove unnecessary trailing " Ja" (for January).
   sub(/ Ja$/, "", line)
   sub(/ Ja$/, "", line)
 
 
-  # Output lines unless they are later overridden in PACKRATDATA.
-  if (line ~ /^[LRZ]/) {
-    overridden = 0
-    if (FILENAME != PACKRATDATA) {
-      split(line, field)
-      if (field[1] == "Zone")
-	overridden = packrat_zone[field[2]]
-      else if (field[1] == "Link" && packrat_zone[field[3]])
-	next
+  n = split(line, field)
+
+  # Abbreviate rule names.
+  i = field[1] == "Z" ? 4 : field[1] == "Li" ? 0 : 2
+  if (i && field[i] ~ /^[^-+0-9]/) {
+    if (!rule[field[i]])
+      rule[field[i]] = gen_rule_name()
+    field[i] = rule[field[i]]
+  }
+
+  # If this zone supersedes an earlier one, delete the earlier one
+  # from the saved output lines.
+  startdef = ""
+  if (field[1] == "Z")
+    zonename = startdef = field[2]
+  else if (field[1] == "Li")
+    zonename = startdef = field[3]
+  else if (field[1] == "R")
+    zonename = ""
+  if (startdef) {
+    i = zonedef[startdef]
+    if (i) {
+      do
+	output_line[i - 1] = ""
+      while (output_line[i++] ~ /^[-+0-9]/);
     }
     }
   }
   }
-  if (!overridden)
-    print line
+  zonedef[zonename] = nout + 1
+
+  # Save the line for later output.
+  line = field[1]
+  for (i = 2; i <= n; i++)
+    line = line " " field[i]
+  output_line[nout++] = line
+}
+
+function output_saved_lines(i)
+{
+  for (i = 0; i < nout; i++)
+    if (output_line[i])
+      print output_line[i]
 }
 }
 
 
 BEGIN {
 BEGIN {
   print "# This zic input file is in the public domain."
   print "# This zic input file is in the public domain."
-  paw_through_packratdata()
 }
 }
 
 
 /^[[:space:]]*[^#[:space:]]/ {
 /^[[:space:]]*[^#[:space:]]/ {
   process_input_line($0)
   process_input_line($0)
 }
 }
+
+END {
+  output_saved_lines()
+}