#!/usr/bin/env python3
# markov.py
"""
Markov chain simulation of words or characters.
"""
  5. class Markov:
  6. def __init__(self, histsize, choice):
  7. self.histsize = histsize
  8. self.choice = choice
  9. self.trans = {}
  10. def add(self, state, next):
  11. self.trans.setdefault(state, []).append(next)
  12. def put(self, seq):
  13. n = self.histsize
  14. add = self.add
  15. add(None, seq[:0])
  16. for i in range(len(seq)):
  17. add(seq[max(0, i-n):i], seq[i:i+1])
  18. add(seq[len(seq)-n:], None)
  19. def get(self):
  20. choice = self.choice
  21. trans = self.trans
  22. n = self.histsize
  23. seq = choice(trans[None])
  24. while True:
  25. subseq = seq[max(0, len(seq)-n):]
  26. options = trans[subseq]
  27. next = choice(options)
  28. if not next:
  29. break
  30. seq += next
  31. return seq
  32. def test():
  33. import sys, random, getopt
  34. args = sys.argv[1:]
  35. try:
  36. opts, args = getopt.getopt(args, '0123456789cdwq')
  37. except getopt.error:
  38. print('Usage: %s [-#] [-cddqw] [file] ...' % sys.argv[0])
  39. print('Options:')
  40. print('-#: 1-digit history size (default 2)')
  41. print('-c: characters (default)')
  42. print('-w: words')
  43. print('-d: more debugging output')
  44. print('-q: no debugging output')
  45. print('Input files (default stdin) are split in paragraphs')
  46. print('separated blank lines and each paragraph is split')
  47. print('in words by whitespace, then reconcatenated with')
  48. print('exactly one space separating words.')
  49. print('Output consists of paragraphs separated by blank')
  50. print('lines, where lines are no longer than 72 characters.')
  51. sys.exit(2)
  52. histsize = 2
  53. do_words = False
  54. debug = 1
  55. for o, a in opts:
  56. if '-0' <= o <= '-9': histsize = int(o[1:])
  57. if o == '-c': do_words = False
  58. if o == '-d': debug += 1
  59. if o == '-q': debug = 0
  60. if o == '-w': do_words = True
  61. if not args:
  62. args = ['-']
  63. m = Markov(histsize, random.choice)
  64. try:
  65. for filename in args:
  66. if filename == '-':
  67. f = sys.stdin
  68. if f.isatty():
  69. print('Sorry, need stdin from file')
  70. continue
  71. else:
  72. f = open(filename, 'r')
  73. if debug: print('processing', filename, '...')
  74. text = f.read()
  75. f.close()
  76. paralist = text.split('\n\n')
  77. for para in paralist:
  78. if debug > 1: print('feeding ...')
  79. words = para.split()
  80. if words:
  81. if do_words:
  82. data = tuple(words)
  83. else:
  84. data = ' '.join(words)
  85. m.put(data)
  86. except KeyboardInterrupt:
  87. print('Interrupted -- continue with data read so far')
  88. if not m.trans:
  89. print('No valid input files')
  90. return
  91. if debug: print('done.')
  92. if debug > 1:
  93. for key in m.trans.keys():
  94. if key is None or len(key) < histsize:
  95. print(repr(key), m.trans[key])
  96. if histsize == 0: print(repr(''), m.trans[''])
  97. print()
  98. while True:
  99. data = m.get()
  100. if do_words:
  101. words = data
  102. else:
  103. words = data.split()
  104. n = 0
  105. limit = 72
  106. for w in words:
  107. if n + len(w) > limit:
  108. print()
  109. n = 0
  110. print(w, end=' ')
  111. n += len(w) + 1
  112. print()
  113. print()
  114. if __name__ == "__main__":
  115. test()