
  1. """ robotparser.py
  2. Copyright (C) 2000 Bastian Kleineidam
  3. You can choose between two licenses when using this package:
  4. 1) GNU GPLv2
  5. 2) PSF license for Python 2.2
  6. The robots.txt Exclusion Protocol is implemented as specified in
  7. http://www.robotstxt.org/norobots-rfc.txt
  8. """
  9. import collections
  10. import urllib.parse
  11. import urllib.request
  12. __all__ = ["RobotFileParser"]
  13. RequestRate = collections.namedtuple("RequestRate", "requests seconds")


class RobotFileParser:
    """This class provides a set of methods to read, parse and answer
    questions about a single robots.txt file.
    """

    def __init__(self, url=''):
        self.entries = []
        self.default_entry = None
        self.disallow_all = False
        self.allow_all = False
        self.set_url(url)
        self.last_checked = 0

    def mtime(self):
        """Returns the time the robots.txt file was last fetched.

        This is useful for long-running web spiders that need to
        check for new robots.txt files periodically.
        """
        return self.last_checked

    def modified(self):
        """Sets the time the robots.txt file was last fetched to the
        current time.
        """
        import time
        self.last_checked = time.time()

    def set_url(self, url):
        """Sets the URL referring to a robots.txt file."""
        self.url = url
        self.host, self.path = urllib.parse.urlparse(url)[1:3]

    def read(self):
        """Reads the robots.txt URL and feeds it to the parser."""
        try:
            f = urllib.request.urlopen(self.url)
        except urllib.error.HTTPError as err:
            if err.code in (401, 403):
                self.disallow_all = True
            elif err.code >= 400 and err.code < 500:
                self.allow_all = True
        else:
            raw = f.read()
            self.parse(raw.decode("utf-8").splitlines())

    def _add_entry(self, entry):
        if "*" in entry.useragents:
            # the default entry is considered last
            if self.default_entry is None:
                # the first default entry wins
                self.default_entry = entry
        else:
            self.entries.append(entry)

    def parse(self, lines):
        """Parse the input lines from a robots.txt file.

        We allow that a user-agent: line is not preceded by
        one or more blank lines.
        """
        # states:
        #   0: start state
        #   1: saw user-agent line
        #   2: saw an allow or disallow line
        state = 0
        entry = Entry()

        self.modified()
        for line in lines:
            if not line:
                if state == 1:
                    entry = Entry()
                    state = 0
                elif state == 2:
                    self._add_entry(entry)
                    entry = Entry()
                    state = 0
            # remove optional comment and strip line
            i = line.find('#')
            if i >= 0:
                line = line[:i]
            line = line.strip()
            if not line:
                continue
            line = line.split(':', 1)
            if len(line) == 2:
                line[0] = line[0].strip().lower()
                line[1] = urllib.parse.unquote(line[1].strip())
                if line[0] == "user-agent":
                    if state == 2:
                        self._add_entry(entry)
                        entry = Entry()
                    entry.useragents.append(line[1])
                    state = 1
                elif line[0] == "disallow":
                    if state != 0:
                        entry.rulelines.append(RuleLine(line[1], False))
                        state = 2
                elif line[0] == "allow":
                    if state != 0:
                        entry.rulelines.append(RuleLine(line[1], True))
                        state = 2
                elif line[0] == "crawl-delay":
                    if state != 0:
                        # before trying to convert to int we need to make
                        # sure that robots.txt has valid syntax otherwise
                        # it will crash
                        if line[1].strip().isdigit():
                            entry.delay = int(line[1])
                        state = 2
                elif line[0] == "request-rate":
                    if state != 0:
                        numbers = line[1].split('/')
                        # check if all values are sane
                        if (len(numbers) == 2 and numbers[0].strip().isdigit()
                                and numbers[1].strip().isdigit()):
                            entry.req_rate = RequestRate(int(numbers[0]),
                                                         int(numbers[1]))
                        state = 2
        if state == 2:
            self._add_entry(entry)
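
    # For illustration (the agent name and paths below are made-up examples):
    # feeding parse() the lines
    #
    #     User-agent: ExampleBot
    #     Disallow: /tmp
    #
    #     User-agent: *
    #     Disallow: /cgi-bin
    #
    # stores one Entry for "ExampleBot" in self.entries and keeps the "*"
    # group in self.default_entry, which can_fetch() consults only after the
    # named entries.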

    def can_fetch(self, useragent, url):
        """using the parsed robots.txt decide if useragent can fetch url"""
        if self.disallow_all:
            return False
        if self.allow_all:
            return True
        # Until the robots.txt file has been read or found not
        # to exist, we must assume that no url is allowable.
        # This prevents false positives when a user erroneously
        # calls can_fetch() before calling read().
        if not self.last_checked:
            return False
        # search for given user agent matches
        # the first match counts
        parsed_url = urllib.parse.urlparse(urllib.parse.unquote(url))
        url = urllib.parse.urlunparse(('', '', parsed_url.path,
                                       parsed_url.params, parsed_url.query,
                                       parsed_url.fragment))
        url = urllib.parse.quote(url)
        if not url:
            url = "/"
        for entry in self.entries:
            if entry.applies_to(useragent):
                return entry.allowance(url)
        # try the default entry last
        if self.default_entry:
            return self.default_entry.allowance(url)
        # agent not found ==> access granted
        return True
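
    # For illustration (the bot name and URL are made-up examples):
    # can_fetch("SomeBot", "http://www.example.com/private/x") reduces the URL
    # to "/private/x" (scheme and host are dropped) before matching it against
    # the rule lines of the first entry that applies to "SomeBot".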

    def crawl_delay(self, useragent):
        if not self.mtime():
            return None
        for entry in self.entries:
            if entry.applies_to(useragent):
                return entry.delay
        if self.default_entry:
            return self.default_entry.delay
        return None

    def request_rate(self, useragent):
        if not self.mtime():
            return None
        for entry in self.entries:
            if entry.applies_to(useragent):
                return entry.req_rate
        if self.default_entry:
            return self.default_entry.req_rate
        return None

    def __str__(self):
        entries = self.entries
        if self.default_entry is not None:
            entries = entries + [self.default_entry]
        return '\n'.join(map(str, entries)) + '\n'


class RuleLine:
    """A rule line is a single "Allow:" (allowance==True) or "Disallow:"
    (allowance==False) followed by a path."""

    def __init__(self, path, allowance):
        if path == '' and not allowance:
            # an empty value means allow all
            allowance = True
        path = urllib.parse.urlunparse(urllib.parse.urlparse(path))
        self.path = urllib.parse.quote(path)
        self.allowance = allowance

    def applies_to(self, filename):
        return self.path == "*" or filename.startswith(self.path)

    def __str__(self):
        return ("Allow" if self.allowance else "Disallow") + ": " + self.path
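
# For illustration (made-up paths): RuleLine("/private", False) disallows any
# path that starts with "/private", so applies_to("/private/page.html") is
# True; RuleLine("", False) is normalised to an allow-everything rule because
# an empty Disallow value means "allow all" and every path starts with "".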


class Entry:
    """An entry has one or more user-agents and zero or more rulelines"""

    def __init__(self):
        self.useragents = []
        self.rulelines = []
        self.delay = None
        self.req_rate = None

    def __str__(self):
        ret = []
        for agent in self.useragents:
            ret.append(f"User-agent: {agent}")
        if self.delay is not None:
            ret.append(f"Crawl-delay: {self.delay}")
        if self.req_rate is not None:
            rate = self.req_rate
            ret.append(f"Request-rate: {rate.requests}/{rate.seconds}")
        ret.extend(map(str, self.rulelines))
        ret.append('')  # for compatibility
        return '\n'.join(ret)

    def applies_to(self, useragent):
        """check if this entry applies to the specified agent"""
        # split the name token and make it lower case
        useragent = useragent.split("/")[0].lower()
        for agent in self.useragents:
            if agent == '*':
                # we have the catch-all agent
                return True
            agent = agent.lower()
            if agent in useragent:
                return True
        return False
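
    # For illustration (made-up names): applies_to("ExampleBot/2.1") lowercases
    # the token before the "/" to "examplebot", so an entry whose useragents
    # list contains "ExampleBot" (or any substring of it, such as "example")
    # matches; the catch-all agent "*" matches every user agent.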

    def allowance(self, filename):
        """Preconditions:
        - our agent applies to this entry
        - filename is URL decoded"""
        for line in self.rulelines:
            if line.applies_to(filename):
                return line.allowance
        return True
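

# A minimal usage sketch. The robots.txt lines, agent names and URLs below are
# made-up examples, not part of the module's API or its test suite.
if __name__ == "__main__":
    parser = RobotFileParser()
    # parse() takes the file's lines directly, so no network access is needed
    # here; read() would fetch and decode the robots.txt at self.url instead.
    parser.parse([
        "User-agent: ExampleBot",
        "Crawl-delay: 3",
        "Disallow: /tmp",
        "",
        "User-agent: *",
        "Disallow: /cgi-bin",
    ])
    print(parser.can_fetch("ExampleBot/1.0", "http://example.com/tmp/x"))       # False
    print(parser.can_fetch("ExampleBot/1.0", "http://example.com/index.html"))  # True
    print(parser.can_fetch("OtherBot", "http://example.com/cgi-bin/a"))         # False
    print(parser.crawl_delay("ExampleBot/1.0"))                                 # 3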