mimetypes.py 21 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608
  1. """Guess the MIME type of a file.
  2. This module defines two useful functions:
  3. guess_type(url, strict=True) -- guess the MIME type and encoding of a URL.
  4. guess_extension(type, strict=True) -- guess the extension for a given MIME type.
  5. It also contains the following, for tuning the behavior:
  6. Data:
  7. knownfiles -- list of files to parse
  8. inited -- flag set when init() has been called
  9. suffix_map -- dictionary mapping suffixes to suffixes
  10. encodings_map -- dictionary mapping suffixes to encodings
  11. types_map -- dictionary mapping suffixes to types
  12. Functions:
  13. init([files]) -- parse a list of files, default knownfiles (on Windows, the
  14. default values are taken from the registry)
  15. read_mime_types(file) -- parse one file, return a dictionary or None
  16. """
  17. import os
  18. import sys
  19. import posixpath
  20. import urllib.parse
  21. try:
  22. import winreg as _winreg
  23. except ImportError:
  24. _winreg = None
  # Public names exported by ``from mimetypes import *``.
  __all__ = [
      "knownfiles",
      "inited",
      "MimeTypes",
      "guess_type",
      "guess_all_extensions",
      "guess_extension",
      "add_type",
      "init",
      "read_mime_types",
      "suffix_map",
      "encodings_map",
      "types_map",
      "common_types",
  ]
  # mime.types-style files that init() reads, in this order, when they exist.
  # "/usr/local/etc/httpd/conf/mime.types" is listed twice; both entries are
  # kept because the read order decides which mapping wins.
  knownfiles = [
      "/etc/mime.types",
      "/etc/httpd/mime.types",                    # Mac OS X
      "/etc/httpd/conf/mime.types",               # Apache
      "/etc/apache/mime.types",                   # Apache 1
      "/etc/apache2/mime.types",                  # Apache 2
      "/usr/local/etc/httpd/conf/mime.types",
      "/usr/local/lib/netscape/mime.types",
      "/usr/local/etc/httpd/conf/mime.types",     # Apache 1.2
      "/usr/local/etc/mime.types",                # Apache 1.3
  ]

  # Set to True by init() before it builds the database.
  inited = False

  # Shared MimeTypes instance used by the module-level helpers; None until
  # init() has run.
  _db = None
  class MimeTypes:
      """MIME-types datastore.

      This datastore can handle information from mime.types-style files
      and supports basic determination of MIME type from a filename or
      URL, and can guess a reasonable extension given a MIME type.
      """

      def __init__(self, filenames=(), strict=True):
          """Create a datastore seeded with the module's default tables.

          filenames -- iterable of mime.types-style files, each loaded with
                       read().
          strict -- forwarded to read() for every file in `filenames'.
          """
          if not inited:
              init()
          self.encodings_map = _encodings_map_default.copy()
          self.suffix_map = _suffix_map_default.copy()
          self.types_map = ({}, {})  # dict for (non-strict, strict)
          self.types_map_inv = ({}, {})
          for (ext, type) in _types_map_default.items():
              self.add_type(type, ext, True)
          for (ext, type) in _common_types_default.items():
              self.add_type(type, ext, False)
          for name in filenames:
              self.read(name, strict)

      def add_type(self, type, ext, strict=True):
          """Add a mapping between a type and an extension.

          When the extension is already known, the new
          type will replace the old one. When the type
          is already known the extension will be added
          to the list of known extensions.

          If strict is true, information will be added to
          list of standard types, else to the list of non-standard
          types.
          """
          self.types_map[strict][ext] = type
          exts = self.types_map_inv[strict].setdefault(type, [])
          if ext not in exts:
              exts.append(ext)

      def guess_type(self, url, strict=True):
          """Guess the type of a file based on its URL.

          Return value is a tuple (type, encoding) where type is None if
          the type can't be guessed (no or unknown suffix) or a string
          of the form type/subtype, usable for a MIME Content-type
          header; and encoding is None for no encoding or the name of
          the program used to encode (e.g. compress or gzip). The
          mappings are table driven. Encoding suffixes are case
          sensitive; type suffixes are first tried case sensitive, then
          case insensitive.

          The suffixes .tgz, .taz and .tz (case sensitive!) are all
          mapped to '.tar.gz'. (This is table-driven too, using the
          dictionary suffix_map.)

          Optional `strict' argument when False adds a bunch of commonly found,
          but non-standard types.
          """
          url = os.fspath(url)
          # Split off a leading "scheme:" without calling the deprecated
          # urllib.parse.splittype().  A scheme is a non-empty prefix that
          # ends at the first ':' and contains no '/'; it is lower-cased.
          prefix, colon, remainder = url.partition(':')
          if colon and prefix and '/' not in prefix:
              scheme, url = prefix.lower(), remainder
          else:
              scheme = None
          if scheme == 'data':
              # syntax of data URLs:
              # dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
              # mediatype := [ type "/" subtype ] *( ";" parameter )
              # data      := *urlchar
              # parameter := attribute "=" value
              # type/subtype defaults to "text/plain"
              comma = url.find(',')
              if comma < 0:
                  # bad data URL
                  return None, None
              semi = url.find(';', 0, comma)
              if semi >= 0:
                  type = url[:semi]
              else:
                  type = url[:comma]
              if '=' in type or '/' not in type:
                  type = 'text/plain'
              return type, None  # never compressed, so encoding is None
          base, ext = posixpath.splitext(url)
          # Expand shorthand suffixes (e.g. .tgz -> .tar.gz) until none apply.
          while ext in self.suffix_map:
              base, ext = posixpath.splitext(base + self.suffix_map[ext])
          if ext in self.encodings_map:
              encoding = self.encodings_map[ext]
              base, ext = posixpath.splitext(base)
          else:
              encoding = None
          types_map = self.types_map[True]
          if ext in types_map:
              return types_map[ext], encoding
          elif ext.lower() in types_map:
              return types_map[ext.lower()], encoding
          elif strict:
              return None, encoding
          types_map = self.types_map[False]
          if ext in types_map:
              return types_map[ext], encoding
          elif ext.lower() in types_map:
              return types_map[ext.lower()], encoding
          else:
              return None, encoding

      def guess_all_extensions(self, type, strict=True):
          """Guess the extensions for a file based on its MIME type.

          Return value is a list of strings giving the possible filename
          extensions, including the leading dot ('.'). The extension is not
          guaranteed to have been associated with any particular data stream,
          but would be mapped to the MIME type `type' by guess_type().
          The list is empty when no extension is known for `type'.

          Optional `strict' argument when false adds a bunch of commonly found,
          but non-standard types.
          """
          type = type.lower()
          # Copy the stored list: appending the non-strict extensions to the
          # stored object would leak them into later strict lookups.
          extensions = list(self.types_map_inv[True].get(type, []))
          if not strict:
              for ext in self.types_map_inv[False].get(type, []):
                  if ext not in extensions:
                      extensions.append(ext)
          return extensions

      def guess_extension(self, type, strict=True):
          """Guess the extension for a file based on its MIME type.

          Return value is a string giving a filename extension,
          including the leading dot ('.'). The extension is not
          guaranteed to have been associated with any particular data
          stream, but would be mapped to the MIME type `type' by
          guess_type(). If no extension can be guessed for `type', None
          is returned.

          Optional `strict' argument when false adds a bunch of commonly found,
          but non-standard types.
          """
          extensions = self.guess_all_extensions(type, strict)
          if not extensions:
              return None
          return extensions[0]

      def read(self, filename, strict=True):
          """
          Read a single mime.types-format file, specified by pathname.

          The file is opened as UTF-8 text.

          If strict is true, information will be added to
          list of standard types, else to the list of non-standard
          types.
          """
          with open(filename, encoding='utf-8') as fp:
              self.readfp(fp, strict)

      def readfp(self, fp, strict=True):
          """
          Read a single mime.types-format file.

          Each line is "type suffix suffix ..."; everything from the first
          word starting with '#' to the end of the line is ignored.

          If strict is true, information will be added to
          list of standard types, else to the list of non-standard
          types.
          """
          while True:
              line = fp.readline()
              if not line:
                  break
              words = line.split()
              # Drop a trailing comment, if any.
              for index, word in enumerate(words):
                  if word[0] == '#':
                      del words[index:]
                      break
              if not words:
                  continue
              type, suffixes = words[0], words[1:]
              for suff in suffixes:
                  self.add_type(type, '.' + suff, strict)

      def read_windows_registry(self, strict=True):
          """
          Load the MIME types database from Windows registry.

          Does nothing when the winreg module is unavailable.

          If strict is true, information will be added to
          list of standard types, else to the list of non-standard
          types.
          """
          # Windows only
          if not _winreg:
              return

          def enum_types(mimedb):
              # Yield subkey names of `mimedb' until EnumKey raises OSError,
              # skipping names that contain a NUL character.
              i = 0
              while True:
                  try:
                      ctype = _winreg.EnumKey(mimedb, i)
                  except OSError:
                      break
                  else:
                      if '\0' not in ctype:
                          yield ctype
                  i += 1

          with _winreg.OpenKey(_winreg.HKEY_CLASSES_ROOT, '') as hkcr:
              for subkeyname in enum_types(hkcr):
                  # Only check file extensions; test the name before opening
                  # the key so unrelated keys are never opened.
                  if not subkeyname.startswith("."):
                      continue
                  try:
                      with _winreg.OpenKey(hkcr, subkeyname) as subkey:
                          # raises OSError if no 'Content Type' value
                          mimetype, datatype = _winreg.QueryValueEx(
                              subkey, 'Content Type')
                          if datatype != _winreg.REG_SZ:
                              continue
                          self.add_type(mimetype, subkeyname, strict)
                  except OSError:
                      continue
  def guess_type(url, strict=True):
      """Guess the MIME type and encoding of a file from its URL.

      Delegates to the shared module database, creating it with init()
      first when it does not exist yet.

      The result is a (type, encoding) tuple.  `type' is a string of the
      form type/subtype, usable for a MIME Content-type header, or None
      when the suffix is missing or unknown.  `encoding' is the name of
      the program used to encode the file (e.g. compress or gzip), or
      None for no encoding.  The mappings are table driven.  Encoding
      suffixes are case sensitive; type suffixes are first tried case
      sensitive, then case insensitive.

      The suffixes .tgz, .taz and .tz (case sensitive!) are all mapped
      to ".tar.gz"; this is table-driven too, through suffix_map.

      With `strict' false, a bunch of commonly found but non-standard
      types are considered as well.
      """
      database = _db
      if database is None:
          init()
          database = _db
      return database.guess_type(url, strict)
  def guess_all_extensions(type, strict=True):
      """List every known filename extension for a MIME type.

      Delegates to the shared module database, creating it with init()
      first when it does not exist yet.

      The result is a list of extension strings, each including the
      leading dot ('.').  An extension is not guaranteed to have been
      associated with any particular data stream, but guess_type() would
      map it to the MIME type `type'.  The value is whatever the
      database's guess_all_extensions() returns, i.e. a list.

      With `strict' false, a bunch of commonly found but non-standard
      types are considered as well.
      """
      database = _db
      if database is None:
          init()
          database = _db
      return database.guess_all_extensions(type, strict)
  def guess_extension(type, strict=True):
      """Guess one filename extension for a MIME type.

      Delegates to the shared module database, creating it with init()
      first when it does not exist yet.

      The result is an extension string including the leading dot ('.'),
      or None when no extension can be guessed for `type'.  The extension
      is not guaranteed to have been associated with any particular data
      stream, but guess_type() would map it to the MIME type `type'.

      With `strict' false, a bunch of commonly found but non-standard
      types are considered as well.
      """
      database = _db
      if database is None:
          init()
          database = _db
      return database.guess_extension(type, strict)
  def add_type(type, ext, strict=True):
      """Register a type <-> extension mapping in the shared database.

      Delegates to the shared module database, creating it with init()
      first when it does not exist yet.

      An extension that is already known gets its type replaced by the
      new one.  A type that is already known gets the extension added to
      its list of known extensions.

      With `strict' true the mapping goes into the list of standard
      types, otherwise into the list of non-standard types.
      """
      database = _db
      if database is None:
          init()
          database = _db
      return database.add_type(type, ext, strict)
  def init(files=None):
      """(Re)build the shared database and publish its tables as globals.

      When `files' is None, or no shared database exists yet, a fresh
      MimeTypes instance is created, the Windows registry is read if
      winreg is available, and `knownfiles' (followed by `files', when
      given) are loaded.  Otherwise the existing database is reused and
      only `files' are loaded.  Entries that are not existing files are
      skipped.
      """
      global suffix_map, types_map, encodings_map, common_types
      global inited, _db
      inited = True    # so that MimeTypes.__init__() doesn't call us again

      start_fresh = files is None or _db is None
      if start_fresh:
          db = MimeTypes()
          if _winreg:
              db.read_windows_registry()
          files = knownfiles if files is None else knownfiles + list(files)
      else:
          db = _db

      # filter() is lazy, so each path is checked right before it is read.
      for path in filter(os.path.isfile, files):
          db.read(path)

      encodings_map = db.encodings_map
      suffix_map = db.suffix_map
      types_map = db.types_map[True]
      common_types = db.types_map[False]
      # Make the DB a global variable now that it is fully initialized
      _db = db
  def read_mime_types(file):
      """Parse one mime.types-format file and return its strict type map.

      file -- path of the file to read.

      Returns the dictionary mapping extensions (with leading dot) to MIME
      types held by a new MimeTypes instance after parsing `file' in strict
      mode, or None when the file cannot be opened (OSError).
      """
      try:
          # Decode as UTF-8, matching MimeTypes.read(), rather than relying
          # on the locale's default encoding.
          f = open(file, encoding='utf-8')
      except OSError:
          return None
      with f:
          db = MimeTypes()
          db.readfp(f, True)
          return db.types_map[True]
  def _default_mime_types():
      """Install the built-in default tables as module globals.

      Binds suffix_map, encodings_map, types_map and common_types, and
      their private `_..._default' aliases, to freshly built dictionaries.
      """
      global suffix_map, _suffix_map_default
      global encodings_map, _encodings_map_default
      global types_map, _types_map_default
      global common_types, _common_types_default

      suffix_map = _suffix_map_default = {
          '.svgz': '.svg.gz',
          '.tgz': '.tar.gz',
          '.taz': '.tar.gz',
          '.tz': '.tar.gz',
          '.tbz2': '.tar.bz2',
          '.txz': '.tar.xz',
      }

      encodings_map = _encodings_map_default = {
          '.gz': 'gzip',
          '.Z': 'compress',
          '.bz2': 'bzip2',
          '.xz': 'xz',
      }

      # Before adding new types, make sure they are either registered with IANA,
      # at http://www.iana.org/assignments/media-types
      # or extensions, i.e. using the x- prefix

      # If you add to these, please keep them sorted by mime type.
      # Make sure the entry with the preferred file extension for a particular mime type
      # appears before any others of the same mimetype.
      # Every extension must appear only once: a repeated key silently
      # replaces the earlier value.
      types_map = _types_map_default = {
          '.js': 'application/javascript',
          '.mjs': 'application/javascript',
          '.json': 'application/json',
          '.doc': 'application/msword',
          '.dot': 'application/msword',
          '.wiz': 'application/msword',
          '.bin': 'application/octet-stream',
          '.a': 'application/octet-stream',
          '.dll': 'application/octet-stream',
          '.exe': 'application/octet-stream',
          '.o': 'application/octet-stream',
          '.obj': 'application/octet-stream',
          '.so': 'application/octet-stream',
          '.oda': 'application/oda',
          '.pdf': 'application/pdf',
          '.p7c': 'application/pkcs7-mime',
          '.ps': 'application/postscript',
          '.ai': 'application/postscript',
          '.eps': 'application/postscript',
          '.m3u': 'application/vnd.apple.mpegurl',
          '.m3u8': 'application/vnd.apple.mpegurl',
          '.xls': 'application/vnd.ms-excel',
          '.xlb': 'application/vnd.ms-excel',
          '.ppt': 'application/vnd.ms-powerpoint',
          '.pot': 'application/vnd.ms-powerpoint',
          '.ppa': 'application/vnd.ms-powerpoint',
          '.pps': 'application/vnd.ms-powerpoint',
          '.pwz': 'application/vnd.ms-powerpoint',
          '.wasm': 'application/wasm',
          '.bcpio': 'application/x-bcpio',
          '.cpio': 'application/x-cpio',
          '.csh': 'application/x-csh',
          '.dvi': 'application/x-dvi',
          '.gtar': 'application/x-gtar',
          '.hdf': 'application/x-hdf',
          '.latex': 'application/x-latex',
          '.mif': 'application/x-mif',
          '.cdf': 'application/x-netcdf',
          '.nc': 'application/x-netcdf',
          '.p12': 'application/x-pkcs12',
          '.pfx': 'application/x-pkcs12',
          '.ram': 'application/x-pn-realaudio',
          '.pyc': 'application/x-python-code',
          '.pyo': 'application/x-python-code',
          '.sh': 'application/x-sh',
          '.shar': 'application/x-shar',
          '.swf': 'application/x-shockwave-flash',
          '.sv4cpio': 'application/x-sv4cpio',
          '.sv4crc': 'application/x-sv4crc',
          '.tar': 'application/x-tar',
          '.tcl': 'application/x-tcl',
          '.tex': 'application/x-tex',
          '.texi': 'application/x-texinfo',
          '.texinfo': 'application/x-texinfo',
          '.roff': 'application/x-troff',
          '.t': 'application/x-troff',
          '.tr': 'application/x-troff',
          '.man': 'application/x-troff-man',
          '.me': 'application/x-troff-me',
          '.ms': 'application/x-troff-ms',
          '.ustar': 'application/x-ustar',
          '.src': 'application/x-wais-source',
          '.xsl': 'application/xml',
          '.rdf': 'application/xml',
          '.wsdl': 'application/xml',
          '.xpdl': 'application/xml',
          '.zip': 'application/zip',
          '.au': 'audio/basic',
          '.snd': 'audio/basic',
          '.mp3': 'audio/mpeg',
          '.mp2': 'audio/mpeg',
          '.aif': 'audio/x-aiff',
          '.aifc': 'audio/x-aiff',
          '.aiff': 'audio/x-aiff',
          '.ra': 'audio/x-pn-realaudio',
          '.wav': 'audio/x-wav',
          '.bmp': 'image/bmp',
          '.gif': 'image/gif',
          '.ief': 'image/ief',
          '.jpg': 'image/jpeg',
          '.jpe': 'image/jpeg',
          '.jpeg': 'image/jpeg',
          '.png': 'image/png',
          '.svg': 'image/svg+xml',
          '.tiff': 'image/tiff',
          '.tif': 'image/tiff',
          '.ico': 'image/vnd.microsoft.icon',
          '.ras': 'image/x-cmu-raster',
          '.pnm': 'image/x-portable-anymap',
          '.pbm': 'image/x-portable-bitmap',
          '.pgm': 'image/x-portable-graymap',
          '.ppm': 'image/x-portable-pixmap',
          '.rgb': 'image/x-rgb',
          '.xbm': 'image/x-xbitmap',
          '.xpm': 'image/x-xpixmap',
          '.xwd': 'image/x-xwindowdump',
          '.eml': 'message/rfc822',
          '.mht': 'message/rfc822',
          '.mhtml': 'message/rfc822',
          '.nws': 'message/rfc822',
          '.css': 'text/css',
          '.csv': 'text/csv',
          '.html': 'text/html',
          '.htm': 'text/html',
          '.txt': 'text/plain',
          '.bat': 'text/plain',
          '.c': 'text/plain',
          '.h': 'text/plain',
          '.ksh': 'text/plain',
          '.pl': 'text/plain',
          '.rtx': 'text/richtext',
          '.tsv': 'text/tab-separated-values',
          '.py': 'text/x-python',
          '.etx': 'text/x-setext',
          '.sgm': 'text/x-sgml',
          '.sgml': 'text/x-sgml',
          '.vcf': 'text/x-vcard',
          '.xml': 'text/xml',
          '.mp4': 'video/mp4',
          '.mpeg': 'video/mpeg',
          '.m1v': 'video/mpeg',
          '.mpa': 'video/mpeg',
          '.mpe': 'video/mpeg',
          '.mpg': 'video/mpeg',
          '.mov': 'video/quicktime',
          '.qt': 'video/quicktime',
          '.webm': 'video/webm',
          '.avi': 'video/x-msvideo',
          '.movie': 'video/x-sgi-movie',
      }

      # These are non-standard types, commonly found in the wild.  They will
      # only match if strict=0 flag is given to the API methods.

      # Please sort these too
      common_types = _common_types_default = {
          '.rtf': 'application/rtf',
          '.midi': 'audio/midi',
          '.mid': 'audio/midi',
          '.jpg': 'image/jpg',
          '.pict': 'image/pict',
          '.pct': 'image/pict',
          '.pic': 'image/pict',
          # Legacy BMP type; it used to shadow 'image/bmp' in the strict
          # table through a repeated '.bmp' key.
          '.bmp': 'image/x-ms-bmp',
          '.xul': 'text/xul',
      }
  # Install the default tables at import time.
  _default_mime_types()


  if __name__ == '__main__':
      # Command-line front end: guess a type (or, with -e, an extension)
      # for every positional argument.
      import getopt

      USAGE = (
          "Usage: mimetypes.py [options] type\n"
          "Options:\n"
          "--help / -h -- print this message and exit\n"
          "--lenient / -l -- additionally search of some common, but non-standard\n"
          "types.\n"
          "--extension / -e -- guess extension instead of type\n"
          "More than one type argument may be given.\n"
      )

      def usage(code, msg=''):
          # Print the help text, then the optional message, and exit with
          # status `code'.
          print(USAGE)
          if msg:
              print(msg)
          sys.exit(code)

      try:
          opts, args = getopt.getopt(sys.argv[1:], 'hle',
                                     ['help', 'lenient', 'extension'])
      except getopt.error as msg:
          usage(1, msg)

      strict = True
      extension = False
      for opt, arg in opts:
          if opt in ('-h', '--help'):
              usage(0)
          elif opt in ('-l', '--lenient'):
              strict = False
          elif opt in ('-e', '--extension'):
              extension = True

      for gtype in args:
          if extension:
              guess = guess_extension(gtype, strict)
              if guess:
                  print(guess)
              else:
                  print("I don't know anything about type", gtype)
          else:
              guess, encoding = guess_type(gtype, strict)
              if guess:
                  print('type:', guess, 'encoding:', encoding)
              else:
                  print("I don't know anything about type", gtype)