import collections
import re
import string
import zlib
from enum import IntEnum

from multidict import CIMultiDict, CIMultiDictProxy
from yarl import URL

from . import hdrs
from .helpers import NO_EXTENSIONS
from .http_exceptions import (BadStatusLine, ContentEncodingError,
                              ContentLengthError, InvalidHeader, LineTooLong,
                              TransferEncodingError)
from .http_writer import HttpVersion, HttpVersion10
from .log import internal_logger
from .streams import EMPTY_PAYLOAD, StreamReader

try:
    import brotli
    HAS_BROTLI = True
except ImportError:  # pragma: no cover
    HAS_BROTLI = False


__all__ = (
    'HttpParser', 'HttpRequestParser', 'HttpResponseParser',
    'RawRequestMessage', 'RawResponseMessage')

ASCIISET = set(string.printable)
METHRE = re.compile('[A-Z0-9$-_.]+')
VERSRE = re.compile(r'HTTP/(\d+).(\d+)')
HDRRE = re.compile(rb'[\x00-\x1F\x7F()<>@,;:\[\]={} \t\\\\\"]')

RawRequestMessage = collections.namedtuple(
    'RawRequestMessage',
    ['method', 'path', 'version', 'headers', 'raw_headers',
     'should_close', 'compression', 'upgrade', 'chunked', 'url'])

RawResponseMessage = collections.namedtuple(
    'RawResponseMessage',
    ['version', 'code', 'reason', 'headers', 'raw_headers',
     'should_close', 'compression', 'upgrade', 'chunked'])


class ParseState(IntEnum):

    PARSE_NONE = 0
    PARSE_LENGTH = 1
    PARSE_CHUNKED = 2
    PARSE_UNTIL_EOF = 3


class ChunkState(IntEnum):
    PARSE_CHUNKED_SIZE = 0
    PARSE_CHUNKED_CHUNK = 1
    PARSE_CHUNKED_CHUNK_EOF = 2
    PARSE_MAYBE_TRAILERS = 3
    PARSE_TRAILERS = 4
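
# For reference (illustrative note): ChunkState mirrors the chunked
# transfer-encoding wire format,
#     <size-in-hex>[;chunk-extension]\r\n<chunk data>\r\n ... 0\r\n[trailers]\r\n\r\n
# HttpPayloadParser.feed_data() below steps through these states as bytes arrive.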


class HttpParser:

    def __init__(self, protocol=None, loop=None,
                 max_line_size=8190, max_headers=32768, max_field_size=8190,
                 timer=None, code=None, method=None, readall=False,
                 payload_exception=None,
                 response_with_body=True, read_until_eof=False,
                 auto_decompress=True):
        self.protocol = protocol
        self.loop = loop
        self.max_line_size = max_line_size
        self.max_headers = max_headers
        self.max_field_size = max_field_size
        self.timer = timer
        self.code = code
        self.method = method
        self.readall = readall
        self.payload_exception = payload_exception
        self.response_with_body = response_with_body
        self.read_until_eof = read_until_eof

        self._lines = []
        self._tail = b''
        self._upgraded = False
        self._payload = None
        self._payload_parser = None
        self._auto_decompress = auto_decompress
    def feed_eof(self):
        if self._payload_parser is not None:
            self._payload_parser.feed_eof()
            self._payload_parser = None
        else:
            # try to extract partial message
            if self._tail:
                self._lines.append(self._tail)

            if self._lines:
                # lines are bytes, so use bytes sentinels here
                if self._lines[-1] != b'\r\n':
                    self._lines.append(b'')
                try:
                    return self.parse_message(self._lines)
                except Exception:
                    return None
    def feed_data(self, data,
                  SEP=b'\r\n', EMPTY=b'',
                  CONTENT_LENGTH=hdrs.CONTENT_LENGTH,
                  METH_CONNECT=hdrs.METH_CONNECT,
                  SEC_WEBSOCKET_KEY1=hdrs.SEC_WEBSOCKET_KEY1):

        messages = []

        if self._tail:
            data, self._tail = self._tail + data, b''

        data_len = len(data)
        start_pos = 0
        loop = self.loop

        while start_pos < data_len:

            # read HTTP message (request/response line + headers), \r\n\r\n
            # and split by lines
            if self._payload_parser is None and not self._upgraded:
                pos = data.find(SEP, start_pos)
                # consume \r\n
                if pos == start_pos and not self._lines:
                    start_pos = pos + 2
                    continue

                if pos >= start_pos:
                    # line found
                    self._lines.append(data[start_pos:pos])
                    start_pos = pos + 2

                    # \r\n\r\n found
                    if self._lines[-1] == EMPTY:
                        try:
                            msg = self.parse_message(self._lines)
                        finally:
                            self._lines.clear()

                        # payload length
                        length = msg.headers.get(CONTENT_LENGTH)
                        if length is not None:
                            try:
                                length = int(length)
                            except ValueError:
                                raise InvalidHeader(CONTENT_LENGTH)
                            if length < 0:
                                raise InvalidHeader(CONTENT_LENGTH)

                        # do not support old websocket spec
                        if SEC_WEBSOCKET_KEY1 in msg.headers:
                            raise InvalidHeader(SEC_WEBSOCKET_KEY1)

                        self._upgraded = msg.upgrade

                        method = getattr(msg, 'method', self.method)

                        # calculate payload
                        if ((length is not None and length > 0) or
                                msg.chunked and not msg.upgrade):
                            payload = StreamReader(
                                self.protocol, timer=self.timer, loop=loop)
                            payload_parser = HttpPayloadParser(
                                payload, length=length,
                                chunked=msg.chunked, method=method,
                                compression=msg.compression,
                                code=self.code, readall=self.readall,
                                response_with_body=self.response_with_body,
                                auto_decompress=self._auto_decompress)
                            if not payload_parser.done:
                                self._payload_parser = payload_parser
                        elif method == METH_CONNECT:
                            payload = StreamReader(
                                self.protocol, timer=self.timer, loop=loop)
                            self._upgraded = True
                            self._payload_parser = HttpPayloadParser(
                                payload, method=msg.method,
                                compression=msg.compression, readall=True,
                                auto_decompress=self._auto_decompress)
                        else:
                            if (getattr(msg, 'code', 100) >= 199 and
                                    length is None and self.read_until_eof):
                                payload = StreamReader(
                                    self.protocol, timer=self.timer, loop=loop)
                                payload_parser = HttpPayloadParser(
                                    payload, length=length,
                                    chunked=msg.chunked, method=method,
                                    compression=msg.compression,
                                    code=self.code, readall=True,
                                    response_with_body=self.response_with_body,
                                    auto_decompress=self._auto_decompress)
                                if not payload_parser.done:
                                    self._payload_parser = payload_parser
                            else:
                                payload = EMPTY_PAYLOAD

                        messages.append((msg, payload))
                else:
                    self._tail = data[start_pos:]
                    data = EMPTY
                    break

            # no parser, just store
            elif self._payload_parser is None and self._upgraded:
                assert not self._lines
                break

            # feed payload
            elif data and start_pos < data_len:
                assert not self._lines
                try:
                    eof, data = self._payload_parser.feed_data(
                        data[start_pos:])
                except BaseException as exc:
                    if self.payload_exception is not None:
                        self._payload_parser.payload.set_exception(
                            self.payload_exception(str(exc)))
                    else:
                        self._payload_parser.payload.set_exception(exc)

                    eof = True
                    data = b''

                if eof:
                    start_pos = 0
                    data_len = len(data)
                    self._payload_parser = None
                    continue
            else:
                break

        if data and start_pos < data_len:
            data = data[start_pos:]
        else:
            data = EMPTY

        return messages, self._upgraded, data
    def parse_headers(self, lines):
        """Parses RFC 5322 headers from a stream.

        Line continuations are supported. Returns the parsed headers as a
        case-insensitive multidict, the raw header name/value pairs, and the
        connection-close, content-encoding, upgrade and chunked flags
        derived from them.
        """
        headers = CIMultiDict()
        raw_headers = []

        lines_idx = 1
        line = lines[1]
        line_count = len(lines)

        while line:
            # Parse initial header name : value pair.
            try:
                bname, bvalue = line.split(b':', 1)
            except ValueError:
                raise InvalidHeader(line) from None

            bname = bname.strip(b' \t')
            bvalue = bvalue.lstrip()
            if HDRRE.search(bname):
                raise InvalidHeader(bname)
            if len(bname) > self.max_field_size:
                raise LineTooLong(
                    "request header name {}".format(
                        bname.decode("utf8", "xmlcharrefreplace")),
                    self.max_field_size,
                    len(bname))

            header_length = len(bvalue)

            # next line
            lines_idx += 1
            line = lines[lines_idx]

            # consume continuation lines
            continuation = line and line[0] in (32, 9)  # (' ', '\t')

            if continuation:
                bvalue = [bvalue]
                while continuation:
                    header_length += len(line)
                    if header_length > self.max_field_size:
                        raise LineTooLong(
                            'request header field {}'.format(
                                bname.decode("utf8", "xmlcharrefreplace")),
                            self.max_field_size,
                            header_length)
                    bvalue.append(line)

                    # next line
                    lines_idx += 1
                    if lines_idx < line_count:
                        line = lines[lines_idx]
                        if line:
                            continuation = line[0] in (32, 9)  # (' ', '\t')
                    else:
                        line = b''
                        break
                bvalue = b''.join(bvalue)
            else:
                if header_length > self.max_field_size:
                    raise LineTooLong(
                        'request header field {}'.format(
                            bname.decode("utf8", "xmlcharrefreplace")),
                        self.max_field_size,
                        header_length)

            bvalue = bvalue.strip()
            name = bname.decode('utf-8', 'surrogateescape')
            value = bvalue.decode('utf-8', 'surrogateescape')

            headers.add(name, value)
            raw_headers.append((bname, bvalue))

        close_conn = None
        encoding = None
        upgrade = False
        chunked = False
        raw_headers = tuple(raw_headers)
        headers = CIMultiDictProxy(headers)

        # keep-alive
        conn = headers.get(hdrs.CONNECTION)
        if conn:
            v = conn.lower()
            if v == 'close':
                close_conn = True
            elif v == 'keep-alive':
                close_conn = False
            elif v == 'upgrade':
                upgrade = True

        # encoding
        enc = headers.get(hdrs.CONTENT_ENCODING)
        if enc:
            enc = enc.lower()
            if enc in ('gzip', 'deflate', 'br'):
                encoding = enc

        # chunking
        te = headers.get(hdrs.TRANSFER_ENCODING)
        if te and 'chunked' in te.lower():
            chunked = True

        return headers, raw_headers, close_conn, encoding, upgrade, chunked
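
    # Example (for illustration): a minimal header block such as
    #   parse_headers([b'GET / HTTP/1.1', b'Host: example.com', b''])
    # yields the case-insensitive headers {'Host': 'example.com'}, the raw
    # pair (b'Host', b'example.com'), close_conn=None, encoding=None,
    # upgrade=False and chunked=False.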


class HttpRequestParser(HttpParser):
    """Read request status line.

    BadStatusLine (from .http_exceptions) is raised on any error in the
    status line. parse_message() returns a RawRequestMessage.
    """

    def parse_message(self, lines):
        # request line
        line = lines[0].decode('utf-8', 'surrogateescape')
        try:
            method, path, version = line.split(None, 2)
        except ValueError:
            raise BadStatusLine(line) from None

        if len(path) > self.max_line_size:
            raise LineTooLong(
                'Status line is too long', self.max_line_size, len(path))

        # method
        method = method.upper()
        if not METHRE.match(method):
            raise BadStatusLine(method)

        # version
        try:
            if version.startswith('HTTP/'):
                n1, n2 = version[5:].split('.', 1)
                version = HttpVersion(int(n1), int(n2))
            else:
                raise BadStatusLine(version)
        except Exception:
            raise BadStatusLine(version)

        # read headers
        (headers, raw_headers,
         close, compression, upgrade, chunked) = self.parse_headers(lines)
        if close is None:  # no Connection header in the request
            if version <= HttpVersion10:  # HTTP/1.0 defaults to closing
                close = True
            else:  # HTTP/1.1 defaults to keep-alive
                close = False

        return RawRequestMessage(
            method, path, version, headers, raw_headers,
            close, compression, upgrade, chunked, URL(path))
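
    # Usage sketch (for illustration; assumes an asyncio protocol and loop
    # are supplied by the surrounding connection-handling code):
    #
    #   parser = HttpRequestParser(protocol, loop)
    #   messages, upgraded, tail = parser.feed_data(
    #       b'GET /path HTTP/1.1\r\nHost: example.com\r\n\r\n')
    #   msg, payload = messages[0]   # RawRequestMessage, payload stream
    #   # msg.method == 'GET', msg.path == '/path'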


class HttpResponseParser(HttpParser):
    """Read response status line and headers.

    BadStatusLine is raised on any error in the status line.
    parse_message() returns a RawResponseMessage.
    """

    def parse_message(self, lines):
        line = lines[0].decode('utf-8', 'surrogateescape')
        try:
            version, status = line.split(None, 1)
        except ValueError:
            raise BadStatusLine(line) from None

        try:
            status, reason = status.split(None, 1)
        except ValueError:
            reason = ''

        if len(reason) > self.max_line_size:
            raise LineTooLong(
                'Status line is too long', self.max_line_size,
                len(reason))

        # version
        match = VERSRE.match(version)
        if match is None:
            raise BadStatusLine(line)
        version = HttpVersion(int(match.group(1)), int(match.group(2)))

        # The status code is a three-digit number
        try:
            status = int(status)
        except ValueError:
            raise BadStatusLine(line) from None

        if status > 999:
            raise BadStatusLine(line)

        # read headers
        (headers, raw_headers,
         close, compression, upgrade, chunked) = self.parse_headers(lines)

        if close is None:
            close = version <= HttpVersion10

        return RawResponseMessage(
            version, status, reason.strip(),
            headers, raw_headers, close, compression, upgrade, chunked)
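
    # Usage sketch (for illustration; protocol/loop come from the transport
    # layer, as with HttpRequestParser):
    #
    #   parser = HttpResponseParser(protocol, loop)
    #   messages, upgraded, tail = parser.feed_data(
    #       b'HTTP/1.1 200 OK\r\nContent-Length: 5\r\n\r\nhello')
    #   msg, payload = messages[0]   # msg.code == 200, msg.reason == 'OK'
    #   # the 5-byte body has already been fed into `payload` (a StreamReader)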


class HttpPayloadParser:

    def __init__(self, payload,
                 length=None, chunked=False, compression=None,
                 code=None, method=None,
                 readall=False, response_with_body=True, auto_decompress=True):
        self.payload = payload

        self._length = 0
        self._type = ParseState.PARSE_NONE
        self._chunk = ChunkState.PARSE_CHUNKED_SIZE
        self._chunk_size = 0
        self._chunk_tail = b''
        self._auto_decompress = auto_decompress
        self.done = False

        # payload decompression wrapper
        if response_with_body and compression and self._auto_decompress:
            payload = DeflateBuffer(payload, compression)

        # payload parser
        if not response_with_body:
            # don't parse payload if it's not expected to be received
            self._type = ParseState.PARSE_NONE
            payload.feed_eof()
            self.done = True

        elif chunked:
            self._type = ParseState.PARSE_CHUNKED
        elif length is not None:
            self._type = ParseState.PARSE_LENGTH
            self._length = length
            if self._length == 0:
                payload.feed_eof()
                self.done = True
        else:
            if readall and code != 204:
                self._type = ParseState.PARSE_UNTIL_EOF
            elif method in ('PUT', 'POST'):
                internal_logger.warning(  # pragma: no cover
                    'Content-Length or Transfer-Encoding header is required')
                self._type = ParseState.PARSE_NONE
                payload.feed_eof()
                self.done = True

        self.payload = payload
    def feed_eof(self):
        if self._type == ParseState.PARSE_UNTIL_EOF:
            self.payload.feed_eof()
        elif self._type == ParseState.PARSE_LENGTH:
            raise ContentLengthError(
                "Not enough data to satisfy content length header.")
        elif self._type == ParseState.PARSE_CHUNKED:
            raise TransferEncodingError(
                "Not enough data to satisfy transfer length header.")
    def feed_data(self, chunk, SEP=b'\r\n', CHUNK_EXT=b';'):
        # Read specified amount of bytes
        if self._type == ParseState.PARSE_LENGTH:
            required = self._length
            chunk_len = len(chunk)

            if required >= chunk_len:
                self._length = required - chunk_len
                self.payload.feed_data(chunk, chunk_len)
                if self._length == 0:
                    self.payload.feed_eof()
                    return True, b''
            else:
                self._length = 0
                self.payload.feed_data(chunk[:required], required)
                self.payload.feed_eof()
                return True, chunk[required:]

        # Chunked transfer encoding parser
        elif self._type == ParseState.PARSE_CHUNKED:
            if self._chunk_tail:
                chunk = self._chunk_tail + chunk
                self._chunk_tail = b''

            while chunk:

                # read next chunk size
                if self._chunk == ChunkState.PARSE_CHUNKED_SIZE:
                    pos = chunk.find(SEP)
                    if pos >= 0:
                        i = chunk.find(CHUNK_EXT, 0, pos)
                        if i >= 0:
                            size = chunk[:i]  # strip chunk-extensions
                        else:
                            size = chunk[:pos]

                        try:
                            size = int(bytes(size), 16)
                        except ValueError:
                            exc = TransferEncodingError(chunk[:pos])
                            self.payload.set_exception(exc)
                            raise exc from None

                        chunk = chunk[pos+2:]
                        if size == 0:  # eof marker
                            self._chunk = ChunkState.PARSE_MAYBE_TRAILERS
                        else:
                            self._chunk = ChunkState.PARSE_CHUNKED_CHUNK
                            self._chunk_size = size
                            self.payload.begin_http_chunk_receiving()
                    else:
                        self._chunk_tail = chunk
                        return False, None

                # read chunk and feed buffer
                if self._chunk == ChunkState.PARSE_CHUNKED_CHUNK:
                    required = self._chunk_size
                    chunk_len = len(chunk)

                    if required > chunk_len:
                        self._chunk_size = required - chunk_len
                        self.payload.feed_data(chunk, chunk_len)
                        return False, None
                    else:
                        self._chunk_size = 0
                        self.payload.feed_data(chunk[:required], required)
                        chunk = chunk[required:]
                        self._chunk = ChunkState.PARSE_CHUNKED_CHUNK_EOF
                        self.payload.end_http_chunk_receiving()

                # toss the CRLF at the end of the chunk
                if self._chunk == ChunkState.PARSE_CHUNKED_CHUNK_EOF:
                    if chunk[:2] == SEP:
                        chunk = chunk[2:]
                        self._chunk = ChunkState.PARSE_CHUNKED_SIZE
                    else:
                        self._chunk_tail = chunk
                        return False, None
                # if the stream does not contain trailers, after 0\r\n
                # we should get another \r\n; otherwise trailers need to
                # be skipped until \r\n\r\n
                if self._chunk == ChunkState.PARSE_MAYBE_TRAILERS:
                    if chunk[:2] == SEP:
                        # end of stream
                        self.payload.feed_eof()
                        return True, chunk[2:]
                    else:
                        self._chunk = ChunkState.PARSE_TRAILERS

                # read and discard trailer up to the CRLF terminator
                if self._chunk == ChunkState.PARSE_TRAILERS:
                    pos = chunk.find(SEP)
                    if pos >= 0:
                        chunk = chunk[pos+2:]
                        self._chunk = ChunkState.PARSE_MAYBE_TRAILERS
                    else:
                        self._chunk_tail = chunk
                        return False, None

        # Read all bytes until eof
        elif self._type == ParseState.PARSE_UNTIL_EOF:
            self.payload.feed_data(chunk, len(chunk))

        return False, None
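
    # Example of the chunked path (for illustration): feeding the wire bytes
    #   b'5\r\nhello\r\n0\r\n\r\n'
    # to a parser created with chunked=True results in
    # payload.feed_data(b'hello', 5) followed by payload.feed_eof(), and
    # feed_data() returns (True, b'') with no bytes left over.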


class DeflateBuffer:
    """DeflateBuffer decompresses incoming data and feeds it into the
    specified stream."""

    def __init__(self, out, encoding):
        self.out = out
        self.size = 0
        self.encoding = encoding
        self._started_decoding = False

        if encoding == 'br':
            if not HAS_BROTLI:  # pragma: no cover
                raise ContentEncodingError(
                    'Can not decode content-encoding: brotli (br). '
                    'Please install `brotlipy`')
            self.decompressor = brotli.Decompressor()
        else:
            zlib_mode = (16 + zlib.MAX_WBITS
                         if encoding == 'gzip' else -zlib.MAX_WBITS)
            self.decompressor = zlib.decompressobj(wbits=zlib_mode)

    def set_exception(self, exc):
        self.out.set_exception(exc)

    def feed_data(self, chunk, size):
        self.size += size
        try:
            chunk = self.decompressor.decompress(chunk)
        except Exception:
            if not self._started_decoding and self.encoding == 'deflate':
                self.decompressor = zlib.decompressobj()
                try:
                    chunk = self.decompressor.decompress(chunk)
                except Exception:
                    raise ContentEncodingError(
                        'Can not decode content-encoding: %s' % self.encoding)
            else:
                raise ContentEncodingError(
                    'Can not decode content-encoding: %s' % self.encoding)

        if chunk:
            self._started_decoding = True
            self.out.feed_data(chunk, len(chunk))

    def feed_eof(self):
        chunk = self.decompressor.flush()

        if chunk or self.size > 0:
            self.out.feed_data(chunk, len(chunk))
            if self.encoding == 'deflate' and not self.decompressor.eof:
                raise ContentEncodingError('deflate')

        self.out.feed_eof()

    def begin_http_chunk_receiving(self):
        self.out.begin_http_chunk_receiving()

    def end_http_chunk_receiving(self):
        self.out.end_http_chunk_receiving()
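
    # Usage sketch (for illustration): wrap a destination stream and feed it
    # compressed bytes; decompressed output is forwarded to `stream`:
    #
    #   buf = DeflateBuffer(stream, 'gzip')
    #   buf.feed_data(compressed_bytes, len(compressed_bytes))
    #   buf.feed_eof()   # flush the decompressor and signal end of stream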


HttpRequestParserPy = HttpRequestParser
HttpResponseParserPy = HttpResponseParser
RawRequestMessagePy = RawRequestMessage
RawResponseMessagePy = RawResponseMessage

try:
    if not NO_EXTENSIONS:  # pragma: no cover
        from ._http_parser import (HttpRequestParser,  # type: ignore  # noqa
                                   HttpResponseParser,
                                   RawRequestMessage,
                                   RawResponseMessage)
        HttpRequestParserC = HttpRequestParser
        HttpResponseParserC = HttpResponseParser
        RawRequestMessageC = RawRequestMessage
        RawResponseMessageC = RawResponseMessage
except ImportError:  # pragma: no cover
    pass