import collections import re import string import zlib from enum import IntEnum from multidict import CIMultiDict, CIMultiDictProxy from yarl import URL from . import hdrs from .helpers import NO_EXTENSIONS from .http_exceptions import (BadStatusLine, ContentEncodingError, ContentLengthError, InvalidHeader, LineTooLong, TransferEncodingError) from .http_writer import HttpVersion, HttpVersion10 from .log import internal_logger from .streams import EMPTY_PAYLOAD, StreamReader try: import brotli HAS_BROTLI = True except ImportError: # pragma: no cover HAS_BROTLI = False __all__ = ( 'HttpParser', 'HttpRequestParser', 'HttpResponseParser', 'RawRequestMessage', 'RawResponseMessage') ASCIISET = set(string.printable) METHRE = re.compile('[A-Z0-9$-_.]+') VERSRE = re.compile(r'HTTP/(\d+).(\d+)') HDRRE = re.compile(rb'[\x00-\x1F\x7F()<>@,;:\[\]={} \t\\\\\"]') RawRequestMessage = collections.namedtuple( 'RawRequestMessage', ['method', 'path', 'version', 'headers', 'raw_headers', 'should_close', 'compression', 'upgrade', 'chunked', 'url']) RawResponseMessage = collections.namedtuple( 'RawResponseMessage', ['version', 'code', 'reason', 'headers', 'raw_headers', 'should_close', 'compression', 'upgrade', 'chunked']) class ParseState(IntEnum): PARSE_NONE = 0 PARSE_LENGTH = 1 PARSE_CHUNKED = 2 PARSE_UNTIL_EOF = 3 class ChunkState(IntEnum): PARSE_CHUNKED_SIZE = 0 PARSE_CHUNKED_CHUNK = 1 PARSE_CHUNKED_CHUNK_EOF = 2 PARSE_MAYBE_TRAILERS = 3 PARSE_TRAILERS = 4 class HttpParser: def __init__(self, protocol=None, loop=None, max_line_size=8190, max_headers=32768, max_field_size=8190, timer=None, code=None, method=None, readall=False, payload_exception=None, response_with_body=True, read_until_eof=False, auto_decompress=True): self.protocol = protocol self.loop = loop self.max_line_size = max_line_size self.max_headers = max_headers self.max_field_size = max_field_size self.timer = timer self.code = code self.method = method self.readall = readall self.payload_exception = payload_exception self.response_with_body = response_with_body self.read_until_eof = read_until_eof self._lines = [] self._tail = b'' self._upgraded = False self._payload = None self._payload_parser = None self._auto_decompress = auto_decompress def feed_eof(self): if self._payload_parser is not None: self._payload_parser.feed_eof() self._payload_parser = None else: # try to extract partial message if self._tail: self._lines.append(self._tail) if self._lines: if self._lines[-1] != '\r\n': self._lines.append('') try: return self.parse_message(self._lines) except Exception: return None def feed_data(self, data, SEP=b'\r\n', EMPTY=b'', CONTENT_LENGTH=hdrs.CONTENT_LENGTH, METH_CONNECT=hdrs.METH_CONNECT, SEC_WEBSOCKET_KEY1=hdrs.SEC_WEBSOCKET_KEY1): messages = [] if self._tail: data, self._tail = self._tail + data, b'' data_len = len(data) start_pos = 0 loop = self.loop while start_pos < data_len: # read HTTP message (request/response line + headers), \r\n\r\n # and split by lines if self._payload_parser is None and not self._upgraded: pos = data.find(SEP, start_pos) # consume \r\n if pos == start_pos and not self._lines: start_pos = pos + 2 continue if pos >= start_pos: # line found self._lines.append(data[start_pos:pos]) start_pos = pos + 2 # \r\n\r\n found if self._lines[-1] == EMPTY: try: msg = self.parse_message(self._lines) finally: self._lines.clear() # payload length length = msg.headers.get(CONTENT_LENGTH) if length is not None: try: length = int(length) except ValueError: raise InvalidHeader(CONTENT_LENGTH) if length < 0: raise InvalidHeader(CONTENT_LENGTH) # do not support old websocket spec if SEC_WEBSOCKET_KEY1 in msg.headers: raise InvalidHeader(SEC_WEBSOCKET_KEY1) self._upgraded = msg.upgrade method = getattr(msg, 'method', self.method) # calculate payload if ((length is not None and length > 0) or msg.chunked and not msg.upgrade): payload = StreamReader( self.protocol, timer=self.timer, loop=loop) payload_parser = HttpPayloadParser( payload, length=length, chunked=msg.chunked, method=method, compression=msg.compression, code=self.code, readall=self.readall, response_with_body=self.response_with_body, auto_decompress=self._auto_decompress) if not payload_parser.done: self._payload_parser = payload_parser elif method == METH_CONNECT: payload = StreamReader( self.protocol, timer=self.timer, loop=loop) self._upgraded = True self._payload_parser = HttpPayloadParser( payload, method=msg.method, compression=msg.compression, readall=True, auto_decompress=self._auto_decompress) else: if (getattr(msg, 'code', 100) >= 199 and length is None and self.read_until_eof): payload = StreamReader( self.protocol, timer=self.timer, loop=loop) payload_parser = HttpPayloadParser( payload, length=length, chunked=msg.chunked, method=method, compression=msg.compression, code=self.code, readall=True, response_with_body=self.response_with_body, auto_decompress=self._auto_decompress) if not payload_parser.done: self._payload_parser = payload_parser else: payload = EMPTY_PAYLOAD messages.append((msg, payload)) else: self._tail = data[start_pos:] data = EMPTY break # no parser, just store elif self._payload_parser is None and self._upgraded: assert not self._lines break # feed payload elif data and start_pos < data_len: assert not self._lines try: eof, data = self._payload_parser.feed_data( data[start_pos:]) except BaseException as exc: if self.payload_exception is not None: self._payload_parser.payload.set_exception( self.payload_exception(str(exc))) else: self._payload_parser.payload.set_exception(exc) eof = True data = b'' if eof: start_pos = 0 data_len = len(data) self._payload_parser = None continue else: break if data and start_pos < data_len: data = data[start_pos:] else: data = EMPTY return messages, self._upgraded, data def parse_headers(self, lines): """Parses RFC 5322 headers from a stream. Line continuations are supported. Returns list of header name and value pairs. Header name is in upper case. """ headers = CIMultiDict() raw_headers = [] lines_idx = 1 line = lines[1] line_count = len(lines) while line: # Parse initial header name : value pair. try: bname, bvalue = line.split(b':', 1) except ValueError: raise InvalidHeader(line) from None bname = bname.strip(b' \t') bvalue = bvalue.lstrip() if HDRRE.search(bname): raise InvalidHeader(bname) if len(bname) > self.max_field_size: raise LineTooLong( "request header name {}".format( bname.decode("utf8", "xmlcharrefreplace")), self.max_field_size, len(bname)) header_length = len(bvalue) # next line lines_idx += 1 line = lines[lines_idx] # consume continuation lines continuation = line and line[0] in (32, 9) # (' ', '\t') if continuation: bvalue = [bvalue] while continuation: header_length += len(line) if header_length > self.max_field_size: raise LineTooLong( 'request header field {}'.format( bname.decode("utf8", "xmlcharrefreplace")), self.max_field_size, header_length) bvalue.append(line) # next line lines_idx += 1 if lines_idx < line_count: line = lines[lines_idx] if line: continuation = line[0] in (32, 9) # (' ', '\t') else: line = b'' break bvalue = b''.join(bvalue) else: if header_length > self.max_field_size: raise LineTooLong( 'request header field {}'.format( bname.decode("utf8", "xmlcharrefreplace")), self.max_field_size, header_length) bvalue = bvalue.strip() name = bname.decode('utf-8', 'surrogateescape') value = bvalue.decode('utf-8', 'surrogateescape') headers.add(name, value) raw_headers.append((bname, bvalue)) close_conn = None encoding = None upgrade = False chunked = False raw_headers = tuple(raw_headers) headers = CIMultiDictProxy(headers) # keep-alive conn = headers.get(hdrs.CONNECTION) if conn: v = conn.lower() if v == 'close': close_conn = True elif v == 'keep-alive': close_conn = False elif v == 'upgrade': upgrade = True # encoding enc = headers.get(hdrs.CONTENT_ENCODING) if enc: enc = enc.lower() if enc in ('gzip', 'deflate', 'br'): encoding = enc # chunking te = headers.get(hdrs.TRANSFER_ENCODING) if te and 'chunked' in te.lower(): chunked = True return headers, raw_headers, close_conn, encoding, upgrade, chunked class HttpRequestParser(HttpParser): """Read request status line. Exception .http_exceptions.BadStatusLine could be raised in case of any errors in status line. Returns RawRequestMessage. """ def parse_message(self, lines): # request line line = lines[0].decode('utf-8', 'surrogateescape') try: method, path, version = line.split(None, 2) except ValueError: raise BadStatusLine(line) from None if len(path) > self.max_line_size: raise LineTooLong( 'Status line is too long', self.max_line_size, len(path)) # method method = method.upper() if not METHRE.match(method): raise BadStatusLine(method) # version try: if version.startswith('HTTP/'): n1, n2 = version[5:].split('.', 1) version = HttpVersion(int(n1), int(n2)) else: raise BadStatusLine(version) except Exception: raise BadStatusLine(version) # read headers (headers, raw_headers, close, compression, upgrade, chunked) = self.parse_headers(lines) if close is None: # then the headers weren't set in the request if version <= HttpVersion10: # HTTP 1.0 must asks to not close close = True else: # HTTP 1.1 must ask to close. close = False return RawRequestMessage( method, path, version, headers, raw_headers, close, compression, upgrade, chunked, URL(path)) class HttpResponseParser(HttpParser): """Read response status line and headers. BadStatusLine could be raised in case of any errors in status line. Returns RawResponseMessage""" def parse_message(self, lines): line = lines[0].decode('utf-8', 'surrogateescape') try: version, status = line.split(None, 1) except ValueError: raise BadStatusLine(line) from None try: status, reason = status.split(None, 1) except ValueError: reason = '' if len(reason) > self.max_line_size: raise LineTooLong( 'Status line is too long', self.max_line_size, len(reason)) # version match = VERSRE.match(version) if match is None: raise BadStatusLine(line) version = HttpVersion(int(match.group(1)), int(match.group(2))) # The status code is a three-digit number try: status = int(status) except ValueError: raise BadStatusLine(line) from None if status > 999: raise BadStatusLine(line) # read headers (headers, raw_headers, close, compression, upgrade, chunked) = self.parse_headers(lines) if close is None: close = version <= HttpVersion10 return RawResponseMessage( version, status, reason.strip(), headers, raw_headers, close, compression, upgrade, chunked) class HttpPayloadParser: def __init__(self, payload, length=None, chunked=False, compression=None, code=None, method=None, readall=False, response_with_body=True, auto_decompress=True): self.payload = payload self._length = 0 self._type = ParseState.PARSE_NONE self._chunk = ChunkState.PARSE_CHUNKED_SIZE self._chunk_size = 0 self._chunk_tail = b'' self._auto_decompress = auto_decompress self.done = False # payload decompression wrapper if response_with_body and compression and self._auto_decompress: payload = DeflateBuffer(payload, compression) # payload parser if not response_with_body: # don't parse payload if it's not expected to be received self._type = ParseState.PARSE_NONE payload.feed_eof() self.done = True elif chunked: self._type = ParseState.PARSE_CHUNKED elif length is not None: self._type = ParseState.PARSE_LENGTH self._length = length if self._length == 0: payload.feed_eof() self.done = True else: if readall and code != 204: self._type = ParseState.PARSE_UNTIL_EOF elif method in ('PUT', 'POST'): internal_logger.warning( # pragma: no cover 'Content-Length or Transfer-Encoding header is required') self._type = ParseState.PARSE_NONE payload.feed_eof() self.done = True self.payload = payload def feed_eof(self): if self._type == ParseState.PARSE_UNTIL_EOF: self.payload.feed_eof() elif self._type == ParseState.PARSE_LENGTH: raise ContentLengthError( "Not enough data for satisfy content length header.") elif self._type == ParseState.PARSE_CHUNKED: raise TransferEncodingError( "Not enough data for satisfy transfer length header.") def feed_data(self, chunk, SEP=b'\r\n', CHUNK_EXT=b';'): # Read specified amount of bytes if self._type == ParseState.PARSE_LENGTH: required = self._length chunk_len = len(chunk) if required >= chunk_len: self._length = required - chunk_len self.payload.feed_data(chunk, chunk_len) if self._length == 0: self.payload.feed_eof() return True, b'' else: self._length = 0 self.payload.feed_data(chunk[:required], required) self.payload.feed_eof() return True, chunk[required:] # Chunked transfer encoding parser elif self._type == ParseState.PARSE_CHUNKED: if self._chunk_tail: chunk = self._chunk_tail + chunk self._chunk_tail = b'' while chunk: # read next chunk size if self._chunk == ChunkState.PARSE_CHUNKED_SIZE: pos = chunk.find(SEP) if pos >= 0: i = chunk.find(CHUNK_EXT, 0, pos) if i >= 0: size = chunk[:i] # strip chunk-extensions else: size = chunk[:pos] try: size = int(bytes(size), 16) except ValueError: exc = TransferEncodingError(chunk[:pos]) self.payload.set_exception(exc) raise exc from None chunk = chunk[pos+2:] if size == 0: # eof marker self._chunk = ChunkState.PARSE_MAYBE_TRAILERS else: self._chunk = ChunkState.PARSE_CHUNKED_CHUNK self._chunk_size = size self.payload.begin_http_chunk_receiving() else: self._chunk_tail = chunk return False, None # read chunk and feed buffer if self._chunk == ChunkState.PARSE_CHUNKED_CHUNK: required = self._chunk_size chunk_len = len(chunk) if required > chunk_len: self._chunk_size = required - chunk_len self.payload.feed_data(chunk, chunk_len) return False, None else: self._chunk_size = 0 self.payload.feed_data(chunk[:required], required) chunk = chunk[required:] self._chunk = ChunkState.PARSE_CHUNKED_CHUNK_EOF self.payload.end_http_chunk_receiving() # toss the CRLF at the end of the chunk if self._chunk == ChunkState.PARSE_CHUNKED_CHUNK_EOF: if chunk[:2] == SEP: chunk = chunk[2:] self._chunk = ChunkState.PARSE_CHUNKED_SIZE else: self._chunk_tail = chunk return False, None # if stream does not contain trailer, after 0\r\n # we should get another \r\n otherwise # trailers needs to be skiped until \r\n\r\n if self._chunk == ChunkState.PARSE_MAYBE_TRAILERS: if chunk[:2] == SEP: # end of stream self.payload.feed_eof() return True, chunk[2:] else: self._chunk = ChunkState.PARSE_TRAILERS # read and discard trailer up to the CRLF terminator if self._chunk == ChunkState.PARSE_TRAILERS: pos = chunk.find(SEP) if pos >= 0: chunk = chunk[pos+2:] self._chunk = ChunkState.PARSE_MAYBE_TRAILERS else: self._chunk_tail = chunk return False, None # Read all bytes until eof elif self._type == ParseState.PARSE_UNTIL_EOF: self.payload.feed_data(chunk, len(chunk)) return False, None class DeflateBuffer: """DeflateStream decompress stream and feed data into specified stream.""" def __init__(self, out, encoding): self.out = out self.size = 0 self.encoding = encoding self._started_decoding = False if encoding == 'br': if not HAS_BROTLI: # pragma: no cover raise ContentEncodingError( 'Can not decode content-encoding: brotli (br). ' 'Please install `brotlipy`') self.decompressor = brotli.Decompressor() else: zlib_mode = (16 + zlib.MAX_WBITS if encoding == 'gzip' else -zlib.MAX_WBITS) self.decompressor = zlib.decompressobj(wbits=zlib_mode) def set_exception(self, exc): self.out.set_exception(exc) def feed_data(self, chunk, size): self.size += size try: chunk = self.decompressor.decompress(chunk) except Exception: if not self._started_decoding and self.encoding == 'deflate': self.decompressor = zlib.decompressobj() try: chunk = self.decompressor.decompress(chunk) except Exception: raise ContentEncodingError( 'Can not decode content-encoding: %s' % self.encoding) else: raise ContentEncodingError( 'Can not decode content-encoding: %s' % self.encoding) if chunk: self._started_decoding = True self.out.feed_data(chunk, len(chunk)) def feed_eof(self): chunk = self.decompressor.flush() if chunk or self.size > 0: self.out.feed_data(chunk, len(chunk)) if self.encoding == 'deflate' and not self.decompressor.eof: raise ContentEncodingError('deflate') self.out.feed_eof() def begin_http_chunk_receiving(self): self.out.begin_http_chunk_receiving() def end_http_chunk_receiving(self): self.out.end_http_chunk_receiving() HttpRequestParserPy = HttpRequestParser HttpResponseParserPy = HttpResponseParser RawRequestMessagePy = RawRequestMessage RawResponseMessagePy = RawResponseMessage try: if not NO_EXTENSIONS: # pragma: no cover from ._http_parser import (HttpRequestParser, # type: ignore # noqa HttpResponseParser, RawRequestMessage, RawResponseMessage) HttpRequestParserC = HttpRequestParser HttpResponseParserC = HttpResponseParser RawRequestMessageC = RawRequestMessage RawResponseMessageC = RawResponseMessage except ImportError: # pragma: no cover pass