From 2011f3a0b2cf381d9c8e0b16614a6552a5b83a3a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?L=2E=20K=C3=A4rkk=C3=A4inen?= <98187+Tronic@users.noreply.github.com> Date: Tue, 27 Aug 2019 16:30:23 +0300 Subject: [PATCH] PEP 594 has cgi module scheduled for deprecation in Python 3.8 (#1649) * PEP 594 has cgi module scheduled for deprecation in Python 3.8. Reimplement cgi.parse_header in Sanic. The new implementation is much faster than either cgi.parse_header or equivalent werkzeug.parse_options_header, and unlike the two, handles also quoted values with semicolons or \" in them. * Fix string escape. * Useless linter complaints. * More linter issues * Add return type hint. * Do not support quoted-pair escapes. - Improved documentation and renamed the function more aptly as it only seems to apply to content-type and content-disposition headers. * Unquote filenames also in normal mode. * Add tests for headers. Adapted from CPython parse_header tests with changes on the final test. * Linter * Revert "Unquote filenames also in normal mode." This reverts commit bf0d502bcd5c443a4178f1c239692976c0f5f185. * Improved parse_content_header and added tests with Firefox and Chrome. - Unescaping of quotes moved to parse_content_header because it affects all fields, not just filenames. - It is impossible to handle all cases correctly but the current heuristics should suffice well for typical cases and beyond. - Added comparisons with cgi.parse_header and werkzeug.parse_options_header. * Updated comments as well. 
--- sanic/headers.py | 37 ++++++++++++++++++++++++++++ sanic/request.py | 6 ++--- tests/test_headers.py | 57 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 97 insertions(+), 3 deletions(-) create mode 100644 sanic/headers.py create mode 100644 tests/test_headers.py diff --git a/sanic/headers.py b/sanic/headers.py new file mode 100644 index 00000000..8062b57c --- /dev/null +++ b/sanic/headers.py @@ -0,0 +1,37 @@ +import re +import typing + + +Options = typing.Dict[str, str] # key=value fields in various headers + +token, quoted = r"([\w!#$%&'*+\-.^_`|~]+)", r'"([^"]*)"' +parameter = re.compile(fr";\s*{token}=(?:{token}|{quoted})", re.ASCII) +firefox_quote_escape = re.compile(r'\\"(?!; |\s*$)') + +# RFC's quoted-pair escapes are mostly ignored by browsers. Chrome, Firefox and +# curl all have different escaping, that we try to handle as well as possible, +# even though no client escapes in a way that would allow perfect handling. + +# For more information, consult ../tests/test_requests.py + + +def parse_content_header(value: str) -> typing.Tuple[str, Options]: + """Parse content-type and content-disposition header values. + + E.g. 'form-data; name=upload; filename=\"file.txt\"' to + ('form-data', {'name': 'upload', 'filename': 'file.txt'}) + + Mostly identical to cgi.parse_header and werkzeug.parse_options_header + but runs faster and handles special characters better. Unescapes quotes. 
+ """ + value = firefox_quote_escape.sub("%22", value) + pos = value.find(";") + if pos == -1: + options = {} + else: + options = { + m.group(1).lower(): m.group(2) or m.group(3).replace("%22", '"') + for m in parameter.finditer(value[pos:]) + } + value = value[:pos] + return value.strip().lower(), options diff --git a/sanic/request.py b/sanic/request.py index 8356d579..9663063b 100644 --- a/sanic/request.py +++ b/sanic/request.py @@ -4,7 +4,6 @@ import json import sys import warnings -from cgi import parse_header from collections import defaultdict, namedtuple from http.cookies import SimpleCookie from urllib.parse import parse_qs, parse_qsl, unquote, urlunparse @@ -12,6 +11,7 @@ from urllib.parse import parse_qs, parse_qsl, unquote, urlunparse from httptools import parse_url from sanic.exceptions import InvalidUsage +from sanic.headers import parse_content_header from sanic.log import error_logger, logger @@ -177,7 +177,7 @@ class Request(dict): content_type = self.headers.get( "Content-Type", DEFAULT_HTTP_CONTENT_TYPE ) - content_type, parameters = parse_header(content_type) + content_type, parameters = parse_content_header(content_type) try: if content_type == "application/x-www-form-urlencoded": self.parsed_form = RequestParameters( @@ -561,7 +561,7 @@ def parse_multipart_form(body, boundary): colon_index = form_line.index(":") form_header_field = form_line[0:colon_index].lower() - form_header_value, form_parameters = parse_header( + form_header_value, form_parameters = parse_content_header( form_line[colon_index + 2 :] ) diff --git a/tests/test_headers.py b/tests/test_headers.py new file mode 100644 index 00000000..e228f386 --- /dev/null +++ b/tests/test_headers.py @@ -0,0 +1,57 @@ +import pytest + +from sanic import headers + + +@pytest.mark.parametrize( + "input, expected", + [ + ("text/plain", ("text/plain", {})), + ("text/vnd.just.made.this.up ; ", ("text/vnd.just.made.this.up", {})), + ("text/plain;charset=us-ascii", ("text/plain", {"charset": 
"us-ascii"})), + ('text/plain ; charset="us-ascii"', ("text/plain", {"charset": "us-ascii"})), + ( + 'text/plain ; charset="us-ascii"; another=opt', + ("text/plain", {"charset": "us-ascii", "another": "opt"}) + ), + ( + 'attachment; filename="silly.txt"', + ("attachment", {"filename": "silly.txt"}) + ), + ( + 'attachment; filename="strange;name"', + ("attachment", {"filename": "strange;name"}) + ), + ( + 'attachment; filename="strange;name";size=123;', + ("attachment", {"filename": "strange;name", "size": "123"}) + ), + ( + 'form-data; name="files"; filename="fo\\"o;bar\\"', + ('form-data', {'name': 'files', 'filename': 'fo"o;bar\\'}) + # cgi.parse_header: + # ('form-data', {'name': 'files', 'filename': 'fo"o;bar\\'}) + # werkzeug.parse_options_header: + # ('form-data', {'name': 'files', 'filename': '"fo\\"o', 'bar\\"': None}) + ), + # with Unicode filename! + ( + # Chrome: + # Content-Disposition: form-data; name="foo%22;bar\"; filename="😀" + 'form-data; name="foo%22;bar\\"; filename="😀"', + ('form-data', {'name': 'foo";bar\\', 'filename': '😀'}) + # cgi: ('form-data', {'name': 'foo%22;bar"; filename="😀'}) + # werkzeug: ('form-data', {'name': 'foo%22;bar"; filename='}) + ), + ( + # Firefox: + # Content-Disposition: form-data; name="foo\";bar\"; filename="😀" + 'form-data; name="foo\\";bar\\"; filename="😀"', + ('form-data', {'name': 'foo";bar\\', 'filename': '😀'}) + # cgi: ('form-data', {'name': 'foo";bar"; filename="😀'}) + # werkzeug: ('form-data', {'name': 'foo";bar"; filename='}) + ), + ] +) +def test_parse_headers(input, expected): + assert headers.parse_content_header(input) == expected