PEP 594 has cgi module scheduled for deprecation in Python 3.8 (#1649)

* PEP 594 has cgi module scheduled for deprecation in Python 3.8. Reimplement
cgi.parse_header in Sanic. The new implementation is much faster than either
cgi.parse_header or equivalent werkzeug.parse_options_header, and unlike the
two, handles also quoted values with semicolons or \" in them.

* Fix string escape.

* Useless linter complaints.

* More linter issues

* Add return type hint.

* Do not support quoted-pair escapes.

- Improved documentation and renamed the function more aptly as it only seems
  to apply to content-type and content-disposition headers.

* Unquote filenames also in normal mode.

* Add tests for headers. Adapted from CPython parse_header tests with changes on the final test.

* Linter

* Revert "Unquote filenames also in normal mode."

This reverts commit bf0d502bcd.

* Improved parse_content_header and added tests with Firefox and Chrome.

- Unescaping of quotes moved to parse_content_header because it affects all fields,
  not just filenames.
- It is impossible to handle all cases correctly but the current heuristics should
  suffice well for typical cases and beyond.
- Added comparisons with cgi.parse_header and werkzeug.parse_options_header.

* Updated comments as well.
This commit is contained in:
L. Kärkkäinen
2019-08-27 16:30:23 +03:00
committed by Stephen Sadowski
parent 228a31ee0a
commit 2011f3a0b2
3 changed files with 97 additions and 3 deletions

37
sanic/headers.py Normal file
View File

@@ -0,0 +1,37 @@
import re
import typing
Options = typing.Dict[str, str] # key=value fields in various headers
token, quoted = r"([\w!#$%&'*+\-.^_`|~]+)", r'"([^"]*)"'
parameter = re.compile(fr";\s*{token}=(?:{token}|{quoted})", re.ASCII)
firefox_quote_escape = re.compile(r'\\"(?!; |\s*$)')
# RFC's quoted-pair escapes are mostly ignored by browsers. Chrome, Firefox and
# curl all have different escaping, that we try to handle as well as possible,
# even though no client espaces in a way that would allow perfect handling.
# For more information, consult ../tests/test_requests.py
def parse_content_header(value: str) -> typing.Tuple[str, Options]:
"""Parse content-type and content-disposition header values.
E.g. 'form-data; name=upload; filename=\"file.txt\"' to
('form-data', {'name': 'upload', 'filename': 'file.txt'})
Mostly identical to cgi.parse_header and werkzeug.parse_options_header
but runs faster and handles special characters better. Unescapes quotes.
"""
value = firefox_quote_escape.sub("%22", value)
pos = value.find(";")
if pos == -1:
options = {}
else:
options = {
m.group(1).lower(): m.group(2) or m.group(3).replace("%22", '"')
for m in parameter.finditer(value[pos:])
}
value = value[:pos]
return value.strip().lower(), options

View File

@@ -4,7 +4,6 @@ import json
import sys
import warnings
from cgi import parse_header
from collections import defaultdict, namedtuple
from http.cookies import SimpleCookie
from urllib.parse import parse_qs, parse_qsl, unquote, urlunparse
@@ -12,6 +11,7 @@ from urllib.parse import parse_qs, parse_qsl, unquote, urlunparse
from httptools import parse_url
from sanic.exceptions import InvalidUsage
from sanic.headers import parse_content_header
from sanic.log import error_logger, logger
@@ -177,7 +177,7 @@ class Request(dict):
content_type = self.headers.get(
"Content-Type", DEFAULT_HTTP_CONTENT_TYPE
)
content_type, parameters = parse_header(content_type)
content_type, parameters = parse_content_header(content_type)
try:
if content_type == "application/x-www-form-urlencoded":
self.parsed_form = RequestParameters(
@@ -561,7 +561,7 @@ def parse_multipart_form(body, boundary):
colon_index = form_line.index(":")
form_header_field = form_line[0:colon_index].lower()
form_header_value, form_parameters = parse_header(
form_header_value, form_parameters = parse_content_header(
form_line[colon_index + 2 :]
)