From 2011f3a0b2cf381d9c8e0b16614a6552a5b83a3a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?L=2E=20K=C3=A4rkk=C3=A4inen?=
 <98187+Tronic@users.noreply.github.com>
Date: Tue, 27 Aug 2019 16:30:23 +0300
Subject: [PATCH] PEP 594 has cgi module scheduled for deprecation in Python
 3.8 (#1649)

* PEP 594 has cgi module scheduled for deprecation in Python 3.8. Reimplement
cgi.parse_header in Sanic. The new implementation is much faster than either
cgi.parse_header or the equivalent werkzeug.parse_options_header and, unlike
the two, also handles quoted values with semicolons or \" in them.

* Fix string escape.

* Useless linter complaints.

* More linter issues

* Add return type hint.

* Do not support quoted-pair escapes.

- Improved documentation and renamed the function more aptly as it only seems
  to apply to content-type and content-disposition headers.

* Unquote filenames also in normal mode.

* Add tests for headers. Adapted from CPython parse_header tests with changes on the final test.

* Linter

* Revert "Unquote filenames also in normal mode."

This reverts commit bf0d502bcd5c443a4178f1c239692976c0f5f185.

* Improved parse_content_header and added tests with Firefox and Chrome.

- Unescaping of quotes moved to parse_content_header because it affects all fields,
  not just filenames.
- It is impossible to handle all cases correctly but the current heuristics should
  suffice well for typical cases and beyond.
- Added comparisons with cgi.parse_header and werkzeug.parse_options_header.

* Updated comments as well.
---
 sanic/headers.py      | 37 ++++++++++++++++++++++++++++
 sanic/request.py      |  6 ++---
 tests/test_headers.py | 57 +++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 97 insertions(+), 3 deletions(-)
 create mode 100644 sanic/headers.py
 create mode 100644 tests/test_headers.py

diff --git a/sanic/headers.py b/sanic/headers.py
new file mode 100644
index 00000000..8062b57c
--- /dev/null
+++ b/sanic/headers.py
@@ -0,0 +1,37 @@
+import re
+import typing
+
+
+Options = typing.Dict[str, str]  # key=value fields in various headers
+
+token, quoted = r"([\w!#$%&'*+\-.^_`|~]+)", r'"([^"]*)"'
+parameter = re.compile(fr";\s*{token}=(?:{token}|{quoted})", re.ASCII)
+firefox_quote_escape = re.compile(r'\\"(?!; |\s*$)')
+
+# RFC's quoted-pair escapes are mostly ignored by browsers. Chrome, Firefox and
+# curl all have different escaping, that we try to handle as well as possible,
+# even though no client escapes in a way that would allow perfect handling.
+
+# For more information, consult ../tests/test_headers.py
+
+
+def parse_content_header(value: str) -> typing.Tuple[str, Options]:
+    """Parse content-type and content-disposition header values.
+
+    E.g. 'form-data; name=upload; filename=\"file.txt\"' to
+    ('form-data', {'name': 'upload', 'filename': 'file.txt'})
+    The main value is stripped and lowercased; option names are lowercased.
+    Mostly identical to cgi.parse_header and werkzeug.parse_options_header
+    but runs faster and handles special characters better. Unescapes quotes.
+    """
+    value = firefox_quote_escape.sub("%22", value)  # \" -> %22, like Chrome
+    pos = value.find(";")  # options, if any, follow the first semicolon
+    if pos == -1:
+        options = {}
+    else:
+        options = {
+            m.group(1).lower(): m.group(2) or m.group(3).replace("%22", '"')
+            for m in parameter.finditer(value[pos:])
+        }
+        value = value[:pos]
+    return value.strip().lower(), options
diff --git a/sanic/request.py b/sanic/request.py
index 8356d579..9663063b 100644
--- a/sanic/request.py
+++ b/sanic/request.py
@@ -4,7 +4,6 @@ import json
 import sys
 import warnings
 
-from cgi import parse_header
 from collections import defaultdict, namedtuple
 from http.cookies import SimpleCookie
 from urllib.parse import parse_qs, parse_qsl, unquote, urlunparse
@@ -12,6 +11,7 @@ from urllib.parse import parse_qs, parse_qsl, unquote, urlunparse
 from httptools import parse_url
 
 from sanic.exceptions import InvalidUsage
+from sanic.headers import parse_content_header
 from sanic.log import error_logger, logger
 
 
@@ -177,7 +177,7 @@ class Request(dict):
             content_type = self.headers.get(
                 "Content-Type", DEFAULT_HTTP_CONTENT_TYPE
             )
-            content_type, parameters = parse_header(content_type)
+            content_type, parameters = parse_content_header(content_type)
             try:
                 if content_type == "application/x-www-form-urlencoded":
                     self.parsed_form = RequestParameters(
@@ -561,7 +561,7 @@ def parse_multipart_form(body, boundary):
 
             colon_index = form_line.index(":")
             form_header_field = form_line[0:colon_index].lower()
-            form_header_value, form_parameters = parse_header(
+            form_header_value, form_parameters = parse_content_header(
                 form_line[colon_index + 2 :]
             )
 
diff --git a/tests/test_headers.py b/tests/test_headers.py
new file mode 100644
index 00000000..e228f386
--- /dev/null
+++ b/tests/test_headers.py
@@ -0,0 +1,57 @@
+import pytest
+
+from sanic import headers
+
+
+@pytest.mark.parametrize(
+    "input, expected",
+    [
+        ("text/plain", ("text/plain", {})),
+        ("text/vnd.just.made.this.up ; ", ("text/vnd.just.made.this.up", {})),
+        ("text/plain;charset=us-ascii", ("text/plain", {"charset": "us-ascii"})),
+        ('text/plain ; charset="us-ascii"', ("text/plain", {"charset": "us-ascii"})),
+        (
+            'text/plain ; charset="us-ascii"; another=opt',
+            ("text/plain", {"charset": "us-ascii", "another": "opt"})
+        ),
+        (
+            'attachment; filename="silly.txt"',
+            ("attachment", {"filename": "silly.txt"})
+        ),
+        (
+            'attachment; filename="strange;name"',
+            ("attachment", {"filename": "strange;name"})
+        ),
+        (
+            'attachment; filename="strange;name";size=123;',
+            ("attachment", {"filename": "strange;name", "size": "123"})
+        ),
+        (
+            'form-data; name="files"; filename="fo\\"o;bar\\"',
+            ('form-data', {'name': 'files', 'filename': 'fo"o;bar\\'})
+            # cgi.parse_header gives the same result:
+            # ('form-data', {'name': 'files', 'filename': 'fo"o;bar\\'})
+            # werkzeug.parse_options_header splits at the quoted semicolon:
+            # ('form-data', {'name': 'files', 'filename': '"fo\\"o', 'bar\\"': None})
+        ),
+        # <input type=file name="foo&quot;;bar\"> with Unicode filename!
+        (
+            # Chrome:
+            # Content-Disposition: form-data; name="foo%22;bar\"; filename="😀"
+            'form-data; name="foo%22;bar\\"; filename="😀"',
+            ('form-data', {'name': 'foo";bar\\', 'filename': '😀'})
+            # cgi: ('form-data', {'name': 'foo%22;bar"; filename="😀'})
+            # werkzeug: ('form-data', {'name': 'foo%22;bar"; filename='})
+        ),
+        (
+            # Firefox:
+            # Content-Disposition: form-data; name="foo\";bar\"; filename="😀"
+            'form-data; name="foo\\";bar\\"; filename="😀"',
+            ('form-data', {'name': 'foo";bar\\', 'filename': '😀'})
+            # cgi: ('form-data', {'name': 'foo";bar"; filename="😀'})
+            # werkzeug: ('form-data', {'name': 'foo";bar"; filename='})
+        ),
+    ]
+)
+def test_parse_headers(input, expected):
+    assert headers.parse_content_header(input) == expected