Arguably (and I do argue it), the `email` module is barely worth it for parsing POST request bodies. "What you're doing wrong" could be argued (though I haven't) to be "trying to parse a POST request in 2023 without 3rd-party libraries".
WSGI, which is being pushed as the essential replacement for CGI, doesn't provide everything the `email` library needs to parse the request body all in one place. You'll need to inject a header beforehand to prime it with (a) the fact that the message is multipart, and (b) what the boundary between the form fields is. Then you need to call `Message.iter_parts()`, and on each of the resulting parts call `Message.get_payload()` for the value, `Message.get_filename()` for the names of any file attachment uploads, and the "legacy" function `get_param('name', header='content-disposition')` for field names, because the non-legacy replacement, `Message.get_name()`, doesn't seem to exist yet.
Here's a working draft I threw together this afternoon; I don't like it, but can't see any better approaches for now:
import wsgiref.simple_server
import email.message, email.parser, email.policy
import urllib.parse
import re
from collections import namedtuple
# Use resource.getpagesize() where the `resource` module can be imported;
# otherwise define an equivalent helper on top of mmap.PAGESIZE.
try:
    from resource import getpagesize
except ImportError:
    import mmap

    def getpagesize():
        """Return ``mmap.PAGESIZE`` as a stand-in for ``resource.getpagesize()``."""
        page_size = mmap.PAGESIZE
        return page_size
# One parsed form field: its name, its value, and (where the source provides
# them) the upload's filename and MIME type.
FieldEntry = namedtuple(
    'FieldEntry',
    ('name', 'value', 'filename', 'MIMEtype'),
)
# Informal, string-only description of the element types of a FieldEntry.
FieldEntry_T = 'tuple[str, Optional[bytes], Optional[str], Optional[str]]'
def wsgi_parseForm(environ, /):
if environ['REQUEST_METHOD'] == 'GET':
for k, v in _parse_qs(environ['QUERY_STRING']):
yield FieldEntry(k, v, None, None)
return
m = email.message.Message()
m.add_header('Content-Type', environ['CONTENT_TYPE'])
match m.get_content_type():
case 'application/x-www-form-urlencoded':
for k, v in _parse_qs(bytes().join(_wsgi_body(environ)).decode('ascii')):
yield FieldEntry(k, v, None, None)
return
case 'multipart/form-data':
p = email.parser.BytesFeedParser(policy=email.policy.HTTP)
p.feed(('Content-Type: %s\r\n' % environ['CONTENT_TYPE']).encode('utf-8'))
# ^Don't try to abbreviate this line; it also injects the boundary parameter, which is needed to parse out the sub-messages!
p.feed('\r\n'.encode('utf-8'))
for chunk in _wsgi_body(environ):
# TODO stream each element to the caller as they arrive
# rather than loading them all into RAM at the same time
p.feed(chunk)
m = p.close(); del p
assert m.is_multipart()
for part in m.iter_parts():
part.set_default_type(None)
yield FieldEntry(
part.get_param('name', header='content-disposition'),
part.get_payload(decode=True),
part.get_filename(None),
part.get_content_type()
)
case t:
raise ValueError('unexpected Content-Type: %s' % t)
def _wsgi_body(environ, /):
    """Yield the request body from ``environ['wsgi.input']`` as byte chunks.

    Three cases are handled, in this order:

    * ``wsgi.input_terminated`` is truthy: the stream is consumed until it
      reports end-of-input, first by iterating it and, if that raises
      ``TypeError``/``NotImplementedError``, by repeated reads.
    * An ``HTTP_TRANSFER_ENCODING`` key is present: not supported.
    * Otherwise exactly ``CONTENT_LENGTH`` bytes are read (fewer only if the
      stream ends early); a missing or empty value means no body.

    Args:
        environ: The WSGI environment dictionary of the request.

    Yields:
        Non-empty ``bytes`` chunks of the body, in order.

    Raises:
        KeyError: If ``wsgi.input`` is missing from ``environ``.
        NotImplementedError: If the request carries a Transfer-Encoding
            header and the input is not flagged as terminated.
        ValueError: If ``CONTENT_LENGTH`` is not an integer.
    """
    # Workaround helper function for https://github.com/python/cpython/issues/66077
    wsgi_input = environ['wsgi.input']
    pagesize = getpagesize()
    try:
        # Prefer read1() when the stream offers it; otherwise use read().
        _read = wsgi_input.read1
    except AttributeError:
        _read = wsgi_input.read

    if environ.get('wsgi.input_terminated', False):
        # https://github.com/GrahamDumpleton/mod_wsgi/blob/4.9.4/docs/configuration-directives/WSGIChunkedRequest.rst
        try:
            yield from iter(wsgi_input)
            return
        except (TypeError, NotImplementedError):
            while chunk := _read(pagesize):
                yield chunk
            return

    if 'HTTP_TRANSFER_ENCODING' in environ:
        # https://mail.python.org/pipermail/web-sig/2007-March/002630.html
        # https://wsgi.readthedocs.io/en/latest/proposals-2.0.html#unknown-length-wsgi-input
        # https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Transfer-Encoding#chunked_encoding
        raise NotImplementedError("Transfer-Encoding: %s" % environ['HTTP_TRANSFER_ENCODING'])

    # Weirdly, CONTENT_LENGTH is set to the empty string for requests with no
    # body, hence the or-clause.
    remaining = int(environ.get('CONTENT_LENGTH', 0) or 0)
    while remaining > 0:
        # Never request more than is left of the declared body: asking for
        # more can block on a socket or consume bytes past this request.
        chunk = _read(min(pagesize, remaining))
        if not chunk:
            # Premature end of input (e.g. the client went away); stop
            # instead of spinning forever on empty reads.
            break
        remaining -= len(chunk)
        yield chunk
# Matches each '&'-separated segment of a query string; group 1 is the
# segment text, which is empty for '' input, 'a&&b' or a trailing '&'.
_QSSPLIT = re.compile(r'(?:^|&)([^&]*)')
# Splits one segment at its first '='; group 1 is the key and group 2 the
# value, or None when the segment contains no '=' at all.
_QSPARAM = re.compile(r'^(.*?)(?:=(.*))?$', re.DOTALL)


def _parse_qs(qs: str) -> 'Iterable[tuple[str, Optional[bytes]]]':
    """Yield ``(key, value)`` pairs decoded from a urlencoded string.

    Args:
        qs: The raw query string or urlencoded form body, without a
            leading ``?``.

    Yields:
        ``(key, value)`` tuples in input order.  ``key`` is a percent- and
        plus-decoded ``str``.  ``value`` is the decoded ``bytes``, or
        ``None`` when the segment has no ``=`` (``b''`` for ``key=``).
        Empty segments produce no pair.
    """
    for m in _QSSPLIT.finditer(qs):
        p = m.group(1)
        if not p:
            # An empty segment carries no field; without this check an empty
            # query string would yield a spurious ('', None) entry.
            continue
        k, v = _QSPARAM.match(p).groups()
        k = urllib.parse.unquote_plus(k)
        if v is not None:
            # unquote_to_bytes() does not translate '+', so do it first.
            v = urllib.parse.unquote_to_bytes(v.replace('+', ' '))
        yield k, v
def main_wsgi(environ, start_response):
    """Demo WSGI application for the form parser.

    A ``GET`` request without a query string is answered with a small HTML
    upload form; any other request is answered with a plain-text pretty-print
    of the fields returned by ``wsgi_parseForm``.

    Args:
        environ: The WSGI environment dictionary of the request.
        start_response: The WSGI ``start_response`` callable.

    Yields:
        The response body as ``bytes`` chunks.
    """
    from pprint import pformat

    wants_form = environ['REQUEST_METHOD'] == 'GET' and not environ.get('QUERY_STRING')
    if not wants_form:
        start_response('200 OK', [('Content-Type', 'text/plain')])
        parsed_fields = list(wsgi_parseForm(environ))
        yield pformat(parsed_fields).encode()
        return

    start_response('200 OK', [('Content-Type', 'text/html')])
    form_fragments = (
        '<form method="POST" enctype="multipart/form-data">',
        '<label for="username">username:</label><input name="username" value="user123" /><br />',
        '<label for="password">password:</label><input name="password" value="lol456" /><br />',
        '<label for="avatar">avatar:</label><input name="avatar" type="file"><br /><input type="submit" value="register" />',
    )
    # Each fragment is emitted as its own chunk.
    for fragment in form_fragments:
        yield fragment.encode()
if __name__ == '__main__':
    # When run as a script, serve the demo application with wsgiref's
    # reference server (host '' and port 8000) until the process is stopped.
    httpd = wsgiref.simple_server.make_server('', 8000, main_wsgi)
    httpd.serve_forever()
Perhaps studying one of the existing library implementations could be educational or enlightening.