Using email.message to parse an HTTP POST request?

Question

With the deprecation of the standard cgi library in favor of WSGI starting in Python 3.11, the documentation alleges:

The FieldStorage class can typically be replaced with … the email.message module or [3rd-party] multipart for POST and PUT.

However, it's not at all obvious how the first of these (using the standard library to parse HTTP POST requests) is actually done.

For example, the following code hangs on _EmailBytesParser.parse():

import wsgiref.simple_server
import email.parser

import logging

# Module-level bytes parser instance; main_wsgi reuses it for every request.
_EmailBytesParser = email.parser.BytesParser()

def main_wsgi(environ, start_response):
    """WSGI app: serve an upload form on GET, otherwise try to parse the POSTed form.

    This is the reproduction for the hang described in the question: the
    request handler does not get past the ``parse()`` call below.
    """
    if environ['REQUEST_METHOD'] == 'GET':
        start_response('200 OK', [('Content-Type', 'text/html')])
        yield '<form method="POST" enctype="multipart/form-data"><label for="username">username:</label><input name="username" value="user123" /><br /><label for="avatar">avatar: </label><input name="avatar" type="file"><br /><input type="submit" value="register" />'.encode()
    else:
        # Anything that is not a GET is expected to be the form's multipart POST.
        assert environ['REQUEST_METHOD'] == 'POST'
        assert environ['CONTENT_TYPE'].startswith('multipart/form-data')
        start_response('200 OK', [('Content-Type', 'text/plain')])
        logging.debug("About to start parsing form data...")
        # body = environ['wsgi.input'].read(int(environ['CONTENT_LENGTH']))
        # NOTE(review): this is the call that hangs. Presumably parse() keeps
        # reading wsgi.input until end-of-file, which does not arrive while the
        # client holds the connection open -- confirm.
        body = _EmailBytesParser.parse(environ['wsgi.input'], headersonly=True)
        logging.debug("Form data parsed!")
        yield repr(body).encode()

if __name__ == '__main__':
    # Show DEBUG-level messages so the logging calls in main_wsgi are visible.
    logging.basicConfig()
    root_logger = logging.getLogger()
    root_logger.setLevel(logging.DEBUG)
    # Serve the demo app on port 8000 (all interfaces) until interrupted.
    httpd = wsgiref.simple_server.make_server('', 8000, main_wsgi)
    httpd.serve_forever()

I wasn't able to find any code examples online actually demonstrating the email module's suitability for this. What am I doing wrong?

score 1 · Answer 1 · answered May 02 '23 at 22:26

Arguably (and I do), the email module is barely worth it for parsing POST request bodies. "What you're doing wrong" could be argued (though I haven't) to be "trying to parse a POST request in 2023 without 3rd-party libraries".

WSGI, which is being pushed as the essential replacement for CGI, doesn't provide everything the email library needs to parse the request body all in one place. You'll need to inject a header beforehand to tell the parser (a) that the message is multipart, and (b) what the boundary between the form fields is. Then you need to call Message.iter_parts(), and on each part call Message.get_payload() for the value, Message.get_filename() for the name of any uploaded file, and the "legacy" function get_param('name', header='content-disposition') for the field name, because the non-legacy replacement, Message.get_name(), doesn't seem to exist yet.

Here's a working draft I threw together this afternoon; I don't like it, but can't see any better approaches for now:

import wsgiref.simple_server
import email.message, email.parser, email.policy
import urllib.parse
import re
from collections import namedtuple
# Page-size lookup, used by _wsgi_body as its read size: prefer
# resource.getpagesize() and fall back to mmap.PAGESIZE wherever the
# resource module cannot be imported.
try:
    from resource import getpagesize
except ImportError:
    import mmap
    def getpagesize():
        """Return mmap.PAGESIZE, standing in for resource.getpagesize()."""
        return mmap.PAGESIZE

# One parsed form field. ``value`` holds the field's raw bytes (None for a
# query-string parameter written without "=value"); ``filename`` and
# ``MIMEtype`` are None for fields that carry no such information.
FieldEntry = namedtuple('FieldEntry', 'name value filename MIMEtype')
# Informal, string-form description of the element types of a FieldEntry.
FieldEntry_T = 'tuple[str, Optional[bytes], Optional[str], Optional[str]]'

def wsgi_parseForm(environ, /):
    if environ['REQUEST_METHOD'] == 'GET':
        for k, v in _parse_qs(environ['QUERY_STRING']):
            yield FieldEntry(k, v, None, None)
        return
    m = email.message.Message()
    m.add_header('Content-Type', environ['CONTENT_TYPE'])
    match m.get_content_type():
        case 'application/x-www-form-urlencoded':
            for k, v in _parse_qs(bytes().join(_wsgi_body(environ)).decode('ascii')):
                yield FieldEntry(k, v, None, None)
            return
        case 'multipart/form-data':
            p = email.parser.BytesFeedParser(policy=email.policy.HTTP)
            p.feed(('Content-Type: %s\r\n' % environ['CONTENT_TYPE']).encode('utf-8'))
            # ^Don't try to abbreviate this line; it also injects the boundary parameter, which is needed to parse out the sub-messages!
            p.feed('\r\n'.encode('utf-8'))
            for chunk in _wsgi_body(environ):
                # TODO stream each element to the caller as they arrive
                # rather than loading them all into RAM at the same time
                p.feed(chunk)
            m = p.close(); del p
            assert m.is_multipart()
            for part in m.iter_parts():
                part.set_default_type(None)
                yield FieldEntry(
                    part.get_param('name', header='content-disposition'),
                    part.get_payload(decode=True),
                    part.get_filename(None),
                    part.get_content_type()
                )
        case t:
            raise ValueError('unexpected Content-Type: %s' % t)


def _wsgi_body(environ, /):
    """Yield the request body from ``environ['wsgi.input']`` as chunks of bytes.

    When the server sets ``wsgi.input_terminated``, the stream is consumed
    until it reports end-of-input. Otherwise exactly CONTENT_LENGTH bytes are
    consumed (none if it is missing or empty): no read ever asks for more
    than the bytes still outstanding, so the stream is neither over-read nor
    left blocking on data the client will not send.

    If the stream ends before CONTENT_LENGTH bytes have arrived, iteration
    simply stops; the caller receives the truncated body.

    Raises:
        NotImplementedError: if the request carries a Transfer-Encoding
            header and the server did not set ``wsgi.input_terminated``.
    """
    # Workaround helper function for https://github.com/python/cpython/issues/66077
    wsgi_input = environ['wsgi.input']
    pagesize = getpagesize()
    try:
        # Prefer read1() where available (see io.BufferedIOBase.read1).
        _read = wsgi_input.read1
    except AttributeError:
        _read = wsgi_input.read

    if environ.get('wsgi.input_terminated', False):
        # https://github.com/GrahamDumpleton/mod_wsgi/blob/4.9.4/docs/configuration-directives/WSGIChunkedRequest.rst
        try:
            yield from iter(wsgi_input)
            return
        except (TypeError, NotImplementedError):
            # Stream is not iterable: read page-sized chunks until EOF.
            while chunk := _read(pagesize):
                yield chunk
            return

    if 'HTTP_TRANSFER_ENCODING' in environ:
        # https://mail.python.org/pipermail/web-sig/2007-March/002630.html
        # https://wsgi.readthedocs.io/en/latest/proposals-2.0.html#unknown-length-wsgi-input
        # https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Transfer-Encoding#chunked_encoding
        raise NotImplementedError("Transfer-Encoding: %s" % environ['HTTP_TRANSFER_ENCODING'])

    # Weirdly, CONTENT_LENGTH is set to the empty string for requests with no
    # body, hence the or-clause.
    remaining = int(environ.get('CONTENT_LENGTH', 0) or 0)
    while remaining > 0:
        # Never request more than is left of the declared body.
        chunk = _read(min(pagesize, remaining))
        if not chunk:
            # Premature end of stream; without this the loop would spin forever.
            break
        remaining -= len(chunk)
        yield chunk


# Splits a query string into its "&"-separated segments (group 1 of each match).
_QSSPLIT = re.compile(r'(?:^|&)([^&]*)')
# Splits one segment at its first "=" into name and (optional) value.
_QSPARAM = re.compile(r'^(.*?)(?:=(.*))?$', re.DOTALL)


def _parse_qs(qs: str) -> 'Iterable[tuple[str, Optional[bytes]]]':
    """Yield ``(name, value)`` pairs from a URL-encoded query string.

    Names are percent-decoded to ``str`` and values to ``bytes``, with "+"
    treated as a space in both. A parameter written without "=" yields
    ``None`` as its value; one written as ``name=`` yields ``b''``.

    Empty segments (an empty query string, or stray "&" separators) produce
    no pair at all, rather than a bogus field with an empty name.
    """
    for segment in _QSSPLIT.finditer(qs):
        p = segment.group(1)
        if not p:
            continue
        k, v = _QSPARAM.match(p).groups()
        k = urllib.parse.unquote_plus(k)
        if v is not None:
            v = urllib.parse.unquote_to_bytes(v.replace('+', ' '))
        yield k, v


def main_wsgi(environ, start_response):
    """Demo WSGI app for wsgi_parseForm.

    A GET request without a query string is answered with an HTML upload
    form; every other request is answered with a plain-text, pretty-printed
    list of the fields that wsgi_parseForm extracted from it.
    """
    from pprint import pformat

    wants_blank_form = environ['REQUEST_METHOD'] == 'GET' and not environ.get('QUERY_STRING')
    if not wants_blank_form:
        start_response('200 OK', [('Content-Type', 'text/plain')])
        yield pformat(list(wsgi_parseForm(environ))).encode()
        return

    start_response('200 OK', [('Content-Type', 'text/html')])
    form_fragments = (
        '<form method="POST" enctype="multipart/form-data">',
        '<label for="username">username:</label><input name="username" value="user123" /><br />',
        '<label for="password">password:</label><input name="password" value="lol456" /><br />',
        '<label for="avatar">avatar:</label><input name="avatar" type="file"><br /><input type="submit" value="register" />',
    )
    # Each fragment is sent as its own response chunk.
    for fragment in form_fragments:
        yield fragment.encode()


if __name__ == '__main__':
    # Serve the demo app on port 8000 (all interfaces) until interrupted.
    httpd = wsgiref.simple_server.make_server('', 8000, main_wsgi)
    httpd.serve_forever()

Perhaps studying one of the existing library implementations could be educational or enlightening.

One of the core Python developers suggested looking into the `policy` parameter to see if it offers any opportunity for cleaning up the code. — JamesTheAwesomeDude, May 03 '23 at 17:19

Using email.message to parse an HTTP POST request?

1 Answer