You can fetch several pages at once, without threads, over a single connection. The recipe below exploits HTTP pipelining by resetting the internal state (a private variable!) of HTTPSConnection, tricking it into sending the next request ahead of time, before the previous response has been read.
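The core of the trick, pulled out of the full recipe below, looks roughly like this (a minimal sketch, assuming the server keeps the connection alive and answers pipelined requests in order; the resp1/resp2 names are just illustrative):

from http.client import HTTPSConnection, _CS_IDLE

conn = HTTPSConnection('en.wikipedia.org')
conn.request('GET', '/wiki/HTTP_pipelining')        # first request goes out
resp1 = conn.response_class(conn.sock, method=conn._method)

conn._HTTPConnection__state = _CS_IDLE              # pretend nothing was sent (private variable!)
conn.request('GET', '/wiki/HTTP')                   # second request goes out before the first response is read
resp2 = conn.response_class(conn.sock, method=conn._method)

resp1.begin()                                       # responses still come back in order
body1 = resp1.read()
resp2.begin()
body2 = resp2.read()
conn.close()

The full recipe wraps the same idea in a loop, limits the number of out-bound requests, and handles cookies, redirects, and closed connections.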
from http.client import HTTPSConnection, _CS_IDLE
from urllib.parse import urlparse, urlunparse

def pipeline(host, pages, max_out_bound=4, debuglevel=0):
    page_count = len(pages)
    conn = HTTPSConnection(host)
    conn.set_debuglevel(debuglevel)
    responses = [None] * page_count
    finished = [False] * page_count
    content = [None] * page_count
    headers = {'Host': host, 'Content-Length': 0, 'Connection': 'Keep-Alive'}
    while not all(finished):
        # Send
        out_bound = 0
        for i, page in enumerate(pages):
            if out_bound >= max_out_bound:
                break
            elif page and not finished[i] and responses[i] is None:
                if debuglevel > 0:
                    print('Sending request for %r...' % (page,))
                conn._HTTPConnection__state = _CS_IDLE  # private variable!
                conn.request("GET", page, None, headers)
                responses[i] = conn.response_class(conn.sock, method=conn._method)
                out_bound += 1
        # Try to read a response
        for i, resp in enumerate(responses):
            if resp is None:
                continue
            if debuglevel > 0:
                print('Retrieving %r...' % (pages[i],))
            out_bound -= 1
            skip_read = False
            resp.begin()
            if debuglevel > 0:
                print(' %d %s' % (resp.status, resp.reason))
            if 200 <= resp.status < 300:
                # Ok
                content[i] = resp.read()
                cookie = resp.getheader('Set-Cookie')
                if cookie is not None:
                    headers['Cookie'] = cookie
                skip_read = True
                finished[i] = True
                responses[i] = None
            elif 300 <= resp.status < 400:
                # Redirect
                loc = resp.getheader('Location')
                responses[i] = None
                parsed = loc and urlparse(loc)
                if not parsed:
                    # Missing or empty Location header
                    content[i] = (resp.status, resp.reason)
                    finished[i] = True
                elif parsed.netloc != '' and parsed.netloc != host:
                    # Redirect to another host
                    content[i] = (resp.status, resp.reason, loc)
                    finished[i] = True
                else:
                    path = urlunparse(parsed._replace(scheme='', netloc='', fragment=''))
                    if debuglevel > 0:
                        print(' Updated %r to %r' % (pages[i], path))
                    pages[i] = path
            elif resp.status >= 400:
                # Failed
                content[i] = (resp.status, resp.reason)
                finished[i] = True
                responses[i] = None
            if resp.will_close:
                # Connection (will be) closed, need to resend
                conn.close()
                if debuglevel > 0:
                    print(' Connection closed')
                for j, f in enumerate(finished):
                    if not f and responses[j] is not None:
                        if debuglevel > 0:
                            print(' Discarding out-bound request for %r' % (pages[j],))
                        responses[j] = None
                break
            elif not skip_read:
                resp.read()  # read any data
            if any(not f and responses[j] is None for j, f in enumerate(finished)):
                # Send another pending request
                break
        else:
            break  # All responses are None?
    return content

if __name__ == '__main__':
    domain = 'en.wikipedia.org'
    pages = ['/wiki/HTTP_pipelining', '/wiki/HTTP', '/wiki/HTTP_persistent_connection']
    data = pipeline(domain, pages, max_out_bound=3, debuglevel=1)
    for i, page in enumerate(data):
        print()
        print('==== Page %r ====' % (pages[i],))
        print(page[:512])
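For comparison, this is roughly what the plain, non-pipelined approach looks like: one request, then its full response, over a single keep-alive connection (a minimal sketch; fetch_sequential is just an illustrative name). Pipelining saves the round-trip waits this version spends between requests.

from http.client import HTTPSConnection

def fetch_sequential(host, pages):
    conn = HTTPSConnection(host)
    content = []
    for page in pages:
        conn.request('GET', page)      # send one request...
        resp = conn.getresponse()      # ...and wait for its response before the next
        content.append(resp.read())
    conn.close()
    return content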