I am attempting to read a site into BeautifulSoup, and so far every attempt has failed at the point where I try to open a secure connection (I initially tried to approach this with Python 3, but as you can see below, that was also fraught with danger). Here is my latest attempt using urllib2 (I haven't found a urllib3 example, nor have I had much success updating this code to urllib3):
import httplib, ssl, urllib2, socket
from bs4 import BeautifulSoup
class HTTPSConnectionV3(httplib.HTTPSConnection):
    """HTTPS connection that tries SSLv3 first and falls back to SSLv23.

    The constructor is inherited unchanged from ``httplib.HTTPSConnection``.
    """

    def _open_plain_socket(self):
        """Open a new TCP connection to ``(self.host, self.port)``.

        If a tunnel host is configured, the raw socket is stored on
        ``self.sock`` and ``self._tunnel()`` is run before returning.

        Returns:
            The connected, not yet SSL-wrapped socket.
        """
        sock = socket.create_connection((self.host, self.port), self.timeout)
        if self._tunnel_host:
            # self.sock has to be set before _tunnel() is called; presumably
            # the tunnel request is sent through it.
            self.sock = sock
            self._tunnel()
        return sock

    def connect(self):
        """Connect and wrap the socket in SSL, preferring SSLv3.

        If the SSLv3 handshake raises ``ssl.SSLError``, the socket is closed
        and a second attempt is made with ``ssl.PROTOCOL_SSLv23`` on a fresh
        connection.

        Raises:
            ssl.SSLError: if the SSLv23 fallback handshake also fails.
        """
        sock = self._open_plain_socket()
        try:
            self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file,
                                        ssl_version=ssl.PROTOCOL_SSLv3)
        except ssl.SSLError:
            # The message now names the protocol that is actually tried next.
            print("SSLv3 handshake failed; trying SSLv23.")
            # A socket that went through a failed handshake cannot be relied
            # on for another one, so start over with a new connection.
            sock.close()
            sock = self._open_plain_socket()
            self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file,
                                        ssl_version=ssl.PROTOCOL_SSLv23)
class HTTPSHandlerV3(urllib2.HTTPSHandler):
    """urllib2 HTTPS handler that opens requests with HTTPSConnectionV3."""

    def https_open(self, req):
        """Open ``req`` through ``do_open`` using HTTPSConnectionV3.

        Returns:
            Whatever ``do_open`` returns for the request.
        """
        connection_class = HTTPSConnectionV3
        response = self.do_open(connection_class, req)
        return response
# Install a global opener so urllib2.urlopen() routes HTTPS through HTTPSHandlerV3.
urllib2.install_opener(urllib2.build_opener(HTTPSHandlerV3()))
r = urllib2.urlopen('https://bw6.clpccd.cc.ca.us/clpccd/2014/02/sched_l.htm')
try:
    s = r.read()
finally:
    # Close the response even when the read fails.
    r.close()
# Name the parser explicitly so the result does not depend on which
# optional parsers happen to be installed.
soup = BeautifulSoup(s, 'html.parser')
# Print every <h2> element found in the page.
for t in soup.find_all('h2'):
    print(t)
When I run this code I get the following stack-trace:
Traceback (most recent call last):
File "test.py", line 27, in <module>
r = urllib2.urlopen('https://bw6.clpccd.cc.ca.us/clpccd/2014/02/sched_l.htm')
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.py", line 127, in urlopen
return _opener.open(url, data, timeout)
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.py", line 404, in open
response = self._open(req, data)
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.py", line 422, in _open
'_open', req)
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.py", line 382, in _call_chain
result = func(*args)
File "test.py", line 22, in https_open
return self.do_open(HTTPSConnectionV3, req)
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.py", line 1187, in do_open
r = h.getresponse(buffering=True)
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/httplib.py", line 1045, in getresponse
response.begin()
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/httplib.py", line 409, in begin
version, status, reason = self._read_status()
File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/httplib.py", line 373, in _read_status
raise BadStatusLine(line)
httplib.BadStatusLine: ''
To make matters trickier, here is what I see when I cURL the URL:
$ curl -v https://bw6.clpccd.cc.ca.us/clpccd/2014/02/sched_l.htm
* Adding handle: conn: 0x7ff1f1804000
* Adding handle: send: 0
* Adding handle: recv: 0
* Curl_addHandleToPipeline: length: 1
* - Conn 0 (0x7ff1f1804000) send_pipe: 1, recv_pipe: 0
* About to connect() to bw6.clpccd.cc.ca.us port 443 (#0)
* Trying 205.155.225.145...
* Connected to bw6.clpccd.cc.ca.us (205.155.225.145) port 443 (#0)
* Server aborted the SSL handshake
* Closing connection 0
curl: (35) Server aborted the SSL handshake
If I force SSLv3, I get the expected output:
$ curl -v -3 https://bw6.clpccd.cc.ca.us/clpccd/2014/02/sched_l.htm
* Adding handle: conn: 0x7ffa50804000
* Adding handle: send: 0
* Adding handle: recv: 0
* Curl_addHandleToPipeline: length: 1
* - Conn 0 (0x7ffa50804000) send_pipe: 1, recv_pipe: 0
* About to connect() to bw6.clpccd.cc.ca.us port 443 (#0)
* Trying 205.155.225.145...
* Connected to bw6.clpccd.cc.ca.us (205.155.225.145) port 443 (#0)
* SSL 3.0 connection using SSL_RSA_WITH_RC4_128_SHA
* Server certificate: bw6.clpccd.cc.ca.us
* Server certificate: VeriSign Class 3 Secure Server CA - G3
* Server certificate: VeriSign Class 3 Public Primary Certification Authority - G5
* Server certificate: Class 3 Public Primary Certification Authority
> GET /clpccd/2014/02/sched_l.htm HTTP/1.1
> User-Agent: Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)
> Host: bw6.clpccd.cc.ca.us
> Accept: */*
> Referer:
>
< HTTP/1.1 200 OK
< Date: Thu, 23 Oct 2014 00:00:11 GMT
* Server Oracle-Application-Server-10g/10.1.3.4.0 Oracle-HTTP-Server is not blacklisted
< Server: Oracle-Application-Server-10g/10.1.3.4.0 Oracle-HTTP-Server
< Last-Modified: Wed, 22 Oct 2014 20:05:42 GMT
< ETag: "422e-1e72-54480e16"
< Accept-Ranges: bytes
< Content-Length: 7794
< Connection: close
< Content-Type: text/html
<
<html....>
* Closing connection 0
In case my earlier attempt helps anyone, here is what my approach looked like when I was using the Requests library with Python 3 (following their documentation's "Example: Specific SSL Version"):
import ssl
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.poolmanager import PoolManager
from bs4 import BeautifulSoup
# Request headers: a browser-like User-Agent string.
headers = {
    'User-agent': 'Mozilla/5.0 (Windows NT 5.2; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
}
class Ssl3HttpAdapter(HTTPAdapter):
    """Transport adapter that makes its connection pools use SSLv3."""

    def init_poolmanager(self, connections, maxsize, block=True, **pool_kwargs):
        """Create the pool manager with ``ssl_version`` pinned to SSLv3.

        Args:
            connections: passed to ``PoolManager`` as ``num_pools``.
            maxsize: passed to ``PoolManager`` as ``maxsize``.
            block: passed to ``PoolManager`` as ``block``.
            **pool_kwargs: extra keyword arguments forwarded to
                ``PoolManager``. Accepting them keeps this override
                call-compatible with callers that pass additional pool
                options; any ``ssl_version`` given here is overridden.
        """
        pool_kwargs['ssl_version'] = ssl.PROTOCOL_SSLv3
        self.poolmanager = PoolManager(num_pools=connections,
                                       maxsize=maxsize,
                                       block=block,
                                       **pool_kwargs)
# Use the session as a context manager so its connections are released
# once the request has completed.
with requests.Session() as s:
    # Only URLs under this prefix go through the SSLv3 adapter.
    s.mount('https://bw6.clpccd.cc.ca.us', Ssl3HttpAdapter())
    r = s.get('https://bw6.clpccd.cc.ca.us/clpccd/2014/02/sched_l.htm', headers=headers)
# Name the parser explicitly so the result does not depend on which
# optional parsers happen to be installed.
soup = BeautifulSoup(r.text, 'html.parser')
# Print every <h2> element found in the page.
for t in soup.find_all('h2'):
    print(t)
Which produced a similar (but more cryptic) stack-trace:
Traceback (most recent call last):
File "/usr/local/lib/python3.4/site-packages/requests/packages/urllib3/connectionpool.py", line 331, in _make_request
httplib_response = conn.getresponse(buffering=True)
TypeError: getresponse() got an unexpected keyword argument 'buffering'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/usr/local/lib/python3.4/site-packages/requests/packages/urllib3/connectionpool.py", line 516, in urlopen
body=body, headers=headers)
File "/usr/local/lib/python3.4/site-packages/requests/packages/urllib3/connectionpool.py", line 333, in _make_request
httplib_response = conn.getresponse()
File "/usr/local/Cellar/python3/3.4.2_1/Frameworks/Python.framework/Versions/3.4/lib/python3.4/http/client.py", line 1172, in getresponse
response.begin()
File "/usr/local/Cellar/python3/3.4.2_1/Frameworks/Python.framework/Versions/3.4/lib/python3.4/http/client.py", line 351, in begin
version, status, reason = self._read_status()
File "/usr/local/Cellar/python3/3.4.2_1/Frameworks/Python.framework/Versions/3.4/lib/python3.4/http/client.py", line 321, in _read_status
raise BadStatusLine(line)
http.client.BadStatusLine: ''
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/usr/local/lib/python3.4/site-packages/requests/adapters.py", line 362, in send
timeout=timeout
File "/usr/local/lib/python3.4/site-packages/requests/packages/urllib3/connectionpool.py", line 559, in urlopen
_pool=self, _stacktrace=stacktrace)
File "/usr/local/lib/python3.4/site-packages/requests/packages/urllib3/util/retry.py", line 245, in increment
raise six.reraise(type(error), error, _stacktrace)
File "/usr/local/lib/python3.4/site-packages/requests/packages/urllib3/packages/six.py", line 309, in reraise
raise value.with_traceback(tb)
File "/usr/local/lib/python3.4/site-packages/requests/packages/urllib3/connectionpool.py", line 516, in urlopen
body=body, headers=headers)
File "/usr/local/lib/python3.4/site-packages/requests/packages/urllib3/connectionpool.py", line 333, in _make_request
httplib_response = conn.getresponse()
File "/usr/local/Cellar/python3/3.4.2_1/Frameworks/Python.framework/Versions/3.4/lib/python3.4/http/client.py", line 1172, in getresponse
response.begin()
File "/usr/local/Cellar/python3/3.4.2_1/Frameworks/Python.framework/Versions/3.4/lib/python3.4/http/client.py", line 351, in begin
version, status, reason = self._read_status()
File "/usr/local/Cellar/python3/3.4.2_1/Frameworks/Python.framework/Versions/3.4/lib/python3.4/http/client.py", line 321, in _read_status
raise BadStatusLine(line)
requests.packages.urllib3.exceptions.ProtocolError: ('Connection aborted.', BadStatusLine("''",))
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "./test.py", line 23, in <module>
r = s.get('https://bw6.clpccd.cc.ca.us/clpccd/2014/02/sched_l.htm', headers=headers)
File "/usr/local/lib/python3.4/site-packages/requests/sessions.py", line 469, in get
return self.request('GET', url, **kwargs)
File "/usr/local/lib/python3.4/site-packages/requests/sessions.py", line 457, in request
resp = self.send(prep, **send_kwargs)
File "/usr/local/lib/python3.4/site-packages/requests/sessions.py", line 569, in send
r = adapter.send(request, **kwargs)
File "/usr/local/lib/python3.4/site-packages/requests/adapters.py", line 407, in send
raise ConnectionError(err, request=request)
requests.exceptions.ConnectionError: ('Connection aborted.', BadStatusLine("''",))