I've created a script for web scraping and I'm using 2Captcha to solve captchas. 2Captcha has a Python library, but I've created my own functions to generate the captcha ID and captcha token code.
My captcha module has 3 functions: get_captcha_id(), get_captcha_response(), and apply_token()
Everything works great, and I'm able to sovled a couple dozen captchas until eventually I get the 2 following error: ERROR_WRONG_CAPTCHA_ID
When this happens, the script first comes to the error ERROR_CAPTCHA_UNSOLVABLE, then the loop goes back and generates an entire new captcha ID. Maybe I should keep the same ID and just generate a new token?
I just want to know if there's a better way to do this anyway...
Here is the code to start the 2Captcha on my main script:
captcha_solved = 0
#Solves recpacha via 2Captcha API
while captcha_solved == 0:
captcha_id = captcha.get_captcha_id(browser.current_url)
if captcha_id != 0 or captcha_id != None:
print("Captcha ID is: "+str(captcha_id))
cap_res = captcha.get_captcha_response(captcha_id)
if cap_res == "ERROR_CAPTCHA_UNSOLVABLE" or cap_res == "ERROR_TOKEN_EXPIRED" or cap_res == "ERROR_WRONG_CAPTCHA_ID":
print("Captcha failed... Restarting captcha")
browser.refresh()
sleep(1)
continue
else:
print("Capcha Token: "+cap_res)
captcha.apply_token(browser, cap_res)
solver.report(captcha_id, True)
captcha_solved = captcha_solved + 1
break
Once this while loop is complete, the main script will start. After about 2 dozen captcha or so, I'll receive this error:
Traceback (most recent call last):
File "C:\Users\Anthony\eclipse-workspace\Indiana SOS Biz Search\main.py", line 191, in <module>
cap_res = captcha.get_captcha_response(captcha_id)
File "C:\Users\Anthony\eclipse-workspace\Indiana SOS Biz Search\captcha.py", line 83, in get_captcha_response
solver.report(cap_id, False)
File "C:\Users\Anthony\AppData\Local\Programs\Python\Python39\lib\site-packages\twocaptcha\solver.py", line 496, in report
self.api_client.res(key=self.API_KEY, action=rep, id=id_)
File "C:\Users\Anthony\AppData\Local\Programs\Python\Python39\lib\site-packages\twocaptcha\api.py", line 113, in res
raise ApiException(resp)
twocaptcha.api.ApiException: ERROR_WRONG_CAPTCHA_ID
I've thought I added enough failsafes to be able to regenerate a Captcha Token Here is my captcha.py file code:
from twocaptcha import TwoCaptcha
from random import randint
from time import sleep
from urllib.request import urlopen, Request
import re
from bs4 import BeautifulSoup
from twocaptcha.solver import ValidationException
from twocaptcha.api import NetworkException, ApiException
from selenium.common.exceptions import TimeoutException
#solver = TwoCaptcha('API_KEY')
site_key = "###"
api_key = "###"
config = {
'server': '2captcha.com',
'apiKey': api_key,
'callback': 'https://your.site.com/',
'defaultTimeout': 120,
'recaptchaTimeout': 600,
'pollingInterval': 10,
}
proxy={
'type': 'HTTP',
'uri': '###'
}
user_agent = '###'
solver = TwoCaptcha(**config)
print("2Captcha Balance: $"+str(solver.balance()))
def get_captcha_id(captcha_url):
try:
result = solver.recaptcha(sitekey=site_key, url=captcha_url, proxy=proxy)
#print(result)
split_string = str(result).split(":", 1)
substring = split_string[0]
#print(substring)
if (substring == "{'captchaId'"):
strip_beginning = re.sub("{'captchaId': '", "", str(result))
captcha_id = re.sub("'}", "", strip_beginning)
return captcha_id
else:
print("could not find captcha ID")
return 0
except ValidationException as e:
# invalid parameters passed
print(e)
return e
except NetworkException as e:
# network error occurred
print(e)
return e
except ApiException as e:
# api respond with error
print(e)
return e
except TimeoutException as e:
# captcha is not solved so far
print(e)
return e
def get_captcha_response(cap_id):
capcha_ready = 0
response_url = "https://2captcha.com/res.php?key="+api_key+"&action=get&id="+cap_id
while capcha_ready == 0:
PageRequest = Request(response_url,data=None,headers={'User-Agent': user_agent})
PageResponse = urlopen(PageRequest)
PageHtml = PageResponse.read()
PageSoup = BeautifulSoup(PageHtml, 'html.parser')
SoupText = str(PageSoup)
if SoupText == "ERROR_CAPTCHA_UNSOLVABLE" or SoupText == "ERROR_WRONG_CAPTCHA_ID" or SoupText == "ERROR_TOKEN_EXPIRED":
solver.report(cap_id, False)
return SoupText
elif str(PageSoup) == "CAPCHA_NOT_READY":
print("Waiting for capcha response...")
rand = randint(12,18)
print("sleeping for "+str(rand)+" seconds")
sleep(rand)
else:
split_string = str(PageSoup).split("|", 1)
if len(split_string) > 0:
substring = split_string[1]
return substring
capcha_ready = capcha_ready + 1
#print(PageSoup)
return PageSoup
def apply_token(browser, token):
print("Applying token to browser...")
browser.execute_script('document.getElementById("g-recaptcha-response").innerHTML = "{}";'.format(token))
print("Token applied")
Thanks for your help for this, I really appreciate it!