Currently I am using this:
```python
import os

from selenium import webdriver
from selenium.webdriver import FirefoxProfile
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.firefox.options import Options

# IPV4 is the compiled regex shown further below; is_ipv6 is a small helper
# (not shown) that checks whether a string is a valid IPv6 address.

class DNS_LOOKUP:
    ROBTEX_IPLOOKUP = 'https://www.robtex.com/ip-lookup/'
    ROBTEX_HEAD = '//section[1]/div[3]/p/a'
    ROBTEX_TABLE = '//section[2]/div[3]/table/tbody/tr/td//a'
    NSLOOKUP_IPV4 = '//div[2]/div[1]/table/tbody/tr/td[2]/span[1]'
    NSLOOKUP_IPV6 = '//div[2]/div[2]/table/tbody/tr/td[2]/span[1]'
    NSLOOKUP_SOURCES = ['cloudflare', 'google', 'opendns', 'authoritative']

    def __init__(self):
        options = Options()
        options.add_argument("--headless")
        options.add_argument("--log-level=3")
        options.add_argument("--mute-audio")
        options.add_argument("--no-sandbox")
        options.add_argument("--disable-dev-shm-usage")
        options.add_argument('--disable-extensions')
        options.add_argument('--disable-gpu')
        capabilities = DesiredCapabilities().FIREFOX
        capabilities['pageLoadStrategy'] = 'eager'
        profile = FirefoxProfile(os.environ['appdata'] + '\\Mozilla\\Firefox\\Profiles\\bkpihn0o.bot')
        profile.set_preference("http.response.timeout", 1)
        profile.set_preference("dom.max_script_run_time", 0)
        profile.set_preference('permissions.default.stylesheet', 2)
        profile.set_preference('permissions.default.image', 2)
        profile.set_preference('dom.ipc.plugins.enabled.libflashplayer.so', 'false')
        profile.set_preference("permissions.default.script", 2)
        profile.set_preference("javascript.enabled", False)
        self.Firefox = webdriver.Firefox(capabilities=capabilities, options=options, firefox_profile=profile)
        self.AltFirefox = webdriver.Firefox(capabilities=capabilities)

    def _robtex(self, addr):
        self.Firefox.get(f'https://www.robtex.com/dns-lookup/{addr}')
        ips = {href.removeprefix(DNS_LOOKUP.ROBTEX_IPLOOKUP) for e in self.Firefox.find_elements('xpath', DNS_LOOKUP.ROBTEX_HEAD) if (href := e.get_attribute('href')).startswith(DNS_LOOKUP.ROBTEX_IPLOOKUP)}
        ips |= {href.removeprefix(DNS_LOOKUP.ROBTEX_IPLOOKUP) for e in self.Firefox.find_elements('xpath', DNS_LOOKUP.ROBTEX_TABLE) if (href := e.get_attribute('href')).startswith(DNS_LOOKUP.ROBTEX_IPLOOKUP)}
        ipv4, ipv6 = set(), set()
        for i in sorted(ips):
            if IPV4.match(i):
                ipv4.add(i)
            elif is_ipv6(i):
                ipv6.add(i)
        return ipv4, ipv6

    def _nslookup(self, addr):
        ipv4, ipv6 = set(), set()
        for source in DNS_LOOKUP.NSLOOKUP_SOURCES:
            self.AltFirefox.get(f'https://www.nslookup.io/dns-records/{addr}#{source}')
            ipv4 |= {ip for e in self.AltFirefox.find_elements('xpath', DNS_LOOKUP.NSLOOKUP_IPV4) if IPV4.match((ip := e.text))}
            ipv6 |= {ip for e in self.AltFirefox.find_elements('xpath', DNS_LOOKUP.NSLOOKUP_IPV6) if is_ipv6((ip := e.text))}
        return ipv4, ipv6

    def dns_query(self, addr):
        robtex = self._robtex(addr)
        nslookup = self._nslookup(addr)
        ipv4, ipv6 = robtex
        ipv4 |= nslookup[0]
        ipv6 |= nslookup[1]
        return {'ipv4': sorted(ipv4), 'ipv6': sorted(ipv6)}
```
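For completeness, this is how I call it (the domain here is just a placeholder):

```python
lookup = DNS_LOOKUP()
print(lookup.dns_query('example.com'))  # {'ipv4': [...], 'ipv6': [...]}
```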
This method returns a good number of addresses, but sadly not enough, and as you can see, it uses `selenium` rather than `requests`, so it is slow. Well, truth be told, I have tested extensively and rigorously, time and again, and `selenium` is consistently faster than `requests` on my connection. But still, its speed is by far unacceptable.
I have also written this:
```python
import dns.resolver

resolver = dns.resolver.Resolver()
resolver.nameservers = ['8.8.8.8']

def dns_resolve(address):
    # each query only yields one address here, so repeat it four times
    return sorted({resolver.query(address)[0].address for i in range(4)})
```
It is much faster; however, it only returns one address per server per query, so I repeated the operation four times. I want at least 4 addresses per server per query to be returned...
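As a side note on that one-address limit: `resolver.query(address)[0]` keeps only the first record of the answer, but the answer object already carries the whole A record set. A minimal sketch that collects every record from a single query (assuming dnspython 2.x, where `resolve()` supersedes the deprecated `query()`; `dns_resolve_all` is a name I made up):

```python
import dns.resolver

resolver = dns.resolver.Resolver()
resolver.nameservers = ['8.8.8.8']

def dns_resolve_all(address):
    # one round trip; iterate over the full Answer instead of taking [0]
    return sorted({rr.address for rr in resolver.resolve(address, 'A')})
```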
I have even written this:
```python
import json
import requests

def manual_resolve(address):
    return [i['data'] for i in json.loads(requests.get(f'https://dns.google/resolve?name={address}&type=A').text)['Answer']]
```
It is as low-level as I can get, but as I said before, under my network conditions `requests` is actually slower than `selenium`, much slower...
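If I keep the DoH route, one thing I can at least try is reusing a single connection instead of paying the TLS handshake on every call; a small sketch of the same lookup over a pooled `requests.Session` (`manual_resolve_pooled` is a name I made up, behavior otherwise unchanged):

```python
import requests

session = requests.Session()  # keep-alive: one TLS handshake, many queries

def manual_resolve_pooled(address):
    response = session.get('https://dns.google/resolve', params={'name': address, 'type': 'A'})
    return [i['data'] for i in response.json().get('Answer', [])]
```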
So I want to know: what is the fastest way to query DNS A records using multiple servers, and by multiple I mean a massive number? I got 5556 trustworthy name servers from here: https://public-dns.info/nameservers.csv (the file that address points to may change over time; the version I downloaded has 5556 entries), and I used this script to process the information:
```python
import csv
import json
import ping3
import re
import pickle
import subprocess
import time
from collections import namedtuple
from datetime import datetime
from pathlib import Path

IPV4 = re.compile(r'^((25[0-5]|2[0-4]\d|1?\d\d?)\.){3}(25[0-5]|2[0-4]\d|1?\d\d?)$')

# parse the CSV into typed named tuples and keep the reliable IPv4 servers
publicdns = Path('C:/Users/Estranger/Downloads/nameservers.csv').read_text(encoding='utf8').splitlines()
publicdns = list(csv.reader(publicdns))
to_date = lambda x: datetime.strptime(x, '%Y-%m-%dT%H:%M:%SZ')
Entry = namedtuple('Entry', publicdns[0])
deserializer = [str, str, int, str, str, str, str, str, bool, float, to_date, to_date]
publicdns = [Entry(*(f(v) for f, v in zip(deserializer, i))) for i in publicdns[1:]]
Path('D:/nameservers.pickle').write_bytes(pickle.dumps(publicdns, protocol=pickle.HIGHEST_PROTOCOL))
IPV4_DNS = [ipv4 for e in publicdns if e.reliability >= 0.75 and IPV4.match((ipv4 := e.ip_address))]
Path('D:/reliable_ipv4_dns.txt').write_text('\n'.join(IPV4_DNS))

def ping(addr, lim=0.5):
    # average of four pings; failed pings count as 0 ms
    return sum(d if (d := ping3.ping(addr, timeout=lim, unit='ms')) else 0 for _ in range(4)) / 4

ping_latency = []
new_servers = []

def format_delta(d):
    d = int(d)
    h, rem = divmod(d, 3600)
    m, s = divmod(rem, 60)
    return f'{h:02d}:{m:02d}:{s:02d}'

def ping_filter(condition, timeout):
    loop = 1
    if loop == 1:
        servers = IPV4_DNS.copy()
    logs = []
    start = datetime.now()
    success_rate = 0
    while True:
        loop_start = datetime.now()
        total = len(servers)
        ping_latency.clear()
        new_servers.clear()
        succeeded = 0
        failed = 0
        l = len(str(total))
        for iteration, server in enumerate(servers):
            latency = ping(server, timeout)
            timestamp = datetime.now()
            elapsed = timestamp - start
            loop_elapsed = timestamp - loop_start
            eta = (loop_elapsed.total_seconds() / (iteration + 1)) * (total - iteration - 1)
            entry = {
                'timestamp': f'{timestamp:%Y-%m-%d %H:%M:%S}',
                'loop': loop,
                'loop start': f'{loop_start:%Y-%m-%d %H:%M:%S}',
                'iteration': iteration,
                'server': server,
                'success': True,
                'latency': round(latency, 2),
                'unit': 'ms',
                'total': total,
                'succeeded': succeeded,
                'failed': failed,
                'started': f'{start:%Y-%m-%d %H:%M:%S}',
                'elapsed': format_delta(elapsed.total_seconds()),
                'loop runtime': format_delta(loop_elapsed.total_seconds()),
                'ETA': format_delta(eta),
                'success rate': f'{success_rate:06.2%}'
            }
            if 0 < latency <= int(timeout * 1000):
                succeeded += 1
                entry['succeeded'] += 1
                new_servers.append(server)
                ping_latency.append((server, latency))
            else:
                failed += 1
                entry['failed'] += 1
                entry['success'] = False
                entry['latency'] = 'timeout'
            if iteration == total - 1:
                success_rate = succeeded / total
                entry['success rate'] = f'{success_rate:06.2%}'
            print(json.dumps(entry, indent=4))
            logs.append(entry)
        new_total = len(new_servers)
        servers = new_servers.copy()
        # stop once a full pass loses no servers, or after 32 passes
        if new_total == total or loop == 32:
            timestamp = datetime.now()
            elapsed = datetime.now() - start
            entry = {
                'completed': f'{timestamp:%Y-%m-%d %H:%M:%S}',
                'started': f'{start:%Y-%m-%d %H:%M:%S}',
                'elapsed': format_delta(elapsed.total_seconds()),
                'loop': loop
            }
            print(json.dumps(entry, indent=4))
            logs.append(entry)
            break
        loop += 1
    Path(f'D:/IPv4_DNS_{condition}.txt').write_text('\n'.join(servers))
    Path(f'D:/IPv4_DNS_ping_log_{condition}.json').write_text(json.dumps(logs, indent=4))
    Path(f'D:/IPv4_DNS_ping_latency_{condition}.json').write_text(json.dumps(dict(ping_latency), indent=4))

ping_filter('NOVPN', 0.3)
It takes more than 24 hours to complete, and in short, I ended up with 1518 servers.
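(Most of that runtime is sequential waiting on timeouts, so the filter itself could presumably be parallelized; a rough sketch over the same `ping` helper, assuming ping3 tolerates concurrent callers, with `ping_pass` and `workers` being names I made up:)

```python
from concurrent.futures import ThreadPoolExecutor

def ping_pass(servers, timeout=0.3, workers=64):
    # fan the pings out over a thread pool; pinging is I/O-bound, so threads help
    with ThreadPoolExecutor(max_workers=workers) as pool:
        latencies = list(pool.map(lambda s: ping(s, timeout), servers))
    # keep only the servers that answered within the limit, with their latency
    return [(s, d) for s, d in zip(servers, latencies) if 0 < d <= timeout * 1000]
```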
And I need to resolve A records for each inputted address per operation using all those 1518 servers, to have a chance of finding an IP address that isn't blocked or slowed down. So, how do I resolve DNS A records asynchronously against a massive number of name servers?
Update
OK, now I have looked at `asyncio`, `concurrent.futures.ThreadPoolExecutor`, and `dns.asyncresolver`, and I think they are exactly what I am looking for, but there are still some things I don't quite understand yet.
I am thinking about using 4 concurrent thread pools, each run synchronously 4 times (to get 4 addresses per server, since I can only get 1 address per server now and Google isn't of any help), each with a maximum size of 4, where each task is an execution of the asynchronous DNS querying function with 32 servers.
Here is what I come up with:
```python
def split_sixteen(series):
    # split `series` into 4 groups of 4 roughly equal chunks (16 in total)
    length = len(series)
    p4 = -(-length // 4)    # ceiling division
    p16 = -(-length // 16)
    indices = [(0, 1), (1, 2), (2, 3), (3, 4)]
    return [[series[p4*a:p4*b][p16*c:p16*d] for c, d in indices] for a, b in indices]

class Assigner:
    # hands out tasks in batches of `batch` until the list is exhausted
    def __init__(self, tasks, batch=32) -> None:
        self.tasks = tasks
        self.length = len(tasks)
        self.index = 0
        self.batch = batch
        self.all_assigned = False

    def assign(self):
        if not self.all_assigned:
            start = self.index
            if self.index + self.batch <= self.length:
                self.index += self.batch
            else:
                self.index = self.length
            if self.index == self.length:
                self.all_assigned = True
            return self.tasks[start:self.index]
        else:
            raise StopIteration('All tasks have been assigned')
```
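(For what it's worth, the same batching could be a plain generator; a tiny sketch, with `batches` being a name I just made up:)

```python
def batches(tasks, size=32):
    # yield consecutive slices of `tasks`, the last one possibly shorter
    for i in range(0, len(tasks), size):
        yield tasks[i:i + size]
```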
I don't know what function the thread pools should run. I think the function should have a while loop: until the assigner is exhausted, it takes up to 32 servers from the assigner and submits them to the thread pool if there aren't already 4 coroutines running; otherwise it waits for one of the routines to finish before launching another. After the loop ends, the function should wait for the remaining routines to finish and combine their results, and then the results from all 4 thread pools should be combined...
I don't know how to make all of these work together...
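To make the question concrete, here is the kind of thing I am after, reduced to a single event loop; a minimal sketch assuming dnspython 2.x (`dns.asyncresolver`), where a semaphore bounds the in-flight queries instead of the thread pools (`query_one`, `resolve_all`, and `concurrency` are names I made up):

```python
import asyncio
import dns.asyncresolver

async def query_one(server, address, sem, timeout=2):
    # one A-record lookup against one nameserver, bounded by the semaphore
    async with sem:
        resolver = dns.asyncresolver.Resolver(configure=False)
        resolver.nameservers = [server]
        resolver.lifetime = timeout
        try:
            answer = await resolver.resolve(address, 'A')
            return {rr.address for rr in answer}  # the whole record set at once
        except Exception:
            return set()  # timeouts, SERVFAIL, NXDOMAIN... just yield nothing

async def resolve_all(servers, address, concurrency=128):
    sem = asyncio.Semaphore(concurrency)
    results = await asyncio.gather(*(query_one(s, address, sem) for s in servers))
    return sorted(set().union(*results))

# usage: ips = asyncio.run(resolve_all(IPV4_DNS, 'example.com'))
```

My understanding is that with something like this the 4x4 thread-pool layout would become unnecessary, since one event loop can keep hundreds of UDP queries in flight at once, but I may be missing something.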
And about `selenium` being faster than `requests`: shocking, I know, but it is true; truth doesn't become untrue just because you don't subjectively believe it:
```python
import os
import requests
from selenium import webdriver
from selenium.webdriver import FirefoxProfile
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.firefox.options import Options

options = Options()
options.add_argument("--headless")
options.add_argument("--log-level=3")
options.add_argument("--mute-audio")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")
options.add_argument('--disable-extensions')
options.add_argument('--disable-gpu')
capabilities = DesiredCapabilities().FIREFOX
capabilities['pageLoadStrategy'] = 'eager'
profile = FirefoxProfile(os.environ['appdata'] + '\\Mozilla\\Firefox\\Profiles\\bkpihn0o.bot')
profile.set_preference("http.response.timeout", 1)
profile.set_preference("dom.max_script_run_time", 0)
profile.set_preference('permissions.default.stylesheet', 2)
profile.set_preference('permissions.default.image', 2)
profile.set_preference('dom.ipc.plugins.enabled.libflashplayer.so', 'false')
profile.set_preference("permissions.default.script", 2)
profile.set_preference("javascript.enabled", False)

Firefox = webdriver.Firefox(capabilities=capabilities, options=options, firefox_profile=profile)
```
Maybe I didn't mention the fact that I performed these tests while connected to a VPN, and it seems that `requests` somehow didn't take advantage of the VPN.
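(If the VPN exposes itself as a local proxy rather than a system-wide tunnel, `requests` only goes through it when told to explicitly; a sketch, where the proxy URL is entirely hypothetical:)

```python
import requests

session = requests.Session()
# hypothetical local proxy endpoint exposed by the VPN client;
# SOCKS support needs `pip install requests[socks]`
session.proxies = {'http': 'socks5://127.0.0.1:1080', 'https': 'socks5://127.0.0.1:1080'}
```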