
Now I have used this:

import os

from selenium import webdriver
from selenium.webdriver import FirefoxProfile
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.firefox.options import Options

# IPV4 (the regex shown further down in this post) and is_ipv6 are small validation helpers of mine (is_ipv6 not shown here)

class DNS_LOOKUP:
    ROBTEX_IPLOOKUP = 'https://www.robtex.com/ip-lookup/'
    ROBTEX_HEAD = '//section[1]/div[3]/p/a'
    ROBTEX_TABLE = '//section[2]/div[3]/table/tbody/tr/td//a'
    NSLOOKUP_IPV4 = '//div[2]/div[1]/table/tbody/tr/td[2]/span[1]'
    NSLOOKUP_IPV6 = '//div[2]/div[2]/table/tbody/tr/td[2]/span[1]'
    NSLOOKUP_SOURCES = ['cloudflare', 'google', 'opendns', 'authoritative']
    def __init__(self):
        options = Options()
        options.add_argument("--headless")
        options.add_argument("--log-level=3")
        options.add_argument("--mute-audio")
        options.add_argument("--no-sandbox")
        options.add_argument("--disable-dev-shm-usage")
        options.add_argument('--disable-extensions')
        options.add_argument('--disable-gpu')
        capabilities = DesiredCapabilities().FIREFOX
        capabilities['pageLoadStrategy'] = 'eager'
        profile = FirefoxProfile(os.environ['appdata'] + '\\Mozilla\\Firefox\\Profiles\\bkpihn0o.bot')
        profile.set_preference("http.response.timeout", 1)
        profile.set_preference("dom.max_script_run_time", 0)
        profile.set_preference('permissions.default.stylesheet', 2)
        profile.set_preference('permissions.default.image', 2)
        profile.set_preference('dom.ipc.plugins.enabled.libflashplayer.so', 'false')
        profile.set_preference("permissions.default.script", 2)
        profile.set_preference("javascript.enabled", False)
        self.Firefox = webdriver.Firefox(capabilities=capabilities, options=options, firefox_profile=profile)
        self.AltFirefox = webdriver.Firefox(capabilities=capabilities)
    
    def _robtex(self, addr):
        self.Firefox.get(f'https://www.robtex.com/dns-lookup/{addr}')
        ips = {href.removeprefix(DNS_LOOKUP.ROBTEX_IPLOOKUP) for e in self.Firefox.find_elements('xpath', DNS_LOOKUP.ROBTEX_HEAD) if (href := e.get_attribute('href')).startswith(DNS_LOOKUP.ROBTEX_IPLOOKUP)}
        ips |= {href.removeprefix(DNS_LOOKUP.ROBTEX_IPLOOKUP) for e in self.Firefox.find_elements('xpath', DNS_LOOKUP.ROBTEX_TABLE) if (href := e.get_attribute('href')).startswith(DNS_LOOKUP.ROBTEX_IPLOOKUP)}
        ipv4, ipv6 = set(), set()
        for i in sorted(ips):
            if IPV4.match(i):
                ipv4.add(i)
            elif is_ipv6(i):
                ipv6.add(i)
        return ipv4, ipv6
    
    def _nslookup(self, addr):
        ipv4, ipv6 = set(), set()
        for source in DNS_LOOKUP.NSLOOKUP_SOURCES:
            self.AltFirefox.get(f'https://www.nslookup.io/dns-records/{addr}#{source}')
            ipv4 |= {ip for e in self.AltFirefox.find_elements('xpath', DNS_LOOKUP.NSLOOKUP_IPV4) if IPV4.match((ip := e.text))}
            ipv6 |= {ip for e in self.AltFirefox.find_elements('xpath', DNS_LOOKUP.NSLOOKUP_IPV6) if is_ipv6((ip := e.text))}
        return ipv4, ipv6
    
    def dns_query(self, addr):    
        robtex = self._robtex(addr)
        nslookup = self._nslookup(addr)
        ipv4, ipv6 = robtex
        ipv4 |= nslookup[0]
        ipv6 |= nslookup[1]
        return {'ipv4': sorted(ipv4), 'ipv6': sorted(ipv6)}

This method returns a decent number of addresses, but sadly not enough, and as you can see it uses selenium rather than requests, so it is slow. Truth be told, I have tested extensively and repeatedly, and in my environment selenium is always faster than requests, but even so its speed is far from acceptable.

I have also written this:

import dns.resolver

resolver = dns.resolver.Resolver()
resolver.nameservers = ['8.8.8.8']

def dns_resolve(address):
    # query() gives me one answer; I take the first record and repeat the query 4 times
    return sorted({resolver.query(address)[0].address for i in range(4)})

It is much faster; however, it only returns one address per server per query, so I repeated the operation four times. I want at least 4 addresses per server per query to be returned...
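
Apparently dnspython's answer object is iterable and each record has an .address attribute, so iterating over the whole answer (instead of taking only [0]) might collect several addresses from a single call; a minimal, unverified sketch using the newer resolve() API:

import dns.resolver

resolver = dns.resolver.Resolver()
resolver.nameservers = ['8.8.8.8']

def resolve_all(address):
    # iterate over the whole answer instead of indexing only the first record
    return sorted({rdata.address for rdata in resolver.resolve(address, 'A')})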

I have even written this:

import json
import requests

def manual_resolve(address):
    response = requests.get(f'https://dns.google/resolve?name={address}&type=A')
    return [i['data'] for i in json.loads(response.text)['Answer']]

It is as low-level as I can get, but as I said before, under my network conditions requests is actually slower than selenium, much slower...

So I want to know: what is the fastest way to query DNS A records using multiple servers, and by multiple I mean a massive number?

I got 5556 trustworthy name servers from here: https://public-dns.info/nameservers.csv (the file that address points to may change over time; the version I downloaded had 5556 entries), and I used this script to process the information:

import csv
import json
import ping3
import re
import pickle
import subprocess
import time
from collections import namedtuple
from datetime import datetime
from pathlib import Path

IPV4 = re.compile(r'^((25[0-5]|2[0-4]\d|1?\d\d?)\.){3}(25[0-5]|2[0-4]\d|1?\d\d?)$')

publicdns = Path('C:/Users/Estranger/Downloads/nameservers.csv').read_text(encoding='utf8').splitlines()
publicdns = list(csv.reader(publicdns))

to_date = lambda x: datetime.strptime(x, '%Y-%m-%dT%H:%M:%SZ')
Entry = namedtuple('Entry', publicdns[0])

deserializer = [str, str, int, str, str, str, str, str, bool, float, to_date, to_date]
publicdns = [Entry(*(f(v) for f, v in zip(deserializer, i))) for i in publicdns[1:]]
Path('D:/nameservers.pickle').write_bytes(pickle.dumps(publicdns, protocol=pickle.HIGHEST_PROTOCOL))
IPV4_DNS = [ipv4 for e in publicdns if e.reliability >= 0.75 and IPV4.match((ipv4 := e.ip_address))]
Path('D:/reliable_ipv4_dns.txt').write_text('\n'.join(IPV4_DNS))

def ping(addr, lim=0.5):
    return sum(d if (d := ping3.ping(addr, timeout=lim, unit='ms')) else 0 for _ in range(4)) / 4

ping_latency = []
new_servers = []

def format_delta(d):
    d = int(d)
    h, rem = divmod(d, 3600)
    m, s = divmod(rem, 60)
    return f'{h:02d}:{m:02d}:{s:02d}'

def ping_filter(condition, timeout):
    loop = 1
    servers = IPV4_DNS.copy()
    logs = []
    start = datetime.now()
    success_rate = 0
    while True:
        loop_start = datetime.now()
        total = len(servers)
        ping_latency.clear()
        new_servers.clear()
        succeeded = 0
        failed = 0
        l = len(str(total))
        for iteration, server in enumerate(servers):
            latency = ping(server, timeout)
            timestamp = datetime.now()
            elapsed = timestamp-start
            loop_elapsed = timestamp-loop_start
            eta = (loop_elapsed.total_seconds() / (iteration + 1)) * (total - iteration - 1)
            entry = {
                'timestamp': f'{timestamp:%Y-%m-%d %H:%M:%S}',
                'loop': loop,
                'loop start': f'{loop_start:%Y-%m-%d %H:%M:%S}',
                'iteration': iteration,
                'server': server,
                'success': True,
                'latency': round(latency, 2),
                'unit': 'ms',
                'total': total,
                'succeeded': succeeded,
                'failed': failed,
                'started': f'{start:%Y-%m-%d %H:%M:%S}',
                'elapsed': format_delta(elapsed.total_seconds()),
                'loop runtime': format_delta(loop_elapsed.total_seconds()),
                'ETA': format_delta(eta),
                'success rate': f'{success_rate:06.2%}'
            }
            if 0 < latency <= int(timeout*1000):
                succeeded += 1
                entry['succeeded'] += 1
                new_servers.append(server)
                ping_latency.append((server, latency))
            else:
                failed += 1
                entry['failed'] += 1
                entry['success'] = False
                entry['latency'] = 'timeout'
            if iteration == total - 1:
                success_rate = succeeded / total
                entry['success rate'] = f'{success_rate:06.2%}'
            print(json.dumps(entry, indent=4))
            logs.append(entry)
        new_total = len(new_servers)
        servers = new_servers.copy()
        if new_total == total or loop == 32:
            timestamp = datetime.now()
            elapsed = datetime.now()-start
            entry = {
                'completed': f'{timestamp:%Y-%m-%d %H:%M:%S}',
                'started': f'{start:%Y-%m-%d %H:%M:%S}',
                'elapsed': format_delta(elapsed.total_seconds()),
                'loop': loop
            }
            print(json.dumps(entry, indent=4))
            logs.append(entry)
            break
        loop += 1
    Path(f'D:/IPv4_DNS_{condition}.txt').write_text('\n'.join(servers))
    Path(f'D:/IPv4_DNS_ping_log_{condition}.json').write_text(json.dumps(logs, indent=4))
    Path(f'D:/IPv4_DNS_ping_latency_{condition}.json').write_text(json.dumps(dict(ping_latency), indent=4))

ping_filter('NOVPN', 0.3)

It takes more than 24 hours to complete, and in the end I was left with 1518 servers.
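
(Presumably the ping filtering itself could be sped up by pinging servers concurrently instead of one by one; a rough, untested sketch reusing the ping() helper above, with a hypothetical worker count:)

from concurrent.futures import ThreadPoolExecutor

def ping_all(servers, timeout=0.3, workers=64):
    # ping many servers in parallel; workers=64 is a hypothetical choice
    with ThreadPoolExecutor(max_workers=workers) as pool:
        latencies = pool.map(lambda s: ping(s, timeout), servers)
    # keep only the servers that answered within the timeout (ping() returns average ms, 0 on failure)
    return [(s, lat) for s, lat in zip(servers, latencies) if 0 < lat <= timeout * 1000]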

And I need to resolve A records for each input address, per operation, using all 1518 of those servers to have a chance of finding an IP address that isn't blocked or throttled. So: how do I resolve DNS A records asynchronously against a massive number of name servers?

Update


OK, now I have looked at asyncio, concurrent.futures.ThreadPoolExecutor and dns.asyncresolver, and I think they are exactly what I am looking for, but there are still some things I don't quite understand yet.

I am thinking about using 4 concurrent thread pools, each run synchronously 4 times (to get 4 addresses per server, since I can currently only get 1 address per server and Google isn't any help), each with a maximum size of 4, and each task being an execution of the asynchronous DNS-querying function with 32 servers.

Here is what I come up with:

def split_sixteen(series):
    # split a list into 4 groups of 4 chunks each (16 roughly equal chunks in total)
    length = len(series)
    p4 = -(-length // 4)    # ceil(length / 4)
    p16 = -(-length // 16)  # ceil(length / 16)
    indices = [(0, 1), (1, 2), (2, 3), (3, 4)]
    return [[series[p4*a:p4*b][p16*c:p16*d] for c, d in indices] for a, b in indices]
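
For example, on a list of 32 items it yields 4 groups of 4 chunks of 2 items each:

>>> split_sixteen(list(range(32)))
[[[0, 1], [2, 3], [4, 5], [6, 7]],
 [[8, 9], [10, 11], [12, 13], [14, 15]],
 [[16, 17], [18, 19], [20, 21], [22, 23]],
 [[24, 25], [26, 27], [28, 29], [30, 31]]]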


class Assigner:
    def __init__(self, tasks, batch=32) -> None:
        self.tasks = tasks
        self.length = len(tasks)
        self.index = 0
        self.batch = batch
        self.all_assigned = False
    
    def assign(self):
        if not self.all_assigned:
            start = self.index
            if self.index + self.batch <= self.length:
                self.index += self.batch
            else:
                self.index = self.length
            if self.index == self.length:
                self.all_assigned = True
            return self.tasks[start:self.index]
        else:
            raise StopIteration('All tasks have been assigned')

I don't know what function the thread pools should run. I think the function should have a while loop that runs until the assigner is exhausted: each pass takes up to 32 servers from the assigner and submits them to the thread pool if there aren't already 4 coroutines running, otherwise it waits for one of them to finish before launching another. After the loop ends, the function should wait for the remaining routines to finish and combine their results, and finally the results from all 4 thread pools should be combined...

I don't know how to make all of these work together...
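
The best I can picture is something along these lines (a rough, untested sketch that simplifies my idea to a single pool of 4 workers, using dns.asyncresolver and the Assigner above), but I am not sure it is sound:

import asyncio
from concurrent.futures import ThreadPoolExecutor

import dns.asyncresolver

async def query_batch(address, batch):
    # one asynchronous A-record query per server in the batch; failures are simply skipped
    async def one(server):
        resolver = dns.asyncresolver.Resolver()
        resolver.nameservers = [server]
        try:
            answer = await resolver.resolve(address, 'A')
            return {record.address for record in answer}
        except Exception:
            return set()
    results = await asyncio.gather(*(one(server) for server in batch))
    return set().union(*results)

def resolve_with_servers(address, servers, batch=32, workers=4):
    # hand out batches of 32 servers to a small thread pool; each worker runs one
    # asyncio event loop over its batch, and the results are merged at the end
    assigner = Assigner(servers, batch)
    futures, addresses = [], set()
    with ThreadPoolExecutor(max_workers=workers) as pool:
        while not assigner.all_assigned:
            futures.append(pool.submit(asyncio.run, query_batch(address, assigner.assign())))
        for future in futures:
            addresses |= future.result()
    return sorted(addresses)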

And about selenium being faster than requests: shocking, I know, but it is true; the truth doesn't become untrue just because you don't subjectively believe it:

import os
import requests
from selenium import webdriver
from selenium.webdriver import FirefoxProfile
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.firefox.options import Options

options = Options()
options.add_argument("--headless")
options.add_argument("--log-level=3")
options.add_argument("--mute-audio")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")
options.add_argument('--disable-extensions')
options.add_argument('--disable-gpu')
capabilities = DesiredCapabilities().FIREFOX
capabilities['pageLoadStrategy'] = 'eager'
profile = FirefoxProfile(os.environ['appdata'] + '\\Mozilla\\Firefox\\Profiles\\bkpihn0o.bot')
profile.set_preference("http.response.timeout", 1)
profile.set_preference("dom.max_script_run_time", 0)
profile.set_preference('permissions.default.stylesheet', 2)
profile.set_preference('permissions.default.image', 2)
profile.set_preference('dom.ipc.plugins.enabled.libflashplayer.so', 'false')
profile.set_preference("permissions.default.script", 2)
profile.set_preference("javascript.enabled", False)
Firefox = webdriver.Firefox(capabilities=capabilities, options=options, firefox_profile=profile)

[screenshot of the timing results]

Maybe I didn't mention the fact that I performed these tests while connected to a VPN, and it seems that requests somehow didn't take advantage of the VPN.

    It seems all your long post can just be summarized with "so how to resolve DNS A records asynchronously with a massive number of name servers?" and FWIW I believe the first few paragraphs to be completely unneeded here and offtopic. As for the real technical question, what did you try exactly? Did you look at threads or multiprocess? Did you look at asynchronous DNS queries (see https://dnspython.readthedocs.io/en/stable/async.html to start)? – Patrick Mevzek Dec 04 '21 at 19:13
  • "selenium is always faster than requests." You are not showing really how you use requests and providing real data so it is hard to believe such an assertion. – Patrick Mevzek Dec 04 '21 at 19:15
  • First, you have to import `dns.resolver` and then you can say `resolver = dns.resolver.Resolver()`. Second, and most important, `resolver.query(address)` returns a list of IP addresses so `addr = resolver.query(address); ip_addresses = [addr[i].address for i in range(len(addr))]` will give you a list of the IP addresses returned with a single query. Now it well may be that it does not return *all* the possible IP addresses and if you make multiple calls you may get additional IP addresses, but you should not ignore the ones that are returned. – Booboo Dec 08 '21 at 14:38
  • "And about selenium faster than requests, shocking, I know, but it is True, truth doesn't become untrue if you don't subjectively believe it:" The plural of anecdote is not data, and your tests are failing any coherency as you yourself say: "I performed these tests while connected to a VPN, and it seems that requests somehow didn't take advantages of the VPN." So you are not even comparing two equal things. – Patrick Mevzek Dec 12 '21 at 05:40
  • I don't see why adding more and more DNS servers could give better performance. You should measure the response time and select a few fast-responding servers. The only problem will be rate limiting - the QPS (queries per second). For example Google's public DNS allows 1500 QPS. But how many QPS do you need? Please provide some estimation. – VPfB Dec 13 '21 at 09:00
  • @Booboo A DNS name to address query returns the complete set of addresses each time, only the order can differ. The set may change over time, but the answer is complete. No additional queries for additional addresses should be made. – VPfB Dec 13 '21 at 09:12

1 Answer


See my comment to your post.

I find it difficult to believe that selenium can outperform calling a DNS server directly. After all, selenium would be using the same network making its GET requests.

I installed dnspython under Windows and did some benchmarking using a thread pool whose size was equal to the number of domains I was trying to resolve, which was 750, consisting of 15 distinct domains each repeated 50 times. I then created a small list of DNS servers from the CSV file the OP referenced. I also attempted to resolve the domains using asyncio, but could not get this to work. These were the specific benchmarks:

  1. test1: All the domains were resolved concurrently using a server list consisting of the single DNS server '8.8.8.8'. A given domain, such as bonappetit.com, might resolve to different IP addresses on different resolution requests even though the same DNS server address was used for every request. The time to resolve all requests was 1.56 seconds.

  2. test2: For this benchmark each request was given a server list containing a single DNS server chosen round-robin from the list of DNS servers. I quickly discovered that many of the DNS server addresses were not particularly reliable, caused timeout exceptions, and had to be removed from the server list and the benchmark rerun. The time to resolve all requests was 5.57 seconds.

  3. test3: For this benchmark each request was given the complete list of DNS servers excluding '8.8.8.8'. The benchmark time was 3.54 seconds. Not shown is a benchmark where I provided the full list of servers including '8.8.8.8'; its time was essentially identical to test1's. I repeated the benchmark with '8.8.8.8' as the last entry in the list, and the running time was consistently only a few hundredths of a second slower than test1, where it was the first entry. I have since rerun this benchmark again excluding '8.8.8.8' and now see running times comparable to test1, which suggests that the servers I happened to be using, other than '8.8.8.8', have very variable response times even when they do not time out.

  4. test4: I used a completely different method of DNS resolution, namely socket.gethostbyname. This produced by far the fastest benchmark time, 0.27 seconds, and for each domain only a single IP address was ever returned. I believe both results can probably be explained by Windows caching the results.

  5. test5: This was an attempt to use asyncio with a single DNS server, '8.8.8.8', and it timed out. I am not sure why.

Conclusion

First, your 5556 DNS servers are not equally trustworthy, and from minute to minute their trustworthiness changes. I would experiment with the servers that are geographically located near you to determine the most trustworthy, and put a few in the server list keeping '8.8.8.8' as the first entry. To make this clearer, I am talking about the code used in benchmark test3, but without excluding server '8.8.8.8'.
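
In other words, something like this tiny sketch (the nearby picks are hypothetical, taken from the benchmark's server list):

import dns.resolver

nearby_servers = ['195.99.66.220', '38.132.106.139']  # hypothetical nearby, reliable picks
resolver = dns.resolver.Resolver()
resolver.nameservers = ['8.8.8.8'] + nearby_servers   # keep '8.8.8.8' as the first entry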

Second, I can see no reason to create more than one thread pool. But there is some maximum size you should not exceed. Certainly 500 should not be a problem. But if you can get asyncio to work, you should be able to gather thousands of tasks without a problem.

import dns.resolver
import asyncio
import dns.asyncresolver
import socket
import time
from multiprocessing.pool import ThreadPool



l = [
    'yahoo.com',
    'cnn.com',
    'ibm.com',
    'nytimes.com',
    'stackoverflow.com',
    'tcm.com',
    'wqxr.org',
    'wahingtonpost.com',
    'theatlantic.com',
    'nymag.com',
    'newyorker.com',
    'bonappetit.com',
    'seriouseats.com',
    'foodtv.com',
    'food52.com',
]

domains = []
for _ in range(50):
    domains += l


servers = [
    '8.8.8.8',
    '8.0.7.0',
    '8.0.6.0',
    '195.99.66.220',
    '38.132.106.139',
]


def test1(pool):
    def resolve(domain):
        resolver = dns.resolver.Resolver()
        resolver.nameservers = ['8.8.8.8']
        return (domain, resolver.resolve(domain)[0].address)

    return pool.map(resolve, domains)

def test2(pool):
    def resolve(idx, domain):
        resolver = dns.resolver.Resolver()
        i = idx % len(servers)
        resolver.nameservers = [servers[i]]
        try:
            return (domain, resolver.resolve(domain)[0].address)
        except Exception as e:
            print(e, servers[i])
            return None

    return pool.starmap(resolve, enumerate(domains))


def test3(pool):
    def resolve(domain):
        resolver = dns.resolver.Resolver()
        resolver.nameservers = servers[1:] # omit '8.8.8.8'
        return (domain, resolver.resolve(domain)[0].address)

    return pool.map(resolve, domains)


def test4(pool):
    def resolve(domain):
        return (domain, socket.gethostbyname(domain))

    return pool.map(resolve, domains)

async def test5():
    async def resolve(domain):
        resolver = dns.asyncresolver.Resolver()
        resolver.nameservers = ['8.8.8.8']
        addr = await resolver.resolve(domain)
        return (domain, addr[0].address)

    return await asyncio.gather(*(resolve(domain) for domain in domains))


pool = ThreadPool(len(domains))

def benchmark(fun):
    print()
    print(fun.__name__)
    start = time.time()
    results = fun(pool)
    print(time.time() - start)
    print(sorted(set(results)))


benchmark(test1)
benchmark(test2)
benchmark(test3)
benchmark(test4)

print()
print('test5')
start = time.time()
results = asyncio.run(test5())
print(time.time() - start)
print(sorted(set(results)))

Prints:

test1
1.5600032806396484
[('bonappetit.com', '151.101.0.239'), ('bonappetit.com', '151.101.192.239'), ('bonappetit.com', '151.101.64.239'), ('cnn.com', '151.101.1.67'), ('cnn.com', '151.101.129.67'), ('cnn.com', '151.101.65.67'), ('food52.com', '104.18.166.45'), ('food52.com', '104.18.174.13'), ('foodtv.com', '67.199.248.12'), ('foodtv.com', '67.199.248.13'), ('ibm.com', '184.29.179.199'), ('newyorker.com', '151.101.0.239'), ('newyorker.com', '151.101.128.239'), ('newyorker.com', '151.101.192.239'), ('newyorker.com', '151.101.64.239'), ('nymag.com', '151.101.130.133'), ('nymag.com', '151.101.194.133'), ('nymag.com', '151.101.2.133'), ('nymag.com', '151.101.66.133'), ('nytimes.com', '151.101.1.164'), ('nytimes.com', '151.101.129.164'), ('nytimes.com', '151.101.193.164'), ('nytimes.com', '151.101.65.164'), ('seriouseats.com', '151.101.2.137'), ('stackoverflow.com', '151.101.1.69'), ('stackoverflow.com', '151.101.129.69'), ('stackoverflow.com', '151.101.65.69'), ('tcm.com', '104.127.162.10'), ('theatlantic.com', '151.101.130.133'), ('theatlantic.com', '151.101.194.133'), ('theatlantic.com', '151.101.2.133'), ('theatlantic.com', '151.101.66.133'), ('wahingtonpost.com', '198.72.14.16'), ('wqxr.org', '44.194.174.151'), ('wqxr.org', '54.144.182.133'), ('yahoo.com', '74.6.143.25'), ('yahoo.com', '74.6.143.26'), ('yahoo.com', '74.6.231.21'), ('yahoo.com', '98.137.11.163')]

test2
5.566321611404419
[('bonappetit.com', '151.101.0.239'), ('bonappetit.com', '151.101.128.239'), ('bonappetit.com', '151.101.192.239'), ('bonappetit.com', '151.101.64.239'), ('cnn.com', '151.101.1.67'), ('cnn.com', '151.101.129.67'), ('cnn.com', '151.101.193.67'), ('cnn.com', '151.101.65.67'), ('food52.com', '104.18.166.45'), ('foodtv.com', '67.199.248.12'), ('foodtv.com', '67.199.248.13'), ('ibm.com', '23.218.185.219'), ('newyorker.com', '151.101.0.239'), ('newyorker.com', '151.101.128.239'), ('newyorker.com', '151.101.192.239'), ('newyorker.com', '151.101.64.239'), ('nymag.com', '151.101.130.133'), ('nytimes.com', '151.101.1.164'), ('nytimes.com', '151.101.129.164'), ('nytimes.com', '151.101.193.164'), ('nytimes.com', '151.101.65.164'), ('seriouseats.com', '151.101.130.137'), ('seriouseats.com', '151.101.194.137'), ('seriouseats.com', '151.101.2.137'), ('seriouseats.com', '151.101.66.137'), ('stackoverflow.com', '151.101.1.69'), ('tcm.com', '104.127.162.10'), ('theatlantic.com', '151.101.130.133'), ('theatlantic.com', '151.101.194.133'), ('theatlantic.com', '151.101.2.133'), ('theatlantic.com', '151.101.66.133'), ('wahingtonpost.com', '198.72.14.16'), ('wqxr.org', '44.194.174.151'), ('wqxr.org', '54.144.182.133'), ('yahoo.com', '74.6.143.26'), ('yahoo.com', '74.6.231.21'), ('yahoo.com', '98.137.11.163')]

test3
3.536404609680176
[('bonappetit.com', '151.101.0.239'), ('bonappetit.com', '151.101.128.239'), ('bonappetit.com', '151.101.192.239'), ('bonappetit.com', '151.101.64.239'), ('cnn.com', '151.101.1.67'), ('cnn.com', '151.101.129.67'), ('cnn.com', '151.101.193.67'), ('cnn.com', '151.101.65.67'), ('food52.com', '104.18.166.45'), ('food52.com', '104.18.174.13'), ('foodtv.com', '67.199.248.12'), ('foodtv.com', '67.199.248.13'), ('ibm.com', '23.218.185.219'), ('newyorker.com', '151.101.0.239'), ('newyorker.com', '151.101.128.239'), ('newyorker.com', '151.101.192.239'), ('newyorker.com', '151.101.64.239'), ('nymag.com', '151.101.130.133'), ('nymag.com', '151.101.194.133'), ('nymag.com', '151.101.2.133'), ('nymag.com', '151.101.66.133'), ('nytimes.com', '151.101.1.164'), ('nytimes.com', '151.101.129.164'), ('nytimes.com', '151.101.193.164'), ('nytimes.com', '151.101.65.164'), ('seriouseats.com', '151.101.130.137'), ('seriouseats.com', '151.101.194.137'), ('seriouseats.com', '151.101.2.137'), ('seriouseats.com', '151.101.66.137'), ('stackoverflow.com', '151.101.1.69'), ('stackoverflow.com', '151.101.129.69'), ('stackoverflow.com', '151.101.193.69'), ('stackoverflow.com', '151.101.65.69'), ('tcm.com', '23.75.199.121'), ('theatlantic.com', '151.101.130.133'), ('theatlantic.com', '151.101.194.133'), ('theatlantic.com', '151.101.2.133'), ('theatlantic.com', '151.101.66.133'), ('wahingtonpost.com', '198.72.14.16'), ('wqxr.org', '44.194.174.151'), ('wqxr.org', '54.144.182.133'), ('yahoo.com', '74.6.143.25'), ('yahoo.com', '74.6.143.26'), ('yahoo.com', '74.6.231.20'), ('yahoo.com', '74.6.231.21'), ('yahoo.com', '98.137.11.163'), ('yahoo.com', '98.137.11.164')]

test4
0.33908557891845703
[('bonappetit.com', '151.101.64.239'), ('cnn.com', '151.101.129.67'), ('food52.com', '104.18.174.13'), ('foodtv.com', '67.199.248.12'), ('ibm.com', '104.104.121.251'), ('newyorker.com', '151.101.192.239'), ('nymag.com', '151.101.130.133'), ('nytimes.com', '151.101.193.164'), ('seriouseats.com', '151.101.66.137'), ('stackoverflow.com', '151.101.65.69'), ('tcm.com', '104.127.162.10'), ('theatlantic.com', '151.101.2.133'), ('wahingtonpost.com', '198.72.14.16'), ('wqxr.org', '44.194.174.151'), ('yahoo.com', '98.137.11.164')]

test5
Traceback (most recent call last):
    ...
    addr = await resolver.resolve(domain)
  File "C:\Booboo\test\test_venv\lib\site-packages\dns\asyncresolver.py", line 74, in resolve
    timeout = self._compute_timeout(start, lifetime)
  File "C:\Booboo\test\test_venv\lib\site-packages\dns\resolver.py", line 997, in _compute_timeout
    raise Timeout(timeout=duration)
dns.exception.Timeout: The DNS operation timed out after 5.38653302192688 seconds

Update

I came across this post, which describes a bug in the PyPI version of dnspython that results in the async resolver timing out. The resolution is to install the latest version from GitHub:

pip install -U https://github.com/rthalley/dnspython/archive/master.zip

I reran the benchmarks, this time increasing the number of domains to be resolved to 3,000, which is frankly a lot of threads to be creating, so I set an upper bound of 500 on the size of the thread pool (and am now including the time to create the thread pool in the timings), and I gave the full DNS server list to both the multithreading benchmark and the asyncio benchmark (a total of two benchmarks). I also made an improvement whereby the resolver instance is created only once and reused for all requests. In addition, all the IP addresses returned by a single query are added to a set maintained in a dictionary keyed by the domain, in case multiple requests are made against the same domain (as is the case in my benchmark), so we end up with the set of unique IP addresses ultimately found. It is this final dictionary that is returned.

The results:

multithreading: 1.99 seconds
asyncio: 3.43 seconds

This was surprising, as I thought that with a large number of domains to process and the size of my thread pool limited, the asyncio version would be more performant. Multithreading clearly seems the way to go.

import dns.resolver
import asyncio
import dns.asyncresolver
import socket
import time
from multiprocessing.pool import ThreadPool


l = [
    'yahoo.com',
    'cnn.com',
    'ibm.com',
    'nytimes.com',
    'stackoverflow.com',
    'tcm.com',
    'wqxr.org',
    'wahingtonpost.com',
    'theatlantic.com',
    'nymag.com',
    'newyorker.com',
    'bonappetit.com',
    'seriouseats.com',
    'foodtv.com',
    'food52.com',
]

domains = []
for _ in range(200):
    domains += l

servers = [
    '8.8.8.8',
    '8.0.7.0',
    '8.0.6.0',
    '195.99.66.220',
    '38.132.106.139',
]


def threading_test(pool):
    resolver = dns.resolver.Resolver()
    resolver.nameservers = servers
    ip_addresses = {}

    def resolve(domain):
        results = resolver.resolve(domain)
        s = ip_addresses.setdefault(domain, set())
        for result in results:
            s.add(result.address)

    pool.map(resolve, domains)
    return ip_addresses


async def async_test():
    resolver = dns.asyncresolver.Resolver()
    resolver.nameservers = servers
    ip_addresses = {}

    async def resolve(domain):
        results = await resolver.resolve(domain)
        s = ip_addresses.setdefault(domain, set())
        for result in results:
            s.add(result.address)

    await asyncio.gather(*(resolve(domain) for domain in domains))
    return ip_addresses


print('threading_test')
start = time.time()
pool = ThreadPool(min(500, len(domains)))
results = threading_test(pool)
print(time.time() - start)
for k in sorted(results.keys()):
    print(k, sorted(results[k]))

print()
print('async_test')
start = time.time()
results = asyncio.run(async_test())
print(time.time() - start)
for k in sorted(results.keys()):
    print(k, sorted(results[k]))

Prints:

threading_test
1.9919934272766113
bonappetit.com ['151.101.0.239', '151.101.128.239', '151.101.192.239', '151.101.64.239']
cnn.com ['151.101.1.67', '151.101.129.67', '151.101.193.67', '151.101.65.67']
food52.com ['104.18.166.45', '104.18.174.13']
foodtv.com ['67.199.248.12', '67.199.248.13']
ibm.com ['184.29.179.199']
newyorker.com ['151.101.0.239', '151.101.128.239', '151.101.192.239', '151.101.64.239']
nymag.com ['151.101.130.133', '151.101.194.133', '151.101.2.133', '151.101.66.133']
nytimes.com ['151.101.1.164', '151.101.129.164', '151.101.193.164', '151.101.65.164']
seriouseats.com ['151.101.130.137', '151.101.194.137', '151.101.2.137', '151.101.66.137']
stackoverflow.com ['151.101.1.69', '151.101.129.69', '151.101.193.69', '151.101.65.69']
tcm.com ['104.127.162.10']
theatlantic.com ['151.101.130.133', '151.101.194.133', '151.101.2.133', '151.101.66.133']
wahingtonpost.com ['198.72.14.16']
wqxr.org ['44.194.174.151', '54.144.182.133']
yahoo.com ['74.6.143.25', '74.6.143.26', '74.6.231.20', '74.6.231.21', '98.137.11.163', '98.137.11.164']

async_test
3.437023878097534
bonappetit.com ['151.101.0.239', '151.101.128.239', '151.101.192.239', '151.101.64.239']
cnn.com ['151.101.1.67', '151.101.129.67', '151.101.193.67', '151.101.65.67']
food52.com ['104.18.166.45', '104.18.174.13']
foodtv.com ['67.199.248.12', '67.199.248.13']
ibm.com ['184.29.179.199', '23.218.185.219']
newyorker.com ['151.101.0.239', '151.101.128.239', '151.101.192.239', '151.101.64.239']
nymag.com ['151.101.130.133', '151.101.194.133', '151.101.2.133', '151.101.66.133']
nytimes.com ['151.101.1.164', '151.101.129.164', '151.101.193.164', '151.101.65.164']
seriouseats.com ['151.101.130.137', '151.101.194.137', '151.101.2.137', '151.101.66.137']
stackoverflow.com ['151.101.1.69', '151.101.129.69', '151.101.193.69', '151.101.65.69']
tcm.com ['104.127.162.10', '23.79.32.175']
theatlantic.com ['151.101.130.133', '151.101.194.133', '151.101.2.133', '151.101.66.133']
wahingtonpost.com ['198.72.14.16']
wqxr.org ['44.194.174.151', '54.144.182.133']
yahoo.com ['74.6.143.25', '74.6.143.26', '74.6.231.20', '74.6.231.21', '98.137.11.163', '98.137.11.164']