I've used a small function to find sitemaps by trying the most common names.
Sitemap naming stats:
https://dret.typepad.com/dretblog/2009/02/sitemap-names.html
def get_sitemap_bruto_force(website):
    """Probe *website* for a sitemap at the most common sitemap paths.

    Parameters
    ----------
    website : str
        Base URL of the site (no trailing slash), e.g. "https://example.com".

    Returns
    -------
    list[str] | None
        A single-element list containing the resolved URL of the first
        candidate that answered HTTP 200, or None when no candidate matched.
    """
    # Candidate names ordered by observed frequency; see
    # https://dret.typepad.com/dretblog/2009/02/sitemap-names.html
    potential_sitemaps = [
        "sitemap.xml",
        "feeds/posts/default?orderby=updated",
        "sitemap.xml.gz",
        "sitemap_index.xml",
        "s2/sitemaps/profiles-sitemap.xml",
        "sitemap.php",
        "sitemap_index.xml.gz",
        "vb/sitemap_index.xml.gz",
        "sitemapindex.xml",
        "sitemap.gz",
    ]
    for sitemap in potential_sitemaps:
        try:
            # A timeout keeps one unresponsive host from stalling the
            # whole crawl; the original call could block indefinitely.
            sitemap_response = requests.get(f"{website}/{sitemap}", timeout=10)
        except requests.exceptions.RequestException:
            # A network failure on one candidate shouldn't abort the scan.
            # (Narrowed from a bare except, which also swallowed
            # KeyboardInterrupt/SystemExit.)
            continue
        if sitemap_response.status_code == 200:
            # Return the *final* URL so any redirects are resolved for the caller.
            return [sitemap_response.url]
    # Explicitly signal that no common sitemap path matched.
    return None
Once I retrieve the sitemap index, I'll send it to a function that finds all links from all sitemaps.
def dig_up_all_sitemaps(website):
    """Collect every sitemap URL reachable from *website*'s sitemap index.

    Starting from the sitemaps returned by get_sitemap_paths_for_domain,
    each sitemap is fetched and any child links that look like sitemaps
    (.xml / .xml.gz / .gz) are followed, breadth-first, until no new
    sitemaps are discovered.

    Parameters
    ----------
    website : str
        Base URL of the site to crawl.

    Returns
    -------
    list[str]
        All discovered sitemap URLs, in discovery order.
    """
    # Raw string fixes the invalid-escape warning of "\.xml..."; the dot
    # in ".gz" is escaped so it no longer matches an arbitrary character.
    sitemap_like = re.compile(r"\.xml|\.xml\.gz|\.gz$")
    sitemaps = []
    seen = set()  # guards against sitemap cycles (A -> B -> A), which
                  # made the original recursion unbounded
    to_crawl = get_sitemap_paths_for_domain(website)
    while to_crawl:
        discovered = []
        for sitemap in to_crawl:
            if sitemap in seen:
                continue  # already crawled; don't follow it again
            seen.add(sitemap)
            try:
                children = get_sitemap_links(sitemap)
            except Exception:
                # Best-effort: skip sitemaps that fail to fetch or parse.
                # (Narrowed from a bare except.)
                continue
            discovered.extend(x for x in children if sitemap_like.search(x))
        sitemaps.extend(discovered)
        to_crawl = discovered
    return sitemaps
get_sitemap_paths_for_domain returns a list of sitemap URLs for the given domain.