import sys
import time
from xml.etree.ElementTree import fromstring

import requests
from lxml import html

# Page title the script treats as "server is throttling us, try again".
UNAVAILABLE_TITLE = '503 Service Temporarily Unavailable'
# Upper bound on fetch attempts per page, so a permanently failing URL
# cannot stall the whole crawl.
MAX_ATTEMPTS = 5
# Base pause between attempts; multiplied by the attempt number (linear backoff).
RETRY_DELAY_SECONDS = 2.0

TITLE_XPATH = '/html/head/title'
META_XPATH = '/html/body/div[1]/div/article/div/div/div/div/div[1]/p[1]'
DESCRIPTION_XPATH = '/html/body/div[1]/div/article/div/div/div/div/div[1]/p[2]'


def getContentFromURL(url, timeout=30):
    """Fetch *url* with an HTTP GET and return the response body as text.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up. Without
            it, requests would wait indefinitely on a stalled connection.

    Returns:
        The decoded response body (``Response.text``). The HTTP status code
        is not inspected, so error pages are returned like any other body.

    Raises:
        requests.RequestException: On connection failures or timeouts.
    """
    return requests.get(url, timeout=timeout).text


def _firstText(tree, path):
    """Return ``.text`` of the first node matching *path*, or None if none match."""
    nodes = tree.xpath(path)
    return nodes[0].text if nodes else None


def _fetchPage(url):
    """Download and parse *url*, retrying while the server looks unavailable.

    An attempt counts as a miss when the request raises or when the page
    title starts with ``UNAVAILABLE_TITLE``. Misses are reported with a
    ``MISS`` line on stdout and followed by a growing pause.

    Returns:
        A ``(tree, title)`` tuple for the first usable page, or None when
        all ``MAX_ATTEMPTS`` attempts were misses.
    """
    for attempt in range(1, MAX_ATTEMPTS + 1):
        try:
            content = getContentFromURL(url)
        except requests.RequestException as error:
            print('MISS')
            print(f'request for {url} failed: {error}', file=sys.stderr)
        else:
            tree = html.fromstring(content)
            title = _firstText(tree, TITLE_XPATH)
            # A page without a title is not the throttling page; accept it.
            if title is None or not title.startswith(UNAVAILABLE_TITLE):
                return tree, title
            print('MISS')
        if attempt < MAX_ATTEMPTS:
            time.sleep(RETRY_DELAY_SECONDS * attempt)
    return None


sitemapUrl = 'https://scanlibs.com/sitemap.xml'
sitemapContent = getContentFromURL(sitemapUrl)
sitemapXML = fromstring(sitemapContent)

# The first <url> entry is deliberately skipped, as in the original script
# (presumably the site root rather than an article -- TODO confirm).
for entry in sitemapXML.findall('{*}url')[1:]:
    loc = entry.findtext('{*}loc')
    print(loc)
    print(entry.findtext('{*}lastmod'))

    # Not every entry is guaranteed to carry an image element; print None
    # instead of failing on the missing node.
    image = entry.find('{*}image')
    print(image.findtext('{*}loc') if image is not None else None)

    page = _fetchPage(loc)
    if page is None:
        print(f'giving up on {loc} after {MAX_ATTEMPTS} attempts',
              file=sys.stderr)
        print()
        continue

    tree, title = page
    print(title)

    meta = _firstText(tree, META_XPATH)
    print(meta)

    description = _firstText(tree, DESCRIPTION_XPATH)
    print(description)

    print()