"""Print details of the pages listed in the scanlibs.com sitemap.

For each sitemap entry (except the first) the script prints the entry's
location, last-modification value and image location(s), then downloads the
page and prints its title and the text of two paragraphs located by fixed
XPath expressions, followed by a blank line.
"""

import re
import time

import requests
import xmltodict
from lxml import html

# Seconds to wait for a server response before giving up on a request.
# Without a timeout, requests can block indefinitely on a stalled connection.
REQUEST_TIMEOUT = 30

# How many times a page is requested before it is skipped, and how long to
# pause between attempts so an overloaded server is not hammered.
MAX_ATTEMPTS = 5
RETRY_DELAY = 2

# Title of the placeholder page that is treated as a failed download.
UNAVAILABLE_TITLE = '503 Service Temporarily Unavailable'

TITLE_XPATH = '/html/head/title'
META_XPATH = '/html/body/div[1]/div/article/div/div/div/div/div[1]/p[1]'
DESCRIPTION_XPATH = '/html/body/div[1]/div/article/div/div/div/div/div[1]/p[2]'


def getContentFromURL(url, timeout=REQUEST_TIMEOUT):
    """Return the body of the response to a GET request for *url* as text.

    Args:
        url: Address to download.
        timeout: Seconds to wait for the server before requests raises a
            timeout error.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    return requests.get(url, timeout=timeout).text


def _as_list(value):
    """Return *value* as a list: ``None`` -> ``[]``, a non-list -> ``[value]``.

    xmltodict yields a single mapping for an element that occurs once and a
    list for an element that occurs several times; this evens out both cases.
    """
    if value is None:
        return []
    if isinstance(value, list):
        return value
    return [value]


def _first_text(tree, xpath):
    """Return the ``.text`` of the first node matching *xpath*, or ``None``."""
    matches = tree.xpath(xpath)
    return matches[0].text if matches else None


def _fetch_page(url):
    """Download and parse *url*, retrying while the page is unavailable.

    An attempt counts as a miss (and prints ``MISS``) when the request fails,
    the body is empty, or the page title starts with ``UNAVAILABLE_TITLE``.

    Returns:
        A ``(tree, title)`` pair on success, or ``(None, None)`` once
        ``MAX_ATTEMPTS`` attempts have all missed.
    """
    for attempt in range(1, MAX_ATTEMPTS + 1):
        try:
            content = getContentFromURL(url)
        except requests.RequestException:
            content = ''
        # lxml refuses to parse an empty document, so treat it as a miss.
        if content.strip():
            tree = html.fromstring(content)
            title = _first_text(tree, TITLE_XPATH)
            if title is None or not title.startswith(UNAVAILABLE_TITLE):
                return tree, title
        print('MISS')
        if attempt < MAX_ATTEMPTS:
            time.sleep(RETRY_DELAY)
    return None, None


def _report_entry(entry):
    """Print the sitemap fields of *entry* and the details of its page."""
    loc = entry['loc']
    print(loc)
    print(entry.get('lastmod'))
    for image in _as_list(entry.get('image:image')):
        print(image.get('image:loc'))

    tree, title = _fetch_page(loc)
    if tree is None:
        # Page stayed unavailable; close the record and move on.
        print()
        return

    print(title)
    print(_first_text(tree, META_XPATH))
    print(_first_text(tree, DESCRIPTION_XPATH))
    print()


sitemapUrl = 'https://scanlibs.com/sitemap.xml'
sitemapContent = getContentFromURL(sitemapUrl)

# The first <url> entry is skipped.
# NOTE(review): presumably it is the site's front page, which does not have
# the article layout the XPath expressions expect - confirm.
jsonData = _as_list(xmltodict.parse(sitemapContent)['urlset']['url'])[1:]

for entry in jsonData:
    _report_entry(entry)