import requests, re from lxml import html from xml.etree.ElementTree import XML, fromstring def getContentFromURL(url): return requests.get(url).text sitemapUrl = 'https://scanlibs.com/sitemap.xml' sitemapContent = getContentFromURL(sitemapUrl) """ sitemapXML = fromstring(sitemapContent) for elem in sitemapXML.iter(): for el in elem.iter(): print(el.tag, ':', el.text) #print('!', elem.text, '!') import xmltodict, json """ jsonData = xmltodict.parse(sitemapContent)['urlset']['url'][1:] for entry in jsonData: loc = entry['loc'] print(loc) print(entry['lastmod']) print(entry['image:image']['image:loc']) while True: content = getContentFromURL(loc) tree = html.fromstring(content) title = content.split('')[1].split('')[0] if not title.startswith('503 Service Temporarily Unavailable'): break else: print('MISS') print(title) print() continue metaPattern = 'border-bottom:1px solid #f0f0f0">' meta = content.split(metaPattern)[1].split('

')[0] print(meta) description = content.split('">')[1].split('