# Webscraping_tests/scanlibs.py

import time
from xml.etree.ElementTree import fromstring

import requests
from lxml import html

def getContentFromURL(url, timeout=30):
    """Fetch *url* with an HTTP GET and return the response body as text.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` waits indefinitely, so one unresponsive
            host would stall the whole script.

    Returns:
        The response body decoded to ``str``. The HTTP status code is not
        checked, so the body of an error response (e.g. a 503 page) is
        returned like any other.

    Raises:
        requests.exceptions.RequestException: If the request fails or the
            timeout expires.
    """
    return requests.get(url, timeout=timeout).text

sitemapUrl = 'https://scanlibs.com/sitemap.xml'

# Upper bound on fetch attempts per page and the pause between them, so a
# persistent 503 can neither stall the run forever nor hammer the server.
maxAttempts = 5
retryDelaySeconds = 2

# Title the site serves while it is temporarily refusing requests.
unavailableTitle = '503 Service Temporarily Unavailable'

# Absolute XPath locations of the values printed for every page.
titleXPath = '/html/head/title'
metaXPath = '/html/body/div[1]/div/article/div/div/div/div/div[1]/p[1]'
descriptionXPath = '/html/body/div[1]/div/article/div/div/div/div/div[1]/p[2]'


def _firstText(tree, path):
    """Return the ``.text`` of the first node matching *path*, or None if nothing matches."""
    nodes = tree.xpath(path)
    return nodes[0].text if nodes else None


def _fetchPage(url):
    """Fetch and parse *url*, retrying while the 503 placeholder page is served.

    Returns:
        A ``(tree, title)`` tuple for the first response whose title is not
        the 503 placeholder, or ``(None, None)`` if all ``maxAttempts``
        attempts received the placeholder.
    """
    for attempt in range(1, maxAttempts + 1):
        tree = html.fromstring(getContentFromURL(url))
        title = _firstText(tree, titleXPath)
        # A missing/empty title cannot be the 503 page, so accept it as-is.
        if title is None or not title.startswith(unavailableTitle):
            return tree, title
        print('MISS')
        if attempt < maxAttempts:
            time.sleep(retryDelaySeconds)
    return None, None


sitemapContent = getContentFromURL(sitemapUrl)
sitemapXML = fromstring(sitemapContent)

# '{*}' matches the tag in any XML namespace. The first <url> entry is
# skipped (presumably the site root rather than an article -- TODO confirm).
for entry in sitemapXML.findall('{*}url')[1:]:
    loc = entry.findtext('{*}loc')
    print(loc)
    print(entry.findtext('{*}lastmod'))

    # Not every entry necessarily carries an <image> child; print None
    # instead of crashing on the missing element.
    image = entry.find('{*}image')
    print(image.findtext('{*}loc') if image is not None else None)

    if not loc:
        # Nothing to download for an entry without a location.
        print()
        continue

    tree, title = _fetchPage(loc)
    if tree is None:
        print('GAVE UP')
        print()
        continue

    print(title)
    print(_firstText(tree, metaXPath))
    print(_firstText(tree, descriptionXPath))
    print()