# Webscraping_tests/bookys.py

import requests
import xmltodict
from lxml import html

def getContentFromURL(url, timeout=30):
    """Download *url* with an HTTP GET and return the response body as text.

    Args:
        url: Address of the resource to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on a stalled
            connection, which would freeze the whole crawl.

    Returns:
        The decoded response body (``Response.text``).

    Raises:
        requests.RequestException: On connection failure, timeout, or when
            the server answers with a 4xx/5xx status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail here on HTTP errors instead of handing an error page to the
    # XML/HTML parsers further down.
    response.raise_for_status()
    return response.text

# Only sitemap entries under this prefix are book pages worth fetching.
BOOK_URL_PREFIX = 'https://ww9.bookys-ebooks.com/livres/'
# Absolute path to the <h1> holding the book title on a book page.
TITLE_XPATH = '/html/body/div/div/div[2]/div[1]/div/div[2]/div[2]/h1'

# Running count of book pages handled so far; printed next to the sitemap
# number for every hit.
c = 0

# Sitemaps are numbered items1.xml .. items202.xml.
# 36 is the first `i` value containing `livres`; the earlier sitemaps are
# still downloaded but yield no matching URLs.
for i in range(1, 203):
    print(i)
    url = f'https://ww9.bookys-ebooks.com/sitemaps/items{i}.xml'
    sitemapContent = getContentFromURL(url)
    # force_list keeps `url` a list even when the sitemap holds a single
    # <url> element; xmltodict would otherwise return a bare dict, and
    # slicing that raises.
    parsed = xmltodict.parse(sitemapContent, force_list=('url',))
    # An empty <urlset/> parses to None, so fall back to "no entries".
    # A document with a different root still raises KeyError, as before.
    entries = (parsed['urlset'] or {}).get('url', [])
    # NOTE(review): the first <url> entry is skipped, exactly as the
    # original did -- presumably it is not a book page; confirm.
    jsonData = entries[1:]
    for entry in jsonData:
        loc = entry['loc']
        if not loc.startswith(BOOK_URL_PREFIX):
            continue
        print(i, c)
        print(loc)
        content = getContentFromURL(loc)
        tree = html.fromstring(content)
        headings = tree.xpath(TITLE_XPATH)
        if headings:
            # `.text` is None when the <h1> starts with a child element.
            title = (headings[0].text or '').strip()
        else:
            # Page layout differs from the expected one; keep crawling
            # instead of dying with an IndexError.
            title = '<title not found>'
        print(title)
        print()
        c += 1