2023-05-02 21:56:02 +02:00
|
|
|
import requests
|
|
|
|
from lxml import html
|
|
|
|
from xml.etree.ElementTree import fromstring
|
2023-05-01 23:15:23 +02:00
|
|
|
|
|
|
|
def getContentFromURL(url, timeout=30):
    """Fetch `url` with an HTTP GET and return the response body as text.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout `requests` waits indefinitely, so a single stalled
            connection would hang the caller forever.

    Returns:
        The decoded response body (`Response.text`). The HTTP status code
        is not checked, so an error page is returned like any other body.

    Raises:
        requests.exceptions.RequestException: on connection failure or
            when the timeout expires.
    """
    return requests.get(url, timeout=timeout).text
|
|
|
|
|
|
|
|
# Number of `/livres/` pages whose title has been printed so far.
c = 0

# Walk every sitemap file in turn. 36 is the first `i` value containing
# `livres`; the earlier sitemaps are still fetched and scanned.
for i in range(1, 203):
    print(i)
    url = f'https://ww9.bookys-ebooks.com/sitemaps/items{i}.xml'
    sitemapContent = getContentFromURL(url)
    # NOTE(review): xml.etree is not hardened against maliciously crafted
    # XML; acceptable only as long as the sitemap host is trusted.
    sitemapXML = fromstring(sitemapContent)
    for entry in sitemapXML:
        # `{*}` matches `loc` in any XML namespace. findtext() returns None
        # when the entry has no such child, so check before startswith().
        loc = entry.findtext('{*}loc')
        if loc is None or not loc.startswith('https://ww9.bookys-ebooks.com/livres/'):
            continue
        print(i, c)
        print(loc)
        content = getContentFromURL(loc)
        tree = html.fromstring(content)
        # The absolute XPath is tied to one page layout: xpath() yields an
        # empty list when the layout differs, and `.text` is None when the
        # <h1> has no leading text. Report such pages and move on instead
        # of aborting the whole crawl; they are not counted in `c`.
        headings = tree.xpath('/html/body/div/div/div[2]/div[1]/div/div[2]/div[2]/h1')
        if not headings or headings[0].text is None:
            print('(title not found)')
            print()
            continue
        title = headings[0].text.strip()
        print(title)
        print()
        c += 1
|
|
|
|
|