# Webscraping_tests/bookys.py
# (28 lines, 834 B, Python)

import requests
from lxml import html
from xml.etree.ElementTree import fromstring
# History-view timestamp (extraction residue): 2023-05-01 23:15:23 +02:00
def getContentFromURL(url, timeout=30):
    """Fetch *url* with an HTTP GET and return the response body as text.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up. It is
            forwarded to ``requests.get``; without it a stalled server
            would block the script forever.

    Returns:
        The decoded response body (``Response.text``).

    Raises:
        requests.exceptions.Timeout: If the server does not answer within
            ``timeout`` seconds.
    """
    return requests.get(url, timeout=timeout).text
# Running count of book (`livres`) URLs visited so far.
c = 0
# 36 is the first `i` value containing `livres`; earlier sitemaps are still
# fetched because the range starts at 1.
for i in range(1, 203):
    print(i)
    url = f'https://ww9.bookys-ebooks.com/sitemaps/items{i}.xml'
    sitemapContent = getContentFromURL(url)
    sitemapXML = fromstring(sitemapContent)
    for entry in sitemapXML:
        # `{*}` matches the <loc> child whatever XML namespace the sitemap uses.
        loc = entry.findtext('{*}loc')
        # findtext returns None when the entry has no <loc> child; skip such
        # entries instead of crashing on `None.startswith`.
        if loc is None:
            continue
        if not loc.startswith('https://ww9.bookys-ebooks.com/livres/'):
            continue
        print(i, c)
        print(loc)
        content = getContentFromURL(loc)
        tree = html.fromstring(content)
        headings = tree.xpath('/html/body/div/div/div[2]/div[1]/div/div[2]/div[2]/h1')
        # The absolute XPath only matches pages with the expected layout, and
        # `.text` is None for an element with no leading text; guard both so
        # one odd page does not abort the whole crawl.
        if headings and headings[0].text is not None:
            print(headings[0].text.strip())
        else:
            print('(no title found)')
        print()
        c += 1