Webscraping_tests/bookys.py

26 lines
701 B
Python

import requests
def getContentFromURL(url, timeout=30):
    """Fetch *url* with an HTTP GET and return the response body as text.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``. Without a timeout a single
            stalled connection would block the whole crawl indefinitely.

    Returns:
        The decoded response body (``Response.text``). The HTTP status
        code is not checked, so the body of an error page is returned
        like any other body.

    Raises:
        requests.exceptions.RequestException: On connection failures,
            including ``requests.exceptions.Timeout`` when the server does
            not answer within *timeout* seconds.
    """
    return requests.get(url, timeout=timeout).text
# Only sitemap entries under this prefix are book pages worth fetching.
BOOK_URL_PREFIX = 'https://ww9.bookys-ebooks.com/livres/'


def sitemapLocs(sitemapXml):
    """Return the ``<loc>`` text of every ``<url>`` entry in a sitemap.

    Args:
        sitemapXml: The sitemap document as a string.

    Returns:
        A list of location strings in document order, stripped of
        surrounding whitespace. ``<url>`` entries without a non-empty
        ``<loc>`` child are skipped. The result is always a list, even for
        sitemaps with zero or one entry.

    Raises:
        xml.etree.ElementTree.ParseError: If *sitemapXml* is not
            well-formed XML.
    """
    # Imported locally so this block does not depend on a file-level import.
    import xml.etree.ElementTree as ET

    def localName(tag):
        # ElementTree spells namespaced tags as '{namespace-uri}name';
        # compare on the bare name so any (or no) namespace is accepted.
        return tag.rsplit('}', 1)[-1]

    locs = []
    for urlNode in ET.fromstring(sitemapXml):
        if localName(urlNode.tag) != 'url':
            continue
        for child in urlNode:
            if localName(child.tag) == 'loc' and child.text:
                locs.append(child.text.strip())
                break
    return locs


def extractTitle(html):
    """Return the first line of the page's ``<title>`` text.

    Args:
        html: Page source as a string.

    Returns:
        The text following the first literal ``<title>`` tag, cut at the
        first newline or at ``</title>``, whichever comes first; ``None``
        when the page contains no ``<title>`` tag.
    """
    _, openingTag, rest = html.partition('<title>')
    if not openingTag:
        return None
    return rest.split('</title>', 1)[0].split('\n', 1)[0]


def main(sitemapIndexes=range(36, 203)):
    """Walk the numbered item sitemaps and print each book URL and title.

    For every index the sitemap ``items{index}.xml`` is downloaded; each
    listed location under ``BOOK_URL_PREFIX`` is fetched and its title
    printed together with the sitemap index and a running book counter.

    Args:
        sitemapIndexes: Iterable of sitemap numbers to process. The default
            starts at 36; a leftover comment in the original code shows the
            range once started at 1, so 36 is presumably a resume point
            (TODO confirm).
    """
    bookCount = 0
    for index in sitemapIndexes:
        print(index)
        sitemapUrl = f'https://ww9.bookys-ebooks.com/sitemaps/items{index}.xml'
        sitemapContent = getContentFromURL(sitemapUrl)
        # The first entry of every sitemap is skipped, as in the original
        # code; presumably it is not a book page (TODO confirm).
        for loc in sitemapLocs(sitemapContent)[1:]:
            if not loc.startswith(BOOK_URL_PREFIX):
                continue
            print(index, bookCount)
            print(loc)
            title = extractTitle(getContentFromURL(loc))
            # A page without a <title> must not abort a crawl that spans
            # well over a hundred sitemaps; report it and carry on.
            print(title if title is not None else '(no <title> found)')
            print()
            bookCount += 1


if __name__ == '__main__':
    main()