diff --git a/bookys.py b/bookys.py index 44d35a5..c21a9b8 100644 --- a/bookys.py +++ b/bookys.py @@ -1,4 +1,6 @@ -import requests, xmltodict +import requests +from lxml import html +from xml.etree.ElementTree import fromstring def getContentFromURL(url): return requests.get(url).text @@ -10,9 +12,9 @@ for i in range(1, 203): print(i) url = f'https://ww9.bookys-ebooks.com/sitemaps/items{i}.xml' sitemapContent = getContentFromURL(url) - jsonData = xmltodict.parse(sitemapContent)['urlset']['url'][1:] - for entry in jsonData: - loc = entry['loc'] + sitemapXML = fromstring(sitemapContent) + for entry in sitemapXML: + loc = entry.findtext('{*}loc') if loc.startswith('https://ww9.bookys-ebooks.com/livres/'): print(i, c) print(loc) diff --git a/scanlibs.py b/scanlibs.py index 4ba76d4..0c4c15a 100644 --- a/scanlibs.py +++ b/scanlibs.py @@ -1,6 +1,6 @@ -import requests, re, xmltodict +import requests from lxml import html -#from xml.etree.ElementTree import fromstring +from xml.etree.ElementTree import fromstring def getContentFromURL(url): return requests.get(url).text @@ -8,24 +8,13 @@ def getContentFromURL(url): sitemapUrl = 'https://scanlibs.com/sitemap.xml' sitemapContent = getContentFromURL(sitemapUrl) -""" sitemapXML = fromstring(sitemapContent) -for elem in sitemapXML.iter(): - for el in elem.iter(): - print(el.tag, ':', el.text) - #print('!', elem.text, '!') - -import xmltodict, json -""" - -jsonData = xmltodict.parse(sitemapContent)['urlset']['url'][1:] - -for entry in jsonData: - loc = entry['loc'] +for entry in sitemapXML.findall('{*}url')[1:]: + loc = entry.findtext('{*}loc') print(loc) - print(entry['lastmod']) - print(entry['image:image']['image:loc']) + print(entry.findtext('{*}lastmod')) + print(entry.find('{*}image').findtext('{*}loc')) while True: content = getContentFromURL(loc) tree = html.fromstring(content)