Fix #1: Use native XML parsing without requiring xmltodict

This commit is contained in:
Benjamin Loison 2023-05-02 21:56:02 +02:00
parent 0583199a16
commit 3ef42f5ca8
Signed by: Benjamin_Loison
SSH Key Fingerprint: SHA256:BtnEgYTlHdOg1u+RmYcDE0mnfz1rhv5dSbQ2gyxW8B8
2 changed files with 12 additions and 21 deletions

View File

@@ -1,4 +1,6 @@
import requests, xmltodict import requests
from lxml import html
from xml.etree.ElementTree import fromstring
def getContentFromURL(url): def getContentFromURL(url):
return requests.get(url).text return requests.get(url).text
@@ -10,9 +12,9 @@ for i in range(1, 203):
print(i) print(i)
url = f'https://ww9.bookys-ebooks.com/sitemaps/items{i}.xml' url = f'https://ww9.bookys-ebooks.com/sitemaps/items{i}.xml'
sitemapContent = getContentFromURL(url) sitemapContent = getContentFromURL(url)
jsonData = xmltodict.parse(sitemapContent)['urlset']['url'][1:] sitemapXML = fromstring(sitemapContent)
for entry in jsonData: for entry in sitemapXML:
loc = entry['loc'] loc = entry.findtext('{*}loc')
if loc.startswith('https://ww9.bookys-ebooks.com/livres/'): if loc.startswith('https://ww9.bookys-ebooks.com/livres/'):
print(i, c) print(i, c)
print(loc) print(loc)

View File

@@ -1,6 +1,6 @@
import requests, re, xmltodict import requests
from lxml import html from lxml import html
#from xml.etree.ElementTree import fromstring from xml.etree.ElementTree import fromstring
def getContentFromURL(url): def getContentFromURL(url):
return requests.get(url).text return requests.get(url).text
@@ -8,24 +8,13 @@ def getContentFromURL(url):
sitemapUrl = 'https://scanlibs.com/sitemap.xml' sitemapUrl = 'https://scanlibs.com/sitemap.xml'
sitemapContent = getContentFromURL(sitemapUrl) sitemapContent = getContentFromURL(sitemapUrl)
"""
sitemapXML = fromstring(sitemapContent) sitemapXML = fromstring(sitemapContent)
for elem in sitemapXML.iter(): for entry in sitemapXML.findall('{*}url')[1:]:
for el in elem.iter(): loc = entry.findtext('{*}loc')
print(el.tag, ':', el.text)
#print('!', elem.text, '!')
import xmltodict, json
"""
jsonData = xmltodict.parse(sitemapContent)['urlset']['url'][1:]
for entry in jsonData:
loc = entry['loc']
print(loc) print(loc)
print(entry['lastmod']) print(entry.findtext('{*}lastmod'))
print(entry['image:image']['image:loc']) print(entry.find('{*}image').findtext('{*}loc'))
while True: while True:
content = getContentFromURL(loc) content = getContentFromURL(loc)
tree = html.fromstring(content) tree = html.fromstring(content)