Fix #1: Parse XML natively with xml.etree.ElementTree instead of requiring xmltodict
This commit is contained in:
parent
0583199a16
commit
3ef42f5ca8
10
bookys.py
10
bookys.py
@ -1,4 +1,6 @@
|
||||
import requests, xmltodict
|
||||
import requests
|
||||
from lxml import html
|
||||
from xml.etree.ElementTree import fromstring
|
||||
|
||||
def getContentFromURL(url):
|
||||
return requests.get(url).text
|
||||
@ -10,9 +12,9 @@ for i in range(1, 203):
|
||||
print(i)
|
||||
url = f'https://ww9.bookys-ebooks.com/sitemaps/items{i}.xml'
|
||||
sitemapContent = getContentFromURL(url)
|
||||
jsonData = xmltodict.parse(sitemapContent)['urlset']['url'][1:]
|
||||
for entry in jsonData:
|
||||
loc = entry['loc']
|
||||
sitemapXML = fromstring(sitemapContent)
|
||||
for entry in sitemapXML:
|
||||
loc = entry.findtext('{*}loc')
|
||||
if loc.startswith('https://ww9.bookys-ebooks.com/livres/'):
|
||||
print(i, c)
|
||||
print(loc)
|
||||
|
23
scanlibs.py
23
scanlibs.py
@ -1,6 +1,6 @@
|
||||
import requests, re, xmltodict
|
||||
import requests
|
||||
from lxml import html
|
||||
#from xml.etree.ElementTree import fromstring
|
||||
from xml.etree.ElementTree import fromstring
|
||||
|
||||
def getContentFromURL(url):
|
||||
return requests.get(url).text
|
||||
@ -8,24 +8,13 @@ def getContentFromURL(url):
|
||||
sitemapUrl = 'https://scanlibs.com/sitemap.xml'
|
||||
|
||||
sitemapContent = getContentFromURL(sitemapUrl)
|
||||
"""
|
||||
sitemapXML = fromstring(sitemapContent)
|
||||
|
||||
for elem in sitemapXML.iter():
|
||||
for el in elem.iter():
|
||||
print(el.tag, ':', el.text)
|
||||
#print('!', elem.text, '!')
|
||||
|
||||
import xmltodict, json
|
||||
"""
|
||||
|
||||
jsonData = xmltodict.parse(sitemapContent)['urlset']['url'][1:]
|
||||
|
||||
for entry in jsonData:
|
||||
loc = entry['loc']
|
||||
for entry in sitemapXML.findall('{*}url')[1:]:
|
||||
loc = entry.findtext('{*}loc')
|
||||
print(loc)
|
||||
print(entry['lastmod'])
|
||||
print(entry['image:image']['image:loc'])
|
||||
print(entry.findtext('{*}lastmod'))
|
||||
print(entry.find('{*}image').findtext('{*}loc'))
|
||||
while True:
|
||||
content = getContentFromURL(loc)
|
||||
tree = html.fromstring(content)
|
||||
|
Loading…
Reference in New Issue
Block a user