Fix #1: Parse XML natively with xml.etree.ElementTree instead of requiring xmltodict
This commit is contained in:
parent
0583199a16
commit
3ef42f5ca8
10
bookys.py
10
bookys.py
@ -1,4 +1,6 @@
|
|||||||
import requests, xmltodict
|
import requests
|
||||||
|
from lxml import html
|
||||||
|
from xml.etree.ElementTree import fromstring
|
||||||
|
|
||||||
def getContentFromURL(url):
|
def getContentFromURL(url):
|
||||||
return requests.get(url).text
|
return requests.get(url).text
|
||||||
@ -10,9 +12,9 @@ for i in range(1, 203):
|
|||||||
print(i)
|
print(i)
|
||||||
url = f'https://ww9.bookys-ebooks.com/sitemaps/items{i}.xml'
|
url = f'https://ww9.bookys-ebooks.com/sitemaps/items{i}.xml'
|
||||||
sitemapContent = getContentFromURL(url)
|
sitemapContent = getContentFromURL(url)
|
||||||
jsonData = xmltodict.parse(sitemapContent)['urlset']['url'][1:]
|
sitemapXML = fromstring(sitemapContent)
|
||||||
for entry in jsonData:
|
for entry in sitemapXML:
|
||||||
loc = entry['loc']
|
loc = entry.findtext('{*}loc')
|
||||||
if loc.startswith('https://ww9.bookys-ebooks.com/livres/'):
|
if loc.startswith('https://ww9.bookys-ebooks.com/livres/'):
|
||||||
print(i, c)
|
print(i, c)
|
||||||
print(loc)
|
print(loc)
|
||||||
|
23
scanlibs.py
23
scanlibs.py
@ -1,6 +1,6 @@
|
|||||||
import requests, re, xmltodict
|
import requests
|
||||||
from lxml import html
|
from lxml import html
|
||||||
#from xml.etree.ElementTree import fromstring
|
from xml.etree.ElementTree import fromstring
|
||||||
|
|
||||||
def getContentFromURL(url):
|
def getContentFromURL(url):
|
||||||
return requests.get(url).text
|
return requests.get(url).text
|
||||||
@ -8,24 +8,13 @@ def getContentFromURL(url):
|
|||||||
sitemapUrl = 'https://scanlibs.com/sitemap.xml'
|
sitemapUrl = 'https://scanlibs.com/sitemap.xml'
|
||||||
|
|
||||||
sitemapContent = getContentFromURL(sitemapUrl)
|
sitemapContent = getContentFromURL(sitemapUrl)
|
||||||
"""
|
|
||||||
sitemapXML = fromstring(sitemapContent)
|
sitemapXML = fromstring(sitemapContent)
|
||||||
|
|
||||||
for elem in sitemapXML.iter():
|
for entry in sitemapXML.findall('{*}url')[1:]:
|
||||||
for el in elem.iter():
|
loc = entry.findtext('{*}loc')
|
||||||
print(el.tag, ':', el.text)
|
|
||||||
#print('!', elem.text, '!')
|
|
||||||
|
|
||||||
import xmltodict, json
|
|
||||||
"""
|
|
||||||
|
|
||||||
jsonData = xmltodict.parse(sitemapContent)['urlset']['url'][1:]
|
|
||||||
|
|
||||||
for entry in jsonData:
|
|
||||||
loc = entry['loc']
|
|
||||||
print(loc)
|
print(loc)
|
||||||
print(entry['lastmod'])
|
print(entry.findtext('{*}lastmod'))
|
||||||
print(entry['image:image']['image:loc'])
|
print(entry.find('{*}image').findtext('{*}loc'))
|
||||||
while True:
|
while True:
|
||||||
content = getContentFromURL(loc)
|
content = getContentFromURL(loc)
|
||||||
tree = html.fromstring(content)
|
tree = html.fromstring(content)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user