Use XPath instead of unstable splits
This commit is contained in:
parent
1a7d5b424a
commit
0583199a16
10
bookys.py
10
bookys.py
@@ -1,24 +1,24 @@
|
||||
import requests
|
||||
import requests, xmltodict
|
||||
|
||||
def getContentFromURL(url):
|
||||
return requests.get(url).text
|
||||
|
||||
c = 0
|
||||
|
||||
for i in range(36, 203):#1, 203):
|
||||
# 36 is the first `i` value containing `livres`.
|
||||
for i in range(1, 203):
|
||||
print(i)
|
||||
url = f'https://ww9.bookys-ebooks.com/sitemaps/items{i}.xml'
|
||||
sitemapContent = getContentFromURL(url)
|
||||
jsonData = xmltodict.parse(sitemapContent)['urlset']['url'][1:]
|
||||
for entry in jsonData:
|
||||
#print(entry)
|
||||
loc = entry['loc']
|
||||
if loc.startswith('https://ww9.bookys-ebooks.com/livres/'):
|
||||
#print(entry)
|
||||
print(i, c)
|
||||
print(loc)
|
||||
content = getContentFromURL(loc)
|
||||
title = content.split('<title>')[1].split('\n')[0]
|
||||
tree = html.fromstring(content)
|
||||
title = tree.xpath('/html/body/div/div/div[2]/div[1]/div/div[2]/div[2]/h1')[0].text.strip()
|
||||
print(title)
|
||||
print()
|
||||
c += 1
|
||||
|
16
scanlibs.py
16
scanlibs.py
@@ -1,6 +1,6 @@
|
||||
import requests, re
|
||||
import requests, re, xmltodict
|
||||
from lxml import html
|
||||
from xml.etree.ElementTree import XML, fromstring
|
||||
#from xml.etree.ElementTree import fromstring
|
||||
|
||||
def getContentFromURL(url):
|
||||
return requests.get(url).text
|
||||
@@ -29,17 +29,15 @@ for entry in jsonData:
|
||||
while True:
|
||||
content = getContentFromURL(loc)
|
||||
tree = html.fromstring(content)
|
||||
title = content.split('<title>')[1].split('</title>')[0]
|
||||
title = tree.xpath('/html/head/title')[0].text
|
||||
if not title.startswith('503 Service Temporarily Unavailable'):
|
||||
break
|
||||
else:
|
||||
print('MISS')
|
||||
print(title)
|
||||
print()
|
||||
continue
|
||||
metaPattern = 'border-bottom:1px solid #f0f0f0">'
|
||||
meta = content.split(metaPattern)[1].split('</p>')[0]
|
||||
meta = tree.xpath('/html/body/div[1]/div/article/div/div/div/div/div[1]/p[1]')[0].text
|
||||
print(meta)
|
||||
description = content.split('"></span>')[1].split('<a class="rewlink"')[0]
|
||||
description = tree.xpath('/html/body/div[1]/div/article/div/div/div/div/div[1]/p[2]')[0].text
|
||||
print(description)
|
||||
print()
|
||||
print()
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user