diff --git a/bookys.py b/bookys.py
index 0496f16..44d35a5 100644
--- a/bookys.py
+++ b/bookys.py
@@ -1,24 +1,25 @@
-import requests
+import requests, xmltodict
+from lxml import html
def getContentFromURL(url):
return requests.get(url).text
c = 0
-for i in range(36, 203):#1, 203):
+# Sitemap index 36 is the first `i` whose entries include `/livres/` URLs; earlier indexes are still fetched and filtered out below.
+for i in range(1, 203):
print(i)
url = f'https://ww9.bookys-ebooks.com/sitemaps/items{i}.xml'
sitemapContent = getContentFromURL(url)
jsonData = xmltodict.parse(sitemapContent)['urlset']['url'][1:]
for entry in jsonData:
- #print(entry)
loc = entry['loc']
if loc.startswith('https://ww9.bookys-ebooks.com/livres/'):
- #print(entry)
print(i, c)
print(loc)
content = getContentFromURL(loc)
- title = content.split('
')[1].split('\n')[0]
+ tree = html.fromstring(content)
+ title = tree.xpath('/html/body/div/div/div[2]/div[1]/div/div[2]/div[2]/h1')[0].text.strip()
print(title)
print()
c += 1
diff --git a/scanlibs.py b/scanlibs.py
index 80b9836..4ba76d4 100644
--- a/scanlibs.py
+++ b/scanlibs.py
@@ -1,6 +1,6 @@
-import requests, re
+import requests, re, xmltodict
from lxml import html
-from xml.etree.ElementTree import XML, fromstring
+#from xml.etree.ElementTree import fromstring
def getContentFromURL(url):
return requests.get(url).text
@@ -29,17 +29,15 @@ for entry in jsonData:
while True:
content = getContentFromURL(loc)
tree = html.fromstring(content)
- title = content.split('<title>')[1].split('</title>')[0]
+ title = tree.xpath('/html/head/title')[0].text
if not title.startswith('503 Service Temporarily Unavailable'):
break
else:
print('MISS')
print(title)
- print()
- continue
- metaPattern = 'border-bottom:1px solid #f0f0f0">'
- meta = content.split(metaPattern)[1].split('')[0]
+ meta = tree.xpath('/html/body/div[1]/div/article/div/div/div/div/div[1]/p[1]')[0].text
print(meta)
- description = content.split('">')[1].split('