commit 1a7d5b424aad29c8fd26c4822b27904a6ceaf5ad
Author: Benjamin Loison
Date: Mon May 1 23:15:23 2023 +0200
Add `{scanlibs,bookys}.py`
diff --git a/bookys.py b/bookys.py
new file mode 100644
index 0000000..0496f16
--- /dev/null
+++ b/bookys.py
@@ -0,0 +1,48 @@
+"""Crawl the bookys-ebooks sitemaps and print each `/livres/` page URL and title."""
+import requests
+import xmltodict
+
+
+def getContentFromURL(url):
+    """Download `url` with requests and return the response body as text."""
+    # NOTE(review): no timeout is passed, so a stalled server blocks the crawl.
+    return requests.get(url).text
+
+
+def _getTitle(content):
+    """Return the first line of the `<title>` element of `content`, or ''.
+
+    NOTE(review): the committed code called `content.split('')`, which always
+    raises `ValueError: empty separator`. `<title>` is assumed to be the
+    marker that was meant, since the result is stored in `title` -- TODO
+    confirm against a real page.
+    """
+    _, marker, rest = content.partition('<title>')
+    if not marker:
+        return ''
+    return rest.partition('</title>')[0].strip().split('\n')[0]
+
+
+# Number of `/livres/` pages printed so far.
+c = 0
+
+# The trailing comment shows the range once started at 1; 36 is presumably the
+# point where an earlier run stopped -- TODO confirm.
+for i in range(36, 203):#1, 203):
+    print(i)
+    url = f'https://ww9.bookys-ebooks.com/sitemaps/items{i}.xml'
+    sitemapContent = getContentFromURL(url)
+    # [1:] drops the first <url> entry of every sitemap.
+    jsonData = xmltodict.parse(sitemapContent)['urlset']['url'][1:]
+    for entry in jsonData:
+        loc = entry['loc']
+        if loc.startswith('https://ww9.bookys-ebooks.com/livres/'):
+            # NOTE(review): the nesting of this branch (including `c += 1`) was
+            # reconstructed; the indentation in this patch was flattened.
+            print(i, c)
+            print(loc)
+            content = getContentFromURL(loc)
+            title = _getTitle(content)
+            print(title)
+            print()
+            c += 1
diff --git a/scanlibs.py b/scanlibs.py
new file mode 100644
index 0000000..80b9836
--- /dev/null
+++ b/scanlibs.py
@@ -0,0 +1,45 @@
+import requests, re
+from lxml import html
+from xml.etree.ElementTree import XML, fromstring
+
+def getContentFromURL(url):  # GET `url` via requests and return the response body as text
+ return requests.get(url).text
+
+# xmltodict was only "imported" inside the disabled string block below, so the
+# parse call at the end of this section raised NameError; import it for real.
+import xmltodict, json
+
+sitemapUrl = 'https://scanlibs.com/sitemap.xml'
+sitemapContent = getContentFromURL(sitemapUrl)
+"""
+sitemapXML = fromstring(sitemapContent)
+for elem in sitemapXML.iter():
+ for el in elem.iter():
+ print(el.tag, ':', el.text)
+ #print('!', elem.text, '!')
+"""
+# [1:] drops the first <url> entry of the sitemap before it is iterated.
+jsonData = xmltodict.parse(sitemapContent)['urlset']['url'][1:]
+
+for entry in jsonData:
+ loc = entry['loc']
+ print(loc)
+ print(entry['lastmod'])
+ print(entry['image:image']['image:loc'])
+ while True:
+ content = getContentFromURL(loc)
+ tree = html.fromstring(content)
+ title = content.split('')[1].split('')[0]
+ if not title.startswith('503 Service Temporarily Unavailable'):
+ break
+ else:
+ print('MISS')
+ print(title)
+ print()
+ continue
+ metaPattern = 'border-bottom:1px solid #f0f0f0">'
+ meta = content.split(metaPattern)[1].split('
')[0]
+ print(meta)
+ description = content.split('">')[1].split('