From 1a7d5b424aad29c8fd26c4822b27904a6ceaf5ad Mon Sep 17 00:00:00 2001
From: Benjamin Loison
Date: Mon, 1 May 2023 23:15:23 +0200
Subject: [PATCH] Add `{scanlibs,bookys}.py`

---
Note: the bookys.py hunk below was edited after export (explicit
`import xmltodict`, restored title separator, comments), so its index blob
hash (0496f16) no longer matches the hunk content.

 bookys.py   | 33 +++++++++++++++++++++++++++++++++
 scanlibs.py | 45 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 78 insertions(+)
 create mode 100644 bookys.py
 create mode 100644 scanlibs.py

diff --git a/bookys.py b/bookys.py
new file mode 100644
index 0000000..0496f16
--- /dev/null
+++ b/bookys.py
@@ -0,0 +1,33 @@
+import requests
+import xmltodict
+
+def getContentFromURL(url):
+    """Return the response body of `url` as text."""
+    return requests.get(url).text
+
+# Number of matching pages processed so far.
+c = 0
+
+for i in range(36, 203):#1, 203):
+    print(i)
+    url = f'https://ww9.bookys-ebooks.com/sitemaps/items{i}.xml'
+    sitemapContent = getContentFromURL(url)
+    # The first `url` entry of each sitemap is skipped.
+    jsonData = xmltodict.parse(sitemapContent)['urlset']['url'][1:]
+    for entry in jsonData:
+        #print(entry)
+        loc = entry['loc']
+        # Only URLs under /livres/ are fetched; other entries are ignored.
+        if loc.startswith('https://ww9.bookys-ebooks.com/livres/'):
+            #print(entry)
+            print(i, c)
+            print(loc)
+            content = getContentFromURL(loc)
+            # NOTE(review): the separator reached us as '' (which makes
+            # str.split raise ValueError); '<title>' is restored by analogy
+            # with scanlibs.py - confirm against the real page markup.
+            title = content.split('<title>')[1].split('\n')[0]
+            print(title)
+            print()
+            c += 1
+
diff --git a/scanlibs.py b/scanlibs.py new file mode 100644 index 0000000..80b9836 --- /dev/null +++ b/scanlibs.py @@ -0,0 +1,45 @@ +import requests, re +from lxml import html +from xml.etree.ElementTree import XML, fromstring + +def getContentFromURL(url): + return requests.get(url).text + +sitemapUrl = 'https://scanlibs.com/sitemap.xml' + +sitemapContent = getContentFromURL(sitemapUrl) +""" +sitemapXML = fromstring(sitemapContent) + +for elem in sitemapXML.iter(): + for el in elem.iter(): + print(el.tag, ':', el.text) + #print('!', elem.text, '!') + +import xmltodict, json +""" + +jsonData = xmltodict.parse(sitemapContent)['urlset']['url'][1:] + +for entry in jsonData: + loc = entry['loc'] + print(loc) + print(entry['lastmod']) + print(entry['image:image']['image:loc']) + while True: + content = getContentFromURL(loc) + tree = html.fromstring(content) + title = 
content.split('<title>')[1].split('')[0] + if not title.startswith('503 Service Temporarily Unavailable'): + break + else: + print('MISS') + print(title) + print() + continue + metaPattern = 'border-bottom:1px solid #f0f0f0">' + meta = content.split(metaPattern)[1].split('

')[0] + print(meta) + description = content.split('">')[1].split('