From 0583199a16cad984891563c384b43bc2a18e919c Mon Sep 17 00:00:00 2001 From: Benjamin Loison Date: Mon, 1 May 2023 23:30:57 +0200 Subject: [PATCH] Use `XPath` instead of unstable `split`s --- bookys.py | 10 +++++----- scanlibs.py | 16 +++++++--------- 2 files changed, 12 insertions(+), 14 deletions(-) diff --git a/bookys.py b/bookys.py index 0496f16..44d35a5 100644 --- a/bookys.py +++ b/bookys.py @@ -1,24 +1,24 @@ -import requests +import requests, xmltodict def getContentFromURL(url): return requests.get(url).text c = 0 -for i in range(36, 203):#1, 203): +# 36 is the first `i` value containing `livres`. +for i in range(1, 203): print(i) url = f'https://ww9.bookys-ebooks.com/sitemaps/items{i}.xml' sitemapContent = getContentFromURL(url) jsonData = xmltodict.parse(sitemapContent)['urlset']['url'][1:] for entry in jsonData: - #print(entry) loc = entry['loc'] if loc.startswith('https://ww9.bookys-ebooks.com/livres/'): - #print(entry) print(i, c) print(loc) content = getContentFromURL(loc) - title = content.split('')[1].split('\n')[0] + tree = html.fromstring(content) + title = tree.xpath('/html/body/div/div/div[2]/div[1]/div/div[2]/div[2]/h1')[0].text.strip() print(title) print() c += 1 diff --git a/scanlibs.py b/scanlibs.py index 80b9836..4ba76d4 100644 --- a/scanlibs.py +++ b/scanlibs.py @@ -1,6 +1,6 @@ -import requests, re +import requests, re, xmltodict from lxml import html -from xml.etree.ElementTree import XML, fromstring +#from xml.etree.ElementTree import fromstring def getContentFromURL(url): return requests.get(url).text @@ -29,17 +29,15 @@ for entry in jsonData: while True: content = getContentFromURL(loc) tree = html.fromstring(content) - title = content.split('<title>')[1].split('')[0] + title = tree.xpath('/html/head/title')[0].text if not title.startswith('503 Service Temporarily Unavailable'): break else: print('MISS') print(title) - print() - continue - metaPattern = 'border-bottom:1px solid #f0f0f0">' - meta = content.split(metaPattern)[1].split('

')[0] + meta = tree.xpath('/html/body/div[1]/div/article/div/div/div/div/div[1]/p[1]')[0].text print(meta) - description = content.split('">')[1].split('