Use XPath instead of unstable splits

This commit is contained in:
Benjamin Loison 2023-05-01 23:30:57 +02:00
parent 1a7d5b424a
commit 0583199a16
Signed by: Benjamin_Loison
SSH Key Fingerprint: SHA256:BtnEgYTlHdOg1u+RmYcDE0mnfz1rhv5dSbQ2gyxW8B8
2 changed files with 12 additions and 14 deletions

View File

@ -1,24 +1,24 @@
import requests
import requests, xmltodict
def getContentFromURL(url, timeout=None):
    """Fetch *url* with an HTTP GET and return the response body as text.

    Args:
        url: Address to request.
        timeout: Number of seconds to wait for the server, forwarded to
            ``requests.get``. The default ``None`` places no limit on the
            wait, which is also what happens when no timeout is passed.

    Returns:
        The decoded response body (``Response.text``). The HTTP status
        code is not inspected, so the body of an error page is returned
        like any other body.
    """
    return requests.get(url, timeout=timeout).text
# Walk the numbered sitemap files `items{i}.xml`; for every entry whose `loc`
# is under `/livres/`, download that page and print its title.
# NOTE(review): this span appears to be taken from a diff view: some
# statements are present in two variants (two `for i` headers, two `title`
# assignments) and the nesting is not shown. Confirm which lines are live.
# `c` counts the `/livres/` entries processed so far.
c = 0
for i in range(36, 203):#1, 203):
# 36 is the first `i` value containing `livres`.
for i in range(1, 203):
print(i)
url = f'https://ww9.bookys-ebooks.com/sitemaps/items{i}.xml'
sitemapContent = getContentFromURL(url)
# `[1:]` drops the first `url` element of the parsed sitemap.
jsonData = xmltodict.parse(sitemapContent)['urlset']['url'][1:]
for entry in jsonData:
#print(entry)
loc = entry['loc']
if loc.startswith('https://ww9.bookys-ebooks.com/livres/'):
#print(entry)
print(i, c)
print(loc)
content = getContentFromURL(loc)
title = content.split('<title>')[1].split('\n')[0]
# NOTE(review): `html` is not brought in by the import lines visible for this
# file (only `requests` and `xmltodict`); presumably `from lxml import html`
# is needed here. Confirm.
tree = html.fromstring(content)
# The absolute XPath is tied to the page's exact layout: `[0]` raises
# IndexError when nothing matches. NOTE(review): `.text` can presumably be
# None when the <h1> starts with a child element, making `.strip()` fail.
title = tree.xpath('/html/body/div/div/div[2]/div[1]/div/div[2]/div[2]/h1')[0].text.strip()
print(title)
print()
c += 1

View File

@ -1,6 +1,6 @@
import requests, re
import requests, re, xmltodict
from lxml import html
from xml.etree.ElementTree import XML, fromstring
#from xml.etree.ElementTree import fromstring
def getContentFromURL(url, timeout=None):
    """Download *url* via HTTP GET and hand back the body as a string.

    Args:
        url: Address to request.
        timeout: Optional limit, in seconds, on how long to wait for the
            server; passed straight to ``requests.get``. ``None`` (the
            default) means no limit, the same as not passing a timeout.

    Returns:
        ``Response.text`` of the reply. No status check is performed, so
        an error response's body is returned unchanged for the caller to
        inspect.
    """
    return requests.get(url, timeout=timeout).text
@ -29,17 +29,15 @@ for entry in jsonData:
while True:
content = getContentFromURL(loc)
tree = html.fromstring(content)
title = content.split('<title>')[1].split('</title>')[0]
title = tree.xpath('/html/head/title')[0].text
if not title.startswith('503 Service Temporarily Unavailable'):
break
else:
print('MISS')
print(title)
print()
continue
metaPattern = 'border-bottom:1px solid #f0f0f0">'
meta = content.split(metaPattern)[1].split('</p>')[0]
meta = tree.xpath('/html/body/div[1]/div/article/div/div/div/div/div[1]/p[1]')[0].text
print(meta)
description = content.split('"></span>')[1].split('<a class="rewlink"')[0]
description = tree.xpath('/html/body/div[1]/div/article/div/div/div/div/div[1]/p[2]')[0].text
print(description)
print()
print()