Compare commits

..

No commits in common. "3ef42f5ca826a74ce5f55f62a69c16917eadf68e" and "1a7d5b424aad29c8fd26c4822b27904a6ceaf5ad" have entirely different histories.

2 changed files with 32 additions and 21 deletions

View File

@ -1,26 +1,24 @@
import requests import requests
from lxml import html
from xml.etree.ElementTree import fromstring
def getContentFromURL(url): def getContentFromURL(url):
return requests.get(url).text return requests.get(url).text
c = 0 c = 0
# 36 is the first `i` value containing `livres`. for i in range(36, 203):#1, 203):
for i in range(1, 203):
print(i) print(i)
url = f'https://ww9.bookys-ebooks.com/sitemaps/items{i}.xml' url = f'https://ww9.bookys-ebooks.com/sitemaps/items{i}.xml'
sitemapContent = getContentFromURL(url) sitemapContent = getContentFromURL(url)
sitemapXML = fromstring(sitemapContent) jsonData = xmltodict.parse(sitemapContent)['urlset']['url'][1:]
for entry in sitemapXML: for entry in jsonData:
loc = entry.findtext('{*}loc') #print(entry)
loc = entry['loc']
if loc.startswith('https://ww9.bookys-ebooks.com/livres/'): if loc.startswith('https://ww9.bookys-ebooks.com/livres/'):
#print(entry)
print(i, c) print(i, c)
print(loc) print(loc)
content = getContentFromURL(loc) content = getContentFromURL(loc)
tree = html.fromstring(content) title = content.split('<title>')[1].split('\n')[0]
title = tree.xpath('/html/body/div/div/div[2]/div[1]/div/div[2]/div[2]/h1')[0].text.strip()
print(title) print(title)
print() print()
c += 1 c += 1

View File

@ -1,6 +1,6 @@
import requests import requests, re
from lxml import html from lxml import html
from xml.etree.ElementTree import fromstring from xml.etree.ElementTree import XML, fromstring
def getContentFromURL(url): def getContentFromURL(url):
return requests.get(url).text return requests.get(url).text
@ -8,25 +8,38 @@ def getContentFromURL(url):
sitemapUrl = 'https://scanlibs.com/sitemap.xml' sitemapUrl = 'https://scanlibs.com/sitemap.xml'
sitemapContent = getContentFromURL(sitemapUrl) sitemapContent = getContentFromURL(sitemapUrl)
"""
sitemapXML = fromstring(sitemapContent) sitemapXML = fromstring(sitemapContent)
for entry in sitemapXML.findall('{*}url')[1:]: for elem in sitemapXML.iter():
loc = entry.findtext('{*}loc') for el in elem.iter():
print(el.tag, ':', el.text)
#print('!', elem.text, '!')
import xmltodict, json
"""
jsonData = xmltodict.parse(sitemapContent)['urlset']['url'][1:]
for entry in jsonData:
loc = entry['loc']
print(loc) print(loc)
print(entry.findtext('{*}lastmod')) print(entry['lastmod'])
print(entry.find('{*}image').findtext('{*}loc')) print(entry['image:image']['image:loc'])
while True: while True:
content = getContentFromURL(loc) content = getContentFromURL(loc)
tree = html.fromstring(content) tree = html.fromstring(content)
title = tree.xpath('/html/head/title')[0].text title = content.split('<title>')[1].split('</title>')[0]
if not title.startswith('503 Service Temporarily Unavailable'): if not title.startswith('503 Service Temporarily Unavailable'):
break break
else: else:
print('MISS') print('MISS')
print(title) print(title)
meta = tree.xpath('/html/body/div[1]/div/article/div/div/div/div/div[1]/p[1]')[0].text print()
continue
metaPattern = 'border-bottom:1px solid #f0f0f0">'
meta = content.split(metaPattern)[1].split('</p>')[0]
print(meta) print(meta)
description = tree.xpath('/html/body/div[1]/div/article/div/div/div/div/div[1]/p[2]')[0].text description = content.split('"></span>')[1].split('<a class="rewlink"')[0]
print(description) print(description)
print() print()