2023-05-02 21:56:02 +02:00
|
|
|
import time
from xml.etree.ElementTree import fromstring

import requests
from lxml import html
|
2023-05-01 23:15:23 +02:00
|
|
|
|
|
|
|
def getContentFromURL(url, timeout=30):
    """Fetch *url* with an HTTP GET and return the response body as text.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``. ``None`` waits indefinitely.

    Returns:
        The decoded response body (``Response.text``). The HTTP status code
        is not checked, so an error page is returned like any other body.

    Raises:
        requests.exceptions.RequestException: If the connection fails or the
            timeout expires.
    """
    # Without a timeout, requests can block forever on a stalled connection.
    return requests.get(url, timeout=timeout).text
|
|
|
|
|
|
|
|
# Title prefix of the page served while the site is temporarily unavailable.
UNAVAILABLE_TITLE = '503 Service Temporarily Unavailable'
# Seconds to pause before re-requesting a page that answered with a 503.
RETRY_DELAY_SECONDS = 5
# Absolute paths of the two paragraphs printed for every page.
META_XPATH = '/html/body/div[1]/div/article/div/div/div/div/div[1]/p[1]'
DESCRIPTION_XPATH = '/html/body/div[1]/div/article/div/div/div/div/div[1]/p[2]'


def _first_text(tree, path):
    """Return the text of the first node matching *path*, or None if none match."""
    nodes = tree.xpath(path)
    return nodes[0].text if nodes else None


sitemapUrl = 'https://scanlibs.com/sitemap.xml'
sitemapContent = getContentFromURL(sitemapUrl)
sitemapXML = fromstring(sitemapContent)

# The '{*}' prefix matches the tag in any XML namespace.
# NOTE(review): the first <url> entry is skipped on purpose by the original
# script; presumably it is not a regular page - confirm.
for entry in sitemapXML.findall('{*}url')[1:]:
    loc = entry.findtext('{*}loc')
    print(loc)
    print(entry.findtext('{*}lastmod'))

    # Not every entry necessarily carries an <image> child; print None for
    # those instead of crashing on a None element.
    image = entry.find('{*}image')
    print(image.findtext('{*}loc') if image is not None else None)

    # Re-request the page until it stops answering with the 503 placeholder.
    while True:
        content = getContentFromURL(loc)
        tree = html.fromstring(content)
        title = _first_text(tree, '/html/head/title')
        # A missing or empty <title> is treated as "not a 503 page".
        if not (title or '').startswith(UNAVAILABLE_TITLE):
            break
        print('MISS')
        # Give the server a moment to recover before trying again.
        time.sleep(RETRY_DELAY_SECONDS)

    print(title)
    meta = _first_text(tree, META_XPATH)
    print(meta)
    description = _first_text(tree, DESCRIPTION_XPATH)
    print(description)
    print()
|
|
|
|
|