Webscraping_tests/scanlibs.py

33 lines
1014 B
Python
Raw Normal View History

import requests
2023-05-01 23:15:23 +02:00
from lxml import html
from xml.etree.ElementTree import fromstring
2023-05-01 23:15:23 +02:00
def getContentFromURL(url, timeout=30):
    """Download *url* with an HTTP GET and return the response body as text.

    The HTTP status code is deliberately not checked: the body of an error
    page is returned just like the body of a successful response, so callers
    can inspect it themselves.

    Args:
        url: Address to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits forever on a stalled connection.

    Returns:
        The decoded response body.

    Raises:
        requests.exceptions.RequestException: On connection problems or when
            the timeout expires.
    """
    return requests.get(url, timeout=timeout).text
import time  # used only for the pause between retries below

sitemapUrl = 'https://scanlibs.com/sitemap.xml'

# Title of the error page that can be returned instead of the real page; when
# it shows up the same URL is requested again.
UNAVAILABLE_TITLE = '503 Service Temporarily Unavailable'
# Seconds to wait before asking again for a page that answered with the 503
# page, so an overloaded server is not hit in a tight loop.
RETRY_DELAY_SECONDS = 2

TITLE_XPATH = '/html/head/title'
META_XPATH = '/html/body/div[1]/div/article/div/div/div/div/div[1]/p[1]'
DESCRIPTION_XPATH = '/html/body/div[1]/div/article/div/div/div/div/div[1]/p[2]'


def _firstText(tree, xpath):
    """Return ``.text`` of the first node matching *xpath*, or None if none match."""
    matches = tree.xpath(xpath)
    return matches[0].text if matches else None


sitemapContent = getContentFromURL(sitemapUrl)
sitemapXML = fromstring(sitemapContent)

# '{*}' matches the tag in any XML namespace.
# NOTE(review): the first <url> entry is skipped, as in the original code --
# presumably it is the site root rather than an article page; confirm.
for entry in sitemapXML.findall('{*}url')[1:]:
    loc = entry.findtext('{*}loc')

    print(loc)
    print(entry.findtext('{*}lastmod'))
    # A single path lookup yields None when the entry has no image child,
    # where chaining find().findtext() would raise AttributeError.
    print(entry.findtext('{*}image/{*}loc'))

    # Keep requesting the page until something other than the 503 error page
    # comes back.
    while True:
        content = getContentFromURL(loc)
        tree = html.fromstring(content)
        title = _firstText(tree, TITLE_XPATH)

        # A missing or empty <title> cannot be the 503 page, so treat it as a
        # real answer rather than crashing on None.startswith.
        if title is None or not title.startswith(UNAVAILABLE_TITLE):
            break
        print('MISS')
        time.sleep(RETRY_DELAY_SECONDS)

    print(title)

    # First and second paragraph of the article container; None is printed
    # when the page layout does not contain the expected node.
    meta = _firstText(tree, META_XPATH)
    print(meta)

    description = _firstText(tree, DESCRIPTION_XPATH)
    print(description)

    print()