Webscraping_tests/scanlibs.py

44 lines
1.2 KiB
Python

import re
import time

import requests
import xmltodict
from lxml import html
#from xml.etree.ElementTree import fromstring
def getContentFromURL(url, timeout=30):
    """Fetch *url* with an HTTP GET and return the response body as text.

    No status check is performed: the body is returned whatever the HTTP
    status code is, so an error page (for example a 503 body) comes back
    like any other response.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before ``requests`` raises
            ``requests.exceptions.Timeout``. Pass ``None`` to wait
            indefinitely, which is what this function did before the
            parameter existed.

    Returns:
        The decoded response body (``Response.text``).
    """
    # Without an explicit timeout requests never gives up on a stalled
    # connection, which would freeze the whole scrape on one bad server.
    return requests.get(url, timeout=timeout).text
# Scrape driver: download the sitemap, then visit every listed page and
# print its sitemap fields, its <title>, and the text of the first two
# paragraphs found at the article XPaths below.

sitemapUrl = 'https://scanlibs.com/sitemap.xml'
sitemapContent = getContentFromURL(sitemapUrl)

# Retry policy for pages that answer with the 503 placeholder page.
# The attempts are bounded and spaced out so a persistently unavailable
# page can neither stall the run forever nor flood the server.
_MAX_ATTEMPTS = 5
_RETRY_DELAY_SECONDS = 2

_UNAVAILABLE_TITLE = '503 Service Temporarily Unavailable'
_TITLE_XPATH = '/html/head/title'
_META_XPATH = '/html/body/div[1]/div/article/div/div/div/div/div[1]/p[1]'
_DESCRIPTION_XPATH = '/html/body/div[1]/div/article/div/div/div/div/div[1]/p[2]'


def _firstText(tree, path):
    """Return the text of the first node matching *path*, or None.

    None is returned both when nothing matches and when the matched
    element has no leading text, so callers never hit an IndexError on
    pages whose layout differs from the expected one.
    """
    nodes = tree.xpath(path)
    return nodes[0].text if nodes else None


# NOTE(review): the first <url> entry is skipped. Presumably it is the site
# root, which would lack the lastmod/image fields read below -- confirm
# against the live sitemap.
jsonData = xmltodict.parse(sitemapContent)['urlset']['url'][1:]
for entry in jsonData:
    loc = entry['loc']
    print(loc)
    print(entry['lastmod'])
    print(entry['image:image']['image:loc'])

    for attempt in range(1, _MAX_ATTEMPTS + 1):
        content = getContentFromURL(loc)
        tree = html.fromstring(content)
        title = _firstText(tree, _TITLE_XPATH)
        # An absent or empty <title> is treated as "not a 503 page".
        if not (title or '').startswith(_UNAVAILABLE_TITLE):
            break
        print('MISS')
        if attempt < _MAX_ATTEMPTS:
            # Linear backoff: wait a little longer after each miss.
            time.sleep(_RETRY_DELAY_SECONDS * attempt)
    else:
        # Every attempt returned the 503 page; skip this entry instead of
        # retrying forever.
        print('GIVING UP')
        print()
        continue

    print(title)
    meta = _firstText(tree, _META_XPATH)
    print(meta)
    description = _firstText(tree, _DESCRIPTION_XPATH)
    print(description)
    print()