45 lines
1.2 KiB
Python
45 lines
1.2 KiB
Python
import re
import time
from xml.etree.ElementTree import XML, fromstring

import requests
import xmltodict
from lxml import html
|
|
|
|
def getContentFromURL(url, timeout=30):
    """Fetch *url* with an HTTP GET and return the response body as text.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so a single stalled
            connection would hang the whole script.

    Returns:
        The decoded response body (``Response.text``). The HTTP status code is
        not checked, so the body of an error page is returned like any other.

    Raises:
        requests.exceptions.RequestException: If the request fails or the
            timeout expires.
    """
    return requests.get(url, timeout=timeout).text
|
|
|
|
# Walk the entries of the scanlibs.com sitemap and print, for each one, its
# URL, last-modified value and image URL, followed by two fragments cut out of
# the page's HTML.
sitemapUrl = 'https://scanlibs.com/sitemap.xml'

# Seconds to pause before re-requesting a page that answered with the 503
# page, so an overloaded server is not hammered in a tight loop.
retryDelaySeconds = 5


def extractBetween(text, startMarker, endMarker):
    """Return the part of *text* between *startMarker* and *endMarker*.

    The result is the text that follows the first *startMarker*, cut at the
    next *startMarker* or *endMarker*, whichever comes first. If *endMarker*
    is absent the text runs on to that next *startMarker* or the end of
    *text*.

    Returns:
        The extracted fragment, or ``''`` when *startMarker* does not occur,
        so that one unexpected page does not abort the whole crawl.
    """
    pieces = text.split(startMarker)
    if len(pieces) < 2:
        return ''
    return pieces[1].split(endMarker)[0]


sitemapContent = getContentFromURL(sitemapUrl)

# xmltodict turns the sitemap into nested dicts. The first <url> entry is
# skipped (presumably the site root rather than a content page -- TODO confirm).
jsonData = xmltodict.parse(sitemapContent)['urlset']['url'][1:]

for entry in jsonData:
    loc = entry['loc']
    print(loc)
    print(entry['lastmod'])
    print(entry['image:image']['image:loc'])

    # Re-request the page until the server stops answering with its 503 page,
    # which is recognised by the page title.
    while True:
        content = getContentFromURL(loc)
        title = extractBetween(content, '<title>', '</title>')
        if not title.startswith('503 Service Temporarily Unavailable'):
            break
        print('MISS')
        print(title)
        print()
        time.sleep(retryDelaySeconds)

    # Text that follows this inline style attribute, up to the closing </p>.
    metaPattern = 'border-bottom:1px solid #f0f0f0">'
    meta = extractBetween(content, metaPattern, '</p>')
    print(meta)

    # Text between the first '"></span>' and the "rewlink" anchor.
    description = extractBetween(content, '"></span>', '<a class="rewlink"')
    print(description)
    print()