Compare commits
2 Commits
1a7d5b424a
...
3ef42f5ca8
Author | SHA1 | Date | |
---|---|---|---|
3ef42f5ca8 | |||
0583199a16 |
16
bookys.py
16
bookys.py
@ -1,24 +1,26 @@
|
|||||||
import requests
|
import requests
|
||||||
|
from lxml import html
|
||||||
|
from xml.etree.ElementTree import fromstring
|
||||||
|
|
||||||
def getContentFromURL(url):
|
def getContentFromURL(url):
|
||||||
return requests.get(url).text
|
return requests.get(url).text
|
||||||
|
|
||||||
c = 0
|
c = 0
|
||||||
|
|
||||||
for i in range(36, 203):#1, 203):
|
# 36 is the first `i` value containing `livres`.
|
||||||
|
for i in range(1, 203):
|
||||||
print(i)
|
print(i)
|
||||||
url = f'https://ww9.bookys-ebooks.com/sitemaps/items{i}.xml'
|
url = f'https://ww9.bookys-ebooks.com/sitemaps/items{i}.xml'
|
||||||
sitemapContent = getContentFromURL(url)
|
sitemapContent = getContentFromURL(url)
|
||||||
jsonData = xmltodict.parse(sitemapContent)['urlset']['url'][1:]
|
sitemapXML = fromstring(sitemapContent)
|
||||||
for entry in jsonData:
|
for entry in sitemapXML:
|
||||||
#print(entry)
|
loc = entry.findtext('{*}loc')
|
||||||
loc = entry['loc']
|
|
||||||
if loc.startswith('https://ww9.bookys-ebooks.com/livres/'):
|
if loc.startswith('https://ww9.bookys-ebooks.com/livres/'):
|
||||||
#print(entry)
|
|
||||||
print(i, c)
|
print(i, c)
|
||||||
print(loc)
|
print(loc)
|
||||||
content = getContentFromURL(loc)
|
content = getContentFromURL(loc)
|
||||||
title = content.split('<title>')[1].split('\n')[0]
|
tree = html.fromstring(content)
|
||||||
|
title = tree.xpath('/html/body/div/div/div[2]/div[1]/div/div[2]/div[2]/h1')[0].text.strip()
|
||||||
print(title)
|
print(title)
|
||||||
print()
|
print()
|
||||||
c += 1
|
c += 1
|
||||||
|
33
scanlibs.py
33
scanlibs.py
@ -1,6 +1,6 @@
|
|||||||
import requests, re
|
import requests
|
||||||
from lxml import html
|
from lxml import html
|
||||||
from xml.etree.ElementTree import XML, fromstring
|
from xml.etree.ElementTree import fromstring
|
||||||
|
|
||||||
def getContentFromURL(url):
|
def getContentFromURL(url):
|
||||||
return requests.get(url).text
|
return requests.get(url).text
|
||||||
@ -8,38 +8,25 @@ def getContentFromURL(url):
|
|||||||
sitemapUrl = 'https://scanlibs.com/sitemap.xml'
|
sitemapUrl = 'https://scanlibs.com/sitemap.xml'
|
||||||
|
|
||||||
sitemapContent = getContentFromURL(sitemapUrl)
|
sitemapContent = getContentFromURL(sitemapUrl)
|
||||||
"""
|
|
||||||
sitemapXML = fromstring(sitemapContent)
|
sitemapXML = fromstring(sitemapContent)
|
||||||
|
|
||||||
for elem in sitemapXML.iter():
|
for entry in sitemapXML.findall('{*}url')[1:]:
|
||||||
for el in elem.iter():
|
loc = entry.findtext('{*}loc')
|
||||||
print(el.tag, ':', el.text)
|
|
||||||
#print('!', elem.text, '!')
|
|
||||||
|
|
||||||
import xmltodict, json
|
|
||||||
"""
|
|
||||||
|
|
||||||
jsonData = xmltodict.parse(sitemapContent)['urlset']['url'][1:]
|
|
||||||
|
|
||||||
for entry in jsonData:
|
|
||||||
loc = entry['loc']
|
|
||||||
print(loc)
|
print(loc)
|
||||||
print(entry['lastmod'])
|
print(entry.findtext('{*}lastmod'))
|
||||||
print(entry['image:image']['image:loc'])
|
print(entry.find('{*}image').findtext('{*}loc'))
|
||||||
while True:
|
while True:
|
||||||
content = getContentFromURL(loc)
|
content = getContentFromURL(loc)
|
||||||
tree = html.fromstring(content)
|
tree = html.fromstring(content)
|
||||||
title = content.split('<title>')[1].split('</title>')[0]
|
title = tree.xpath('/html/head/title')[0].text
|
||||||
if not title.startswith('503 Service Temporarily Unavailable'):
|
if not title.startswith('503 Service Temporarily Unavailable'):
|
||||||
break
|
break
|
||||||
else:
|
else:
|
||||||
print('MISS')
|
print('MISS')
|
||||||
print(title)
|
print(title)
|
||||||
print()
|
meta = tree.xpath('/html/body/div[1]/div/article/div/div/div/div/div[1]/p[1]')[0].text
|
||||||
continue
|
|
||||||
metaPattern = 'border-bottom:1px solid #f0f0f0">'
|
|
||||||
meta = content.split(metaPattern)[1].split('</p>')[0]
|
|
||||||
print(meta)
|
print(meta)
|
||||||
description = content.split('"></span>')[1].split('<a class="rewlink"')[0]
|
description = tree.xpath('/html/body/div[1]/div/article/div/div/div/div/div[1]/p[2]')[0].text
|
||||||
print(description)
|
print(description)
|
||||||
print()
|
print()
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user