Linguee_webscraper/main.py
2024-04-03 19:52:08 +02:00

63 lines
2.0 KiB
Python
Executable File

#!/usr/bin/python3
import string
import time
from pathlib import Path

import requests
from lxml import html
# Letters appended one at a time to build the search prefixes.
charset = string.ascii_lowercase

# Endpoint queried for every prefix, and the query-string parameters sent with
# each request (the 'qe' key is filled in per query by treatSuffixes).
url = 'https://www.linguee.fr/francais-anglais/search'
params = {'ch': 0}

# Largest number of suggestion rows a single response is expected to contain.
MAXIMUM_SUGGESTIONS = 4

# Entries already reported, so each one is printed only once.
entries = set()

# Folder holding one '<query>.html' file per fetched page; created on start-up
# if it does not exist yet.
REQUESTS_FOLDER_PATH = 'requests'
Path(REQUESTS_FOLDER_PATH).mkdir(exist_ok=True)
def _downloadPage(base, timeout=30, retryDelay=1):
    """Download the suggestion page for the query `base`.

    Connection errors and timeouts are retried indefinitely, waiting
    `retryDelay` seconds between attempts so that a network outage does not
    become a busy loop.

    Args:
        base: query string sent as the 'qe' parameter.
        timeout: seconds passed to `requests.get` as its timeout.
        retryDelay: seconds to sleep before each retry.

    Returns:
        The response body as text.
    """
    # `params` is the shared module-level dict; only 'qe' changes per query.
    params['qe'] = base
    while True:
        try:
            return requests.get(url, params=params, timeout=timeout).text
        except (requests.exceptions.ConnectionError, requests.exceptions.Timeout):
            time.sleep(retryDelay)


def _loadPage(base):
    """Return the HTML for the query `base`, using the on-disk cache if possible.

    A missing or unreadable cache file is treated as a cache miss: the page is
    downloaded and then stored for later runs.
    """
    # If the way `base` is built ever changes, make sure it cannot produce a
    # path outside REQUESTS_FOLDER_PATH (unwanted file writes).
    baseFilePath = f'{REQUESTS_FOLDER_PATH}/{base}.html'
    try:
        # Explicit encoding: the pages contain accented characters and the
        # platform default encoding is not always able to represent them.
        with open(baseFilePath, encoding='utf-8') as requestFile:
            text = requestFile.read()
    except (OSError, UnicodeDecodeError):
        text = _downloadPage(base)
        with open(baseFilePath, 'w', encoding='utf-8') as requestFile:
            requestFile.write(text)
    else:
        print('From file')
    return text


def treatSuffixes(prefix):
    """Explore every query made of `prefix` followed by one letter of `charset`.

    For each query the suggestion page is loaded (from the cache folder or the
    network) and its `main_row` elements are inspected. Rows that have a word
    type and whose item carries lc == 'FR' are printed and added to the
    module-level `entries` set the first time they are seen. When a page is
    full (exactly MAXIMUM_SUGGESTIONS rows) and every row both has a word type
    and contains the query, the function recurses with the query as the new
    prefix.

    Args:
        prefix: string to which each letter of `charset` is appended.

    Raises:
        AssertionError: if a page contains more than MAXIMUM_SUGGESTIONS rows.
    """
    for char in charset:
        base = prefix + char
        print(base)
        tree = html.fromstring(_loadPage(base))
        rows = tree.xpath('//div[@class="main_row"]')
        rowsLen = len(rows)
        assert rowsLen <= MAXIMUM_SUGGESTIONS, f'More than {MAXIMUM_SUGGESTIONS} rows!'
        interestingEntries = True
        for row in rows:
            item = row.xpath('div[@class="main_item"]')[0]
            entry = item.text_content()
            wordType = row.xpath('div[@class="main_wordtype"]')
            if not wordType:
                # A row without a word type makes the page not worth refining.
                interestingEntries = False
                continue
            if item.attrib['lc'] == 'FR' and entry not in entries:
                print(len(entries), entry, wordType[0].text_content())
                entries.add(entry)
            # NOTE(review): this check is applied to every row that has a word
            # type, not only to newly added ones - confirm against the
            # original indentation.
            if base not in entry:
                interestingEntries = False
        # Only a full page can hide further suggestions behind longer prefixes.
        if rowsLen == MAXIMUM_SUGGESTIONS and interestingEntries:
            treatSuffixes(base)
if __name__ == '__main__':
    # Start the crawl from the empty prefix, i.e. with the one-letter queries.
    # Guarded so that importing this module does not launch the crawl.
    treatSuffixes('')