#!/usr/bin/python3
"""Enumerate the entries suggested by the Linguee search endpoint.

Starting from the empty prefix, every lowercase ASCII letter is appended in
turn and the resulting string is sent as the ``qe`` query parameter to
``url``.  Each response is cached as ``requests/<query>.html`` so that an
interrupted run can resume without downloading the same page again.  Newly
seen entries are printed and collected in ``entries``.
"""

import string
import time
from pathlib import Path

import requests
from lxml import html

charset = string.ascii_lowercase

url = 'https://www.linguee.fr/francais-anglais/search'

# Query parameters shared by every request; the query string itself (`qe`)
# is added per request in `_fetchPage`.
params = {
    'ch': 0
}

# Largest number of `main_row` elements a response is expected to contain.
MAXIMUM_SUGGESTIONS = 4

# Upper bound, in seconds, on how long a single HTTP request may block.
REQUEST_TIMEOUT_SECONDS = 10

# Pause, in seconds, between two attempts after a connection failure.
RETRY_DELAY_SECONDS = 1

# Every entry printed so far, used to report each one only once.
entries = set()

REQUESTS_FOLDER_PATH = 'requests'

Path(REQUESTS_FOLDER_PATH).mkdir(exist_ok=True)


def _fetchPage(base):
    """Download the page for query `base`, retrying on connection failures.

    Connection errors and timeouts are retried indefinitely, with a short
    pause between attempts.

    Raises:
        requests.exceptions.HTTPError: if the server answers with an error
            status, so that an error page is never cached as a valid result.
    """
    # Build a fresh dict instead of mutating the shared module-level `params`.
    query = {**params, 'qe': base}
    while True:
        try:
            response = requests.get(url, params=query, timeout=REQUEST_TIMEOUT_SECONDS)
        except (requests.exceptions.ConnectionError, requests.exceptions.Timeout):
            # Avoid hammering the server in a tight loop while it is unreachable.
            time.sleep(RETRY_DELAY_SECONDS)
            continue
        response.raise_for_status()
        return response.text


def _loadPage(base):
    """Return the page for query `base`, from the on-disk cache when possible.

    On a cache miss (or an unreadable cache file) the page is downloaded and
    written to the cache before being returned.
    """
    # `base` is built only from `charset` characters, so it cannot escape the
    # cache folder.  Keep it that way if the way `base` is built ever changes.
    baseFilePath = Path(REQUESTS_FOLDER_PATH) / f'{base}.html'
    try:
        text = baseFilePath.read_text(encoding='utf-8')
    except (OSError, UnicodeDecodeError):
        text = _fetchPage(base)
        baseFilePath.write_text(text, encoding='utf-8')
    else:
        print('From file')
    return text


def treatSuffixes(prefix: str) -> None:
    """Explore every one-letter extension of `prefix`, recursing when useful.

    For each extension `base`, the suggestion rows of the corresponding page
    are inspected.  Rows that carry a word type, whose item has `lc == 'FR'`
    and whose text has not been seen yet are printed and added to `entries`.
    The search recurses into `base` only when the page is full
    (`MAXIMUM_SUGGESTIONS` rows) and every row both carries a word type and
    contains `base` in its text.

    Args:
        prefix: the query string explored so far ('' to start from scratch).

    Raises:
        AssertionError: if a page holds more than `MAXIMUM_SUGGESTIONS` rows.
        requests.exceptions.HTTPError: if the server answers with an error
            status.
    """
    for char in charset:
        base = prefix + char
        print(base)
        tree = html.fromstring(_loadPage(base))
        rows = tree.xpath('//div[@class="main_row"]')
        rowsLen = len(rows)
        # Raised explicitly (rather than with `assert`) so that the check is
        # not stripped when Python runs with `-O`.
        if rowsLen > MAXIMUM_SUGGESTIONS:
            raise AssertionError(f'More than {MAXIMUM_SUGGESTIONS} rows!')
        interestingEntries = True
        for row in rows:
            item = row.xpath('div[@class="main_item"]')[0]
            entry = item.text_content()
            wordType = row.xpath('div[@class="main_wordtype"]')
            if not wordType:
                # A row without a word type makes the page not worth refining.
                interestingEntries = False
                continue
            if item.attrib['lc'] == 'FR' and entry not in entries:
                print(len(entries), entry, wordType[0].text_content())
                entries.add(entry)
            if base not in entry:
                # The row does not contain the query itself.
                interestingEntries = False
        # Only a full page can hide further entries behind a longer prefix.
        if rowsLen == MAXIMUM_SUGGESTIONS and interestingEntries:
            treatSuffixes(base)


treatSuffixes('')