# Linguee_webscraper/main.py

#!/usr/bin/python3

import string
import time
from pathlib import Path

import requests
from lxml import html

# Characters appended one at a time to a prefix to build search queries.
charset = string.ascii_lowercase

# Search endpoint and the query parameters sent with every request.
url = 'https://www.linguee.fr/francais-anglais/search'
params = dict(ch=0)

# Largest number of suggestion rows a single response is expected to hold.
MAXIMUM_SUGGESTIONS = 4

# Distinct entries collected so far.
entries = set()

# Folder holding the raw responses; created at startup when it is missing.
REQUESTS_FOLDER_PATH = 'requests'
Path(REQUESTS_FOLDER_PATH).mkdir(exist_ok=True)

# Seconds to wait for the server before a request is considered failed.
_REQUEST_TIMEOUT_SECONDS = 30
# Pause between two attempts so a network outage does not cause a busy loop.
_RETRY_DELAY_SECONDS = 1


def _loadPage(base):
    """Return the HTML of the suggestion page for the query `base`.

    The page is read from its cache file in `REQUESTS_FOLDER_PATH` when that
    file can be read; otherwise it is downloaded from `url` (retrying until
    the request succeeds) and written to the cache file.
    """
    # NOTE: `base` ends up in a file name. If the way `base` is built ever
    # changes, make sure it cannot contain path separators, otherwise files
    # could be written outside of REQUESTS_FOLDER_PATH.
    baseFilePath = f'{REQUESTS_FOLDER_PATH}/{base}.html'
    try:
        with open(baseFilePath, encoding='utf-8') as requestFile:
            text = requestFile.read()
    except (OSError, UnicodeDecodeError):
        # Missing or unreadable cache file (including one written with a
        # different encoding): fall through and download the page again.
        pass
    else:
        print('From file')
        return text

    # The shared `params` dict is updated in place, as the query parameter
    # of the request.
    params['qe'] = base
    while True:
        try:
            text = requests.get(
                url, params=params, timeout=_REQUEST_TIMEOUT_SECONDS
            ).text
            break
        except (requests.exceptions.ConnectionError,
                requests.exceptions.Timeout):
            time.sleep(_RETRY_DELAY_SECONDS)
    # NOTE(review): the response is cached whatever its HTTP status; if the
    # server can answer with an error page (e.g. rate limiting), that page
    # would be cached too -- confirm whether a status check is wanted here.
    with open(baseFilePath, 'w', encoding='utf-8') as requestFile:
        requestFile.write(text)
    return text


def treatSuffixes(prefix):
    """Collect entries for every one-character extension of `prefix`.

    For each character of `charset`, the query `prefix + char` is looked up
    (from the cache folder or from `url`). Each result row that has a word
    type and whose item carries `lc == 'FR'` is added to the global
    `entries` set and printed the first time it is seen.

    The function recurses on a query only when its page holds exactly
    `MAXIMUM_SUGGESTIONS` rows and every row has a word type and contains
    the query text.

    Raises:
        AssertionError: if a page holds more than `MAXIMUM_SUGGESTIONS` rows.
    """
    for char in charset:
        base = prefix + char
        print(base)
        text = _loadPage(base)
        tree = html.fromstring(text)
        rows = tree.xpath('//div[@class="main_row"]')
        rowsLen = len(rows)
        # Explicit raise instead of `assert` so the check survives `python -O`.
        if rowsLen > MAXIMUM_SUGGESTIONS:
            raise AssertionError(f'More than {MAXIMUM_SUGGESTIONS} rows!')
        interestingEntries = True
        for row in rows:
            item = row.xpath('div[@class="main_item"]')[0]
            entry = item.text_content()
            wordType = row.xpath('div[@class="main_wordtype"]')
            if not wordType:
                # A row without a word type stops the exploration of `base`.
                interestingEntries = False
                continue
            if item.attrib['lc'] == 'FR' and entry not in entries:
                print(len(entries), entry, wordType[0].text_content())
                entries.add(entry)
            if base not in entry:
                interestingEntries = False
        # Recurse only when the page is full and every row matched `base`.
        if rowsLen == MAXIMUM_SUGGESTIONS and interestingEntries:
            treatSuffixes(base)

# Start the crawl from the empty prefix. The guard keeps a plain import of
# this module from launching the crawl and its network requests.
if __name__ == '__main__':
    treatSuffixes('')