2024-04-01 01:43:04 +02:00
|
|
|
#!/usr/bin/python3
|
|
|
|
|
2024-04-01 00:58:05 +02:00
|
|
|
import requests
|
|
|
|
from lxml import html
|
2024-04-01 01:09:42 +02:00
|
|
|
import string
|
2024-04-01 00:58:05 +02:00
|
|
|
|
2024-04-01 01:09:42 +02:00
|
|
|
# Letters appended one at a time to the current prefix to build each query.
charset = string.ascii_lowercase

# Linguee (French-English) search endpoint that the queries are sent to.
url = 'https://www.linguee.fr/francais-anglais/search'

# Query-string parameters sent with every request; the crawler adds the
# query itself under the 'qe' key before each call.
# NOTE(review): the meaning of 'ch' is not visible in this file - confirm.
params = {
    'ch': 0,
}

# Upper bound on the number of suggestion rows expected in one response.
MAXIMUM_SUGGESTIONS = 4

# Entries already reported, shared across the whole crawl to avoid duplicates.
entries = set()
|
|
|
|
|
|
|
|
def treatSuffixes(prefix, timeout=30):
    """Crawl the suggestions for every one-letter extension of `prefix`.

    For each letter of `charset`, the query `prefix + letter` is sent to
    `url`, the raw response body is saved to `requests/<query>.html`, and
    each suggestion row is inspected: an entry is printed and added to the
    global `entries` set when the row has a word type, its item is marked
    `lc="FR"` and the entry has not been seen before.

    The function calls itself on a query when the response contains exactly
    `MAXIMUM_SUGGESTIONS` rows and every row's entry contains the query.

    The `requests/` directory must already exist; it is not created here.

    Args:
        prefix: Query prefix to extend; `''` starts from single letters.
        timeout: Seconds to wait for each HTTP request before giving up.

    Raises:
        AssertionError: If a response holds more than `MAXIMUM_SUGGESTIONS`
            rows.
        requests.exceptions.Timeout: If a request exceeds `timeout`.
    """
    for char in charset:
        base = prefix + char
        print(base)
        params['qe'] = base
        # Without a timeout a single stalled connection blocks the crawl forever.
        text = requests.get(url, params=params, timeout=timeout).text

        # `base` ends up in a file path: keep it limited to harmless
        # characters (as `charset` letters are) if its construction ever
        # changes, so nothing can be written outside `requests/`.
        # The encoding is explicit so accented characters do not depend on
        # the platform's default encoding.
        with open(f'requests/{base}.html', 'w', encoding='utf-8') as requestFile:
            requestFile.write(text)

        tree = html.fromstring(text)
        rows = tree.xpath('//div[@class="main_row"]')
        rowsLen = len(rows)
        if rowsLen > MAXIMUM_SUGGESTIONS:
            # Raised explicitly instead of via `assert` so that the check is
            # not stripped when Python runs with -O.
            raise AssertionError(f'More than {MAXIMUM_SUGGESTIONS} rows!')

        interestingEntries = True
        for row in rows:
            item = row.xpath('div[@class="main_item"]')[0]
            entry = item.text_content()
            wordType = row.xpath('div[@class="main_wordtype"]')
            # `lc` is only read for rows that have a word type.
            if wordType and item.attrib['lc'] == 'FR' and entry not in entries:
                print(len(entries), entry, wordType[0].text_content())
                entries.add(entry)
            # NOTE(review): the nesting of this check was reconstructed; it is
            # applied to every row here - confirm against the intended logic.
            if base not in entry:
                interestingEntries = False

        # A full page whose entries all contain the query is explored deeper.
        if rowsLen == MAXIMUM_SUGGESTIONS and interestingEntries:
            treatSuffixes(base, timeout)
|
|
|
|
|
2024-04-01 01:43:04 +02:00
|
|
|
# Start the crawl from the empty prefix, so the first queries are single letters.
treatSuffixes('')
|