#31: Add zip files search

This commit is contained in:
Benjamin Loison 2023-02-07 20:15:36 +01:00
parent b45384bab7
commit e9b77369fb
3 changed files with 73 additions and 9 deletions

View File

@ -14,9 +14,50 @@ See <?php echoUrl('https://gitea.lemnoslife.com/Benjamin_Loison/YouTube_captions
<input type="submit" value="Search">
</form>
<ul id="channels">
</ul>
<script>
var firstRun = true;
var conn;
// Could parse DOM instead of using following variable.
var channels = [];
/**
 * Builds an anchor element that displays `text` and links to `href`.
 *
 * @param {string} text - Label placed inside the anchor as a text node.
 * @param {string} href - Value assigned to the anchor's `href`.
 * @returns {HTMLAnchorElement} The newly created, not yet attached, anchor.
 */
function createA(text, href) {
    const anchor = document.createElement('a');
    // A text node (rather than markup) keeps the label from being parsed as HTML.
    anchor.appendChild(document.createTextNode(text));
    anchor.href = href;
    return anchor;
}
/**
 * Handles one line of a message received from the search WebSocket.
 *
 * A line starting with `alert:` is shown to the user through `alert`. Any
 * other line is expected to look like `<channel>/<file>`: it is rendered as a
 * link to the file, nested under a link to its channel inside the `#channels`
 * list. Lines without a `/` (including empty lines produced by splitting a
 * message on newlines) are ignored.
 *
 * Relies on the global `channels` array, whose i-th entry corresponds to the
 * i-th `<li>` element of `#channels`.
 *
 * @param {string} line - A single line of a WebSocket message.
 */
function treatLine(line) {
    console.log(line);
    if (line.startsWith('alert:')) {
        alert(line.slice('alert:'.length));
        return;
    }

    // Only the first `/` separates the channel from the file, so that files
    // stored in sub-directories of the archive keep their full path.
    const separatorIndex = line.indexOf('/');
    if (separatorIndex === -1) {
        // Empty or malformed line: nothing meaningful to display.
        return;
    }
    const channel = line.slice(0, separatorIndex);
    const channelFile = line.slice(separatorIndex + 1);
    const channelHref = `channels/${channel}`;
    const channelsDom = document.getElementById('channels');

    let channelIndex = channels.indexOf(channel);
    if (channelIndex === -1) {
        // First result for this channel: create its entry with an empty files list.
        channelIndex = channels.push(channel) - 1;
        const channelDom = document.createElement('li');
        channelDom.appendChild(createA(channel, channelHref));
        channelDom.appendChild(document.createElement('ul'));
        channelsDom.appendChild(channelDom);
    }

    // Look the channel entry up by index instead of assuming it is the last
    // one, so results are attached to the right channel whatever their order.
    const channelFilesDom = channelsDom.children[channelIndex].lastChild;
    const channelFileDom = document.createElement('li');
    channelFileDom.appendChild(createA(channelFile, `${channelHref}/${channelFile}`));
    channelFilesDom.appendChild(channelFileDom);
}
function search(event) {
// We don't want to refresh the webpage which is the default behavior.
@ -25,11 +66,15 @@ See <?php echoUrl('https://gitea.lemnoslife.com/Benjamin_Loison/YouTube_captions
if (firstRun) {
firstRun = false;
conn = new WebSocket('wss://crawler.yt.lemnoslife.com/websocket');
conn.onmessage = function(e) { console.log(e.data); };
conn.onmessage = function(e) {
e.data.split('\n').forEach(treatLine);
};
// We can't directly proceed with `conn.send`, as the connection may not be already established.
conn.onopen = function(e) { conn.send(query); };
} else {
// We assume at this point that the connection is established.
channels = [];
document.getElementById('channels').innerHTML = '';
conn.send(query);
}
}

View File

@ -1,6 +1,8 @@
#!/usr/bin/python3
import sys, time, fcntl, os
import sys, time, fcntl, os, zipfile
path = '/mnt/HDD0/YouTube_captions_search_engine/channels/'
clientId = sys.argv[1]
message = sys.argv[2]
@ -8,21 +10,35 @@ message = sys.argv[2]
clientFilePath = f'users/{clientId}.txt'
def write(s):
f = open(clientFilePath, 'w+')
f = open(clientFilePath, 'r+')
try:
fcntl.flock(f, fcntl.LOCK_EX)
# If the output file is empty, then it means that `websocket.php` read it. Anyway, we don't wait for it and we append what we want to output.
read = f.read()
f.write(f"{read}\n{s}")
# We are appending content, as reading moved the in-file cursor to the end of the file.
if read != '':
f.write("\n")
f.write(s)
f.flush()
fcntl.flock(f, fcntl.LOCK_UN)
f.close()
except Exception as e:
sys.exit(e)
f.close()
for i in range(10):
write(f'{i}: {message}')
time.sleep(2)
# Unclear if `os.listdir` takes a lot of time, as it's a generator.
# As `zipgrep` doesn't support arguments to stop on first match for each file, we proceed manually to keep a good theoretical complexity.
for file in os.listdir(path):
if file.endswith('.zip'):
zip = zipfile.ZipFile(path + file)
for fileInZip in zip.namelist():
f = zip.open(fileInZip)
for line in f.readlines():
if message in str(line):
write(f'{file}/{fileInZip}')
break
f.close()
f = open(clientFilePath, 'r')
f = open(clientFilePath)
while True:
try:
fcntl.flock(f, fcntl.LOCK_EX)
@ -30,6 +46,7 @@ while True:
os.remove(clientFilePath)
break
else:
fcntl.flock(f, fcntl.LOCK_UN)
time.sleep(1)
except Exception as e:
sys.exit(e)

View File

@ -96,6 +96,7 @@ class MyProcess implements MessageComponentInterface
if (preg_match("/^[a-zA-Z0-9-_ ]+$/", $msg) !== 1) {
return;
}
$from->send('alert:Started searching...');
$client = $this->clients->offsetGet($from);
// If a previous request was received, we execute the new one with another client for simplicity; otherwise, with the current file deletion approach, we can't tell the worker `search.py` that we don't care about its execution anymore.
if ($client->pid !== null) {
@ -137,6 +138,7 @@ class MyProcess implements MessageComponentInterface
} else {
// We don't need the periodic timer anymore, as the worker finished its work and acknowledged that `websocket.php` completely read its output.
$this->loop->cancelTimer($client->timer);
$from->send('alert:Search finished!');
}
});
}