#31: Add zip files search
This commit is contained in:
		| @@ -14,9 +14,50 @@ See <?php echoUrl('https://gitea.lemnoslife.com/Benjamin_Loison/YouTube_captions | |||||||
|     <input type="submit" value="Search"> |     <input type="submit" value="Search"> | ||||||
| </form> | </form> | ||||||
|  |  | ||||||
|  | <ul id="channels"> | ||||||
|  | </ul> | ||||||
|  |  | ||||||
| <script> | <script> | ||||||
|     var firstRun = true; |     var firstRun = true; | ||||||
|     var conn; |     var conn; | ||||||
|  |     // Could parse the DOM instead of using the following variable. | ||||||
|  |     var channels = []; | ||||||
|  |  | ||||||
|  |     function createA(text, href) { | ||||||
|  |         var a = document.createElement('a'); | ||||||
|  |         var text = document.createTextNode(text); | ||||||
|  |         a.appendChild(text); | ||||||
|  |         a.href = href; | ||||||
|  |         return a; | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     function treatLine(line) { | ||||||
|  |         console.log(line); | ||||||
|  |         if (line.startsWith('alert:')) { | ||||||
|  |             alert(line.replace('alert:', '')); | ||||||
|  |         } else { | ||||||
|  |             var channelsDom = document.getElementById('channels'); | ||||||
|  |             const channelFileParts = line.split('/'); | ||||||
|  |             const channel = channelFileParts[0]; | ||||||
|  |             const channelFile = channelFileParts[1]; | ||||||
|  |             const channelHref = `channels/${channel}`; | ||||||
|  |             if (!channels.includes(channel)) { | ||||||
|  |                 channels.push(channel); | ||||||
|  |                 channelDom = document.createElement('li'); | ||||||
|  |                 var a = createA(channel, channelHref); | ||||||
|  |                 channelDom.appendChild(a); | ||||||
|  |                 var channelFilesDom = document.createElement('ul'); | ||||||
|  |                 channelDom.appendChild(channelFilesDom); | ||||||
|  |                 channelsDom.appendChild(channelDom); | ||||||
|  |             } | ||||||
|  |             var channelDom = channelsDom.lastChild; | ||||||
|  |             var channelFilesDom = channelDom.lastChild; | ||||||
|  |             var channelFileDom = document.createElement('li'); | ||||||
|  |             var a = createA(channelFile, `${channelHref}/${channelFile}`); | ||||||
|  |             channelFileDom.appendChild(a); | ||||||
|  |             channelFilesDom.appendChild(channelFileDom); | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  |  | ||||||
|     function search(event) { |     function search(event) { | ||||||
|         // We don't want to refresh the webpage which is the default behavior. |         // We don't want to refresh the webpage which is the default behavior. | ||||||
| @@ -25,11 +66,15 @@ See <?php echoUrl('https://gitea.lemnoslife.com/Benjamin_Loison/YouTube_captions | |||||||
|         if (firstRun) { |         if (firstRun) { | ||||||
|             firstRun = false; |             firstRun = false; | ||||||
|             conn = new WebSocket('wss://crawler.yt.lemnoslife.com/websocket'); |             conn = new WebSocket('wss://crawler.yt.lemnoslife.com/websocket'); | ||||||
|             conn.onmessage = function(e) { console.log(e.data); }; |             conn.onmessage = function(e) { | ||||||
|  |                 e.data.split('\n').forEach(treatLine); | ||||||
|  |             }; | ||||||
|             // We can't directly proceed with `conn.send`, as the connection may not be already established. |             // We can't directly proceed with `conn.send`, as the connection may not be already established. | ||||||
|             conn.onopen = function(e) { conn.send(query); }; |             conn.onopen = function(e) { conn.send(query); }; | ||||||
|         } else { |         } else { | ||||||
|             // We assume at this point that the connection is established. |             // We assume at this point that the connection is established. | ||||||
|  |             channels = []; | ||||||
|  |             document.getElementById('channels').innerHTML = ''; | ||||||
|             conn.send(query); |             conn.send(query); | ||||||
|         } |         } | ||||||
|     } |     } | ||||||
|   | |||||||
| @@ -1,6 +1,8 @@ | |||||||
| #!/usr/bin/python3 | #!/usr/bin/python3 | ||||||
|  |  | ||||||
| import sys, time, fcntl, os | import sys, time, fcntl, os, zipfile | ||||||
|  |  | ||||||
|  | path = '/mnt/HDD0/YouTube_captions_search_engine/channels/' | ||||||
|  |  | ||||||
| clientId = sys.argv[1] | clientId = sys.argv[1] | ||||||
| message = sys.argv[2] | message = sys.argv[2] | ||||||
| @@ -8,21 +10,35 @@ message = sys.argv[2] | |||||||
| clientFilePath = f'users/{clientId}.txt' | clientFilePath = f'users/{clientId}.txt' | ||||||
|  |  | ||||||
| def write(s): | def write(s): | ||||||
|     f = open(clientFilePath, 'w+') |     f = open(clientFilePath, 'r+') | ||||||
|     try: |     try: | ||||||
|         fcntl.flock(f, fcntl.LOCK_EX) |         fcntl.flock(f, fcntl.LOCK_EX) | ||||||
|         # If the output file is empty, then it means that `websocket.php` read it. Anyway, we don't wait for it and we append what we want to output. |         # If the output file is empty, then it means that `websocket.php` read it. Anyway, we don't wait for it and we append what we want to output. | ||||||
|         read = f.read() |         read = f.read() | ||||||
|         f.write(f"{read}\n{s}") |         # We are appending content, as the preceding `f.read()` moved the in-file cursor to the end. | ||||||
|  |         if read != '': | ||||||
|  |             f.write("\n") | ||||||
|  |         f.write(s) | ||||||
|  |         f.flush() | ||||||
|  |         fcntl.flock(f, fcntl.LOCK_UN) | ||||||
|  |         f.close() | ||||||
|     except Exception as e: |     except Exception as e: | ||||||
|         sys.exit(e) |         sys.exit(e) | ||||||
|  |  | ||||||
|  | # Unclear if `os.listdir` takes a lot of time, as it returns the complete list of entries and isn't a generator (unlike `os.scandir`). | ||||||
|  | # As `zipgrep` doesn't support arguments to stop on first match for each file, we proceed manually to keep a good theoretical complexity. | ||||||
|  | for file in os.listdir(path): | ||||||
|  |     if file.endswith('.zip'): | ||||||
|  |         zip = zipfile.ZipFile(path + file) | ||||||
|  |         for fileInZip in zip.namelist(): | ||||||
|  |             f = zip.open(fileInZip) | ||||||
|  |             for line in f.readlines(): | ||||||
|  |                 if message in str(line): | ||||||
|  |                     write(f'{file}/{fileInZip}') | ||||||
|  |                     break | ||||||
|             f.close() |             f.close() | ||||||
|  |  | ||||||
| for i in range(10): | f = open(clientFilePath) | ||||||
|     write(f'{i}: {message}') |  | ||||||
|     time.sleep(2) |  | ||||||
|  |  | ||||||
| f = open(clientFilePath, 'r') |  | ||||||
| while True: | while True: | ||||||
|     try: |     try: | ||||||
|         fcntl.flock(f, fcntl.LOCK_EX) |         fcntl.flock(f, fcntl.LOCK_EX) | ||||||
| @@ -30,6 +46,7 @@ while True: | |||||||
|             os.remove(clientFilePath) |             os.remove(clientFilePath) | ||||||
|             break |             break | ||||||
|         else: |         else: | ||||||
|  |             fcntl.flock(f, fcntl.LOCK_UN) | ||||||
|             time.sleep(1) |             time.sleep(1) | ||||||
|     except Exception as e: |     except Exception as e: | ||||||
|         sys.exit(e) |         sys.exit(e) | ||||||
|   | |||||||
| @@ -96,6 +96,7 @@ class MyProcess implements MessageComponentInterface | |||||||
|         if (preg_match("/^[a-zA-Z0-9-_ ]+$/", $msg) !== 1) { |         if (preg_match("/^[a-zA-Z0-9-_ ]+$/", $msg) !== 1) { | ||||||
|             return; |             return; | ||||||
|         } |         } | ||||||
|  |         $from->send('alert:Started searching...'); | ||||||
|         $client = $this->clients->offsetGet($from); |         $client = $this->clients->offsetGet($from); | ||||||
|         // If a previous request was received, we execute the new one with another client for simplicity; otherwise, with the current file deletion approach, we can't tell the worker `search.py` that we don't care about its execution anymore. |         // If a previous request was received, we execute the new one with another client for simplicity; otherwise, with the current file deletion approach, we can't tell the worker `search.py` that we don't care about its execution anymore. | ||||||
|         if ($client->pid !== null) { |         if ($client->pid !== null) { | ||||||
| @@ -137,6 +138,7 @@ class MyProcess implements MessageComponentInterface | |||||||
|             } else { |             } else { | ||||||
|                 // We don't need the periodic timer anymore, as the worker finished its work and acknowledged that `websocket.php` completely read its output. |                 // We don't need the periodic timer anymore, as the worker finished its work and acknowledged that `websocket.php` completely read its output. | ||||||
|                 $this->loop->cancelTimer($client->timer); |                 $this->loop->cancelTimer($client->timer); | ||||||
|  |                 $from->send('alert:Search finished!'); | ||||||
|             } |             } | ||||||
|         }); |         }); | ||||||
|     } |     } | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user