See [Benjamin-Loison/cpython/issues/43](https://github.com/Benjamin-Loison/cpython/issues/43).
		
			
				
	
	
		
97 lines · 3.5 KiB · Python · Executable File
		
	
	
	
	
#!/usr/bin/python3
"""Search channel archives for a message and report matches to a client file.

Command line:
    argv[1]  client identifier; output goes to `users/<clientId>.txt`.
    argv[2]  one space-separated string `<mode> <pathSearch> [message ...]`:
             - `<mode>` equal to `search-only-captions` restricts the search
               to `.vtt` entries;
             - `<pathSearch>` is a regular expression matched against
               `<archive>/<entry>`; when it is exactly 24 characters from
               `[a-zA-Z0-9-_]`, it is treated as a channel id and only
               `<pathSearch>.zip` is searched;
             - the remaining words, re-joined with spaces, are the message.
"""

import fcntl
import os
import re
import sys
import time
import zipfile
from io import StringIO

import webvtt

# Directory containing one `.zip` archive per channel.
path = '/mnt/HDD0/YouTube_captions_search_engine/channels/'

# Fail with a readable message instead of an `IndexError` traceback.
if len(sys.argv) < 3:
    sys.exit(f'Usage: {sys.argv[0]} <clientId> "<mode> <pathSearch> [message ...]"')

clientId = sys.argv[1]
pathSearchMessageParts = sys.argv[2].split(' ')
if len(pathSearchMessageParts) < 2:
    sys.exit('Expected "<mode> <pathSearch> [message ...]" as second argument.')
pathSearch = pathSearchMessageParts[1]
message = ' '.join(pathSearchMessageParts[2:])

# Both patterns come from the command line, so they may be malformed.
try:
    pathSearchRegex = re.compile(pathSearch)
    messageRegex = re.compile(message)
except re.error as e:
    sys.exit(f'Invalid regular expression: {e}')

# Truthy (a match object) when `pathSearch` looks like a channel id.
isPathSearchAChannelId = re.fullmatch(r'[a-zA-Z0-9-_]{24}', pathSearch)

searchOnlyCaptions = pathSearchMessageParts[0] == 'search-only-captions'

# NOTE(review): `clientId` is interpolated into a path unsanitized; if it can
# be influenced by an untrusted party, validate it (e.g. reject `/` and `..`).
clientFilePath = f'users/{clientId}.txt'


def write(s: str) -> None:
    """Append `s` as a new line of the client output file.

    The file at `clientFilePath` is opened in `r+` mode (it must already
    exist) and locked exclusively while it is read and appended to.

    Args:
        s: Text to append. It is preceded by a newline unless the file is
            currently empty.

    Raises:
        SystemExit: If locking, reading or writing fails; the underlying
            exception is passed to `sys.exit`.
    """
    with open(clientFilePath, 'r+') as f:
        try:
            fcntl.flock(f, fcntl.LOCK_EX)
            # If the output file is empty, then it means that `websocket.php` read it.
            # Anyway we don't wait for it and we append what we want to output.
            existing = f.read()
            # Reading moved the in-file cursor to the end, so writes below append.
            if existing != '':
                f.write('\n')
            f.write(s)
            f.flush()
            fcntl.flock(f, fcntl.LOCK_UN)
        except Exception as e:
            # No explicit unlock here: leaving the `with` block closes the
            # file, which flushes pending data and then drops the lock.
            sys.exit(e)


def cleanCaption(caption: str) -> str:
    """Return `caption` with every newline replaced by a single space."""
    return caption.replace('\n', ' ')


# As `zipgrep` doesn't support arguments to stop on first match for each file, we proceed manually to keep a good theoretical complexity.
if isPathSearchAChannelId:
    file = pathSearch + '.zip'
    if os.path.isfile(path + file):
        files = [file]
    else:
        # Unknown channel: nothing to search. `files` must still be defined,
        # otherwise the loop below raises a `NameError`.
        files = []
        write('progress:0 / 0')
else:
    files = [file for file in os.listdir(path) if file.endswith('.zip')]

for fileIndex, file in enumerate(files):
    write(f'progress:{fileIndex} / {len(files)}')
    # The context manager closes the archive once it has been searched.
    with zipfile.ZipFile(path + file) as archive:
        for fileInZip in archive.namelist():
            endsWithVtt = fileInZip.endswith('.vtt')
            if searchOnlyCaptions and not endsWithVtt:
                continue
            toWrite = f'{file}/{fileInZip}'
            if not pathSearchRegex.search(toWrite):
                continue
            with archive.open(fileInZip) as f:
                if endsWithVtt:
                    content = f.read().decode('utf-8')
                    # Parse the captions once and reuse them for every match.
                    captions = list(webvtt.read_buffer(StringIO(content)))
                    texts = [cleanCaption(caption.text) for caption in captions]
                    wholeCaption = ' '.join(texts)
                    messagePositions = [m.start() for m in messageRegex.finditer(wholeCaption)]
                    if messagePositions != []:
                        timestamps = []
                        for messagePosition in messagePositions:
                            # Walk the captions, consuming each text plus the
                            # joining space, until the offset falls in one.
                            for caption, text in zip(captions, texts):
                                if messagePosition <= len(text):
                                    timestamps.append(str(int(caption.start_in_seconds)))
                                    break
                                messagePosition -= len(text) + 1
                        write(f'{toWrite}|{"|".join(timestamps)}')
                else:
                    # Entries are read as bytes; decode each line so that the
                    # message is compared with the text itself rather than
                    # with the `b'...'` representation of the bytes.
                    # Iterating lazily stops reading at the first match.
                    for line in f:
                        if message in line.decode('utf-8', errors='replace'):
                            write(toWrite)
                            break
    write(f'progress:{fileIndex + 1} / {len(files)}')

# Wait until the client output file has been emptied (see the comment in
# `write` about `websocket.php`), then delete it.
with open(clientFilePath) as f:
    while True:
        try:
            fcntl.flock(f, fcntl.LOCK_EX)
            # Rewind before each check: after the first `read()` the cursor
            # sits at the end of the file, so without this every later
            # `read()` returns '' even when the file still has content.
            f.seek(0)
            if f.read() == '':
                os.remove(clientFilePath)
                # The lock is dropped when the `with` block closes the file.
                break
            else:
                fcntl.flock(f, fcntl.LOCK_UN)
                time.sleep(1)
        except Exception as e:
            sys.exit(e)