Better search algorithm and Unicode handling

- Use a line-based search (20% faster)
- Parse the file as bytes so match offsets are not skewed by multi-byte Unicode characters
This commit is contained in:
Ceda EI 2025-05-02 17:07:04 +05:30
parent 04abeec7b6
commit 4230489d4b

22
main.py
View File

@@ -92,20 +92,19 @@ def search_log(name: str, query: str, start: int = 0):
     if resp:
         return resp
+    query_b = query.encode()
     matches = []
-    buffer_size = max(len(query), 1024)
     break_outer = False
-    with open(path) as f:
+    with open(path, "rb") as f:
         f.seek(start)
-        head = start
-        tail = start + 2 * buffer_size
-        buffer = f.read(tail)
-        while True:
+        count = start
+        while line := f.readline():
             offset = 0
             while True:
-                index = buffer.find(query, offset)
+                index = line.find(query_b, offset)
                 if index != -1:
-                    matches.append(head + index)
+                    matches.append(count + index)
                     offset = index + 1
                     if len(matches) >= 100:
                         break_outer = True
@@ -114,11 +113,6 @@ def search_log(name: str, query: str, start: int = 0):
                 break
             if break_outer:
                 break
-            data = f.read(buffer_size)
-            if not data:
-                break
-            buffer = buffer[-len(query):] + data
-            head = tail - len(query)
-            tail += len(data)
+            count += len(line)
     return {"matches": matches}