More efficient flat file list format and various UX improvements (#3)

This is a major upgrade with assorted things included.

- Navigation flows improved, search appears in URL history, cleared when navigating to another folder
- More efficient file list format for faster loads
- Efficient updates, never re-send full root another time (except at connection)
- Large number of watching and filelist updates (inotify issues remain)
- File size coloring
- Fixed ZIP generation random glitches (thread race condition)
- Code refactoring, cleanup, typing fixes
- More tests

Reviewed-on: #3
This commit is contained in:
Leo Vasanko
2023-11-12 23:20:40 +00:00
parent fb03fa5430
commit 9854dd01cc
101 changed files with 728 additions and 527 deletions

View File

@@ -104,11 +104,11 @@ async def watch(req, ws):
)
uuid = token_bytes(16)
try:
with watching.tree_lock:
with watching.state.lock:
q = watching.pubsub[uuid] = asyncio.Queue()
# Init with disk usage and full tree
await ws.send(watching.format_du())
await ws.send(watching.format_tree())
await ws.send(watching.format_space(watching.state.space))
await ws.send(watching.format_root(watching.state.root))
# Send updates
while True:
await ws.send(await q.get())

View File

@@ -1,10 +1,8 @@
import asyncio
import datetime
import mimetypes
from collections import deque
from concurrent.futures import ThreadPoolExecutor
from importlib.resources import files
from pathlib import Path
from pathlib import Path, PurePath, PurePosixPath
from stat import S_IFDIR, S_IFREG
from urllib.parse import unquote
from wsgiref.handlers import format_date_time
@@ -12,15 +10,13 @@ from wsgiref.handlers import format_date_time
import brotli
import sanic.helpers
from blake3 import blake3
from natsort import natsorted, ns
from sanic import Blueprint, Sanic, empty, raw
from sanic.exceptions import Forbidden, NotFound
from sanic.exceptions import Forbidden, NotFound, ServerError
from sanic.log import logging
from stream_zip import ZIP_AUTO, stream_zip
from cista import auth, config, session, watching
from cista.api import bp
from cista.protocol import DirEntry
from cista.util.apphelpers import handle_sanic_exception
# Workaround until Sanic PR #2824 is merged
@@ -36,7 +32,9 @@ app.exception(Exception)(handle_sanic_exception)
async def main_start(app, loop):
config.load_config()
await watching.start(app, loop)
app.ctx.threadexec = ThreadPoolExecutor(max_workers=8)
app.ctx.threadexec = ThreadPoolExecutor(
max_workers=8, thread_name_prefix="cista-ioworker"
)
@app.after_server_stop
@@ -49,8 +47,8 @@ async def main_stop(app, loop):
async def use_session(req):
req.ctx.session = session.get(req)
try:
req.ctx.username = req.ctx.session["username"]
req.ctx.user = config.config.users[req.ctx.session["username"]] # type: ignore
req.ctx.username = req.ctx.session["username"] # type: ignore
req.ctx.user = config.config.users[req.ctx.username]
except (AttributeError, KeyError, TypeError):
req.ctx.username = None
req.ctx.user = None
@@ -81,22 +79,16 @@ def http_fileserver(app, _):
www = {}
@app.before_server_start
async def load_wwwroot(*_ignored):
global www
www = await asyncio.get_event_loop().run_in_executor(None, _load_wwwroot, www)
def _load_wwwroot(www):
wwwnew = {}
base = files("cista") / "wwwroot"
paths = ["."]
base = Path(__file__).with_name("wwwroot")
paths = [PurePath()]
while paths:
path = paths.pop(0)
current = base / path
for p in current.iterdir():
if p.is_dir():
paths.append(current / p.parts[-1])
paths.append(p.relative_to(base))
continue
name = p.relative_to(base).as_posix()
mime = mimetypes.guess_type(name)[0] or "application/octet-stream"
@@ -127,15 +119,35 @@ def _load_wwwroot(www):
if len(br) >= len(data):
br = False
wwwnew[name] = data, br, headers
if not wwwnew:
raise ServerError(
"Web frontend missing. Did you forget npm run build?",
extra={"wwwroot": str(base)},
quiet=True,
)
return wwwnew
@app.add_task
@app.before_server_start
async def start(app):
await load_wwwroot(app)
if app.debug:
app.add_task(refresh_wwwroot())
async def load_wwwroot(app):
global www
www = await asyncio.get_event_loop().run_in_executor(
app.ctx.threadexec, _load_wwwroot, www
)
async def refresh_wwwroot():
while True:
await asyncio.sleep(0.5)
try:
wwwold = www
await load_wwwroot()
await load_wwwroot(app)
changes = ""
for name in sorted(www):
attr = www[name]
@@ -151,7 +163,6 @@ async def refresh_wwwroot():
print("Error loading wwwroot", e)
if not app.debug:
return
await asyncio.sleep(0.5)
@app.route("/<path:path>", methods=["GET", "HEAD"])
@@ -166,66 +177,70 @@ async def wwwroot(req, path=""):
return empty(304, headers=headers)
# Brotli compressed?
if br and "br" in req.headers.accept_encoding.split(", "):
headers = {
**headers,
"content-encoding": "br",
}
headers = {**headers, "content-encoding": "br"}
data = br
return raw(data, headers=headers)
def get_files(wanted: set) -> list[tuple[PurePosixPath, Path]]:
loc = PurePosixPath()
idx = 0
ret = []
level: int | None = None
parent: PurePosixPath | None = None
with watching.state.lock:
root = watching.state.root
while idx < len(root):
f = root[idx]
loc = PurePosixPath(*loc.parts[: f.level - 1]) / f.name
if parent is not None and f.level <= level:
level = parent = None
if f.key in wanted:
level, parent = f.level, loc.parent
if parent is not None:
wanted.discard(f.key)
ret.append((loc.relative_to(parent), watching.rootpath / loc))
idx += 1
return ret
@app.get("/zip/<keys>/<zipfile:ext=zip>")
async def zip_download(req, keys, zipfile, ext):
"""Download a zip archive of the given keys"""
wanted = set(keys.split("+"))
with watching.tree_lock:
q = deque([([], None, watching.tree[""].dir)])
files = []
while q:
locpar, relpar, d = q.pop()
for name, attr in d.items():
loc = [*locpar, name]
rel = None
if relpar or attr.key in wanted:
rel = [*relpar, name] if relpar else [name]
wanted.discard(attr.key)
isdir = isinstance(attr, DirEntry)
if isdir:
q.append((loc, rel, attr.dir))
if rel:
files.append(
("/".join(rel), Path(watching.rootpath.joinpath(*loc)))
)
files = get_files(wanted)
if not files:
raise NotFound(
"No files found",
context={"keys": keys, "zipfile": zipfile, "wanted": wanted},
context={"keys": keys, "zipfile": f"{zipfile}.{ext}", "wanted": wanted},
)
if wanted:
raise NotFound("Files not found", context={"missing": wanted})
files = natsorted(files, key=lambda f: f[0], alg=ns.IGNORECASE)
def local_files(files):
for rel, p in files:
s = p.stat()
size = s.st_size
modified = datetime.datetime.fromtimestamp(s.st_mtime, datetime.UTC)
name = rel.as_posix()
if p.is_dir():
yield rel, modified, S_IFDIR | 0o755, ZIP_AUTO(size), b""
yield f"{name}/", modified, S_IFDIR | 0o755, ZIP_AUTO(size), iter(b"")
else:
yield rel, modified, S_IFREG | 0o644, ZIP_AUTO(size), contents(p)
yield name, modified, S_IFREG | 0o644, ZIP_AUTO(size), contents(p, size)
def contents(name):
def contents(name, size):
with name.open("rb") as f:
while chunk := f.read(65536):
while size > 0 and (chunk := f.read(min(size, 1 << 20))):
size -= len(chunk)
yield chunk
assert size == 0
def worker():
try:
for chunk in stream_zip(local_files(files)):
asyncio.run_coroutine_threadsafe(queue.put(chunk), loop)
asyncio.run_coroutine_threadsafe(queue.put(chunk), loop).result()
except Exception:
logging.exception("Error streaming ZIP")
raise
@@ -238,7 +253,10 @@ async def zip_download(req, keys, zipfile, ext):
thread = loop.run_in_executor(app.ctx.threadexec, worker)
# Stream the response
res = await req.respond(content_type="application/zip")
res = await req.respond(
content_type="application/zip",
headers={"cache-control": "no-store"},
)
while chunk := await queue.get():
await res.send(chunk)

View File

@@ -68,10 +68,10 @@ def verify(request, *, privileged=False):
if request.ctx.user:
if request.ctx.user.privileged:
return
raise Forbidden("Access Forbidden: Only for privileged users")
raise Forbidden("Access Forbidden: Only for privileged users", quiet=True)
elif config.config.public or request.ctx.user:
return
raise Unauthorized("Login required", "cookie")
raise Unauthorized("Login required", "cookie", quiet=True)
bp = Blueprint("auth")

View File

@@ -112,47 +112,43 @@ class ErrorMsg(msgspec.Struct):
## Directory listings
class FileEntry(msgspec.Struct):
key: str
size: int
mtime: int
class DirEntry(msgspec.Struct):
key: str
size: int
mtime: int
dir: DirList
def __getitem__(self, name):
return self.dir[name]
def __setitem__(self, name, value):
self.dir[name] = value
def __contains__(self, name):
return name in self.dir
def __delitem__(self, name):
del self.dir[name]
@property
def props(self):
return {k: v for k, v in self.__struct_fields__ if k != "dir"}
DirList = dict[str, FileEntry | DirEntry]
class UpdateEntry(msgspec.Struct, omit_defaults=True):
"""Updates the named entry in the tree. Fields that are set replace old values. A list of entries recurses directories."""
class FileEntry(msgspec.Struct, array_like=True):
level: int
name: str
key: str
deleted: bool = False
size: int | None = None
mtime: int | None = None
dir: DirList | None = None
mtime: int
size: int
isfile: int
def __repr__(self):
return self.key or "FileEntry()"
class Update(msgspec.Struct, array_like=True):
...
class UpdKeep(Update, tag="k"):
count: int
class UpdDel(Update, tag="d"):
count: int
class UpdIns(Update, tag="i"):
items: list[FileEntry]
class UpdateMessage(msgspec.Struct):
update: list[UpdKeep | UpdDel | UpdIns]
class Space(msgspec.Struct):
disk: int
free: int
usage: int
storage: int
def make_dir_data(root):

View File

@@ -1,20 +1,137 @@
import asyncio
import shutil
import stat
import sys
import threading
import time
from os import stat_result
from pathlib import Path, PurePosixPath
import msgspec
from natsort import humansorted, natsort_keygen, ns
from sanic.log import logging
from cista import config
from cista.fileio import fuid
from cista.protocol import DirEntry, FileEntry, UpdateEntry
from cista.protocol import FileEntry, Space, UpdDel, UpdIns, UpdKeep
pubsub = {}
tree = {"": None}
tree_lock = threading.Lock()
sortkey = natsort_keygen(alg=ns.LOCALE)
class State:
def __init__(self):
self.lock = threading.RLock()
self._space = Space(0, 0, 0, 0)
self._listing: list[FileEntry] = []
@property
def space(self):
with self.lock:
return self._space
@space.setter
def space(self, space):
with self.lock:
self._space = space
@property
def root(self) -> list[FileEntry]:
with self.lock:
return self._listing[:]
@root.setter
def root(self, listing: list[FileEntry]):
with self.lock:
self._listing = listing
def _slice(self, idx: PurePosixPath | tuple[PurePosixPath, int]):
relpath, relfile = idx if isinstance(idx, tuple) else (idx, 0)
begin, end = 0, len(self._listing)
level = 0
isfile = 0
# Special case for root
if not relpath.parts:
return slice(begin, end)
begin += 1
for part in relpath.parts:
level += 1
found = False
while begin < end:
entry = self._listing[begin]
if entry.level < level:
break
if entry.level == level:
if entry.name == part:
found = True
if level == len(relpath.parts):
isfile = relfile
else:
begin += 1
break
cmp = entry.isfile - isfile or sortkey(entry.name) > sortkey(part)
if cmp > 0:
break
begin += 1
if not found:
return slice(begin, begin)
# Found the starting point, now find the end of the slice
for end in range(begin + 1, len(self._listing) + 1):
if end == len(self._listing) or self._listing[end].level <= level:
break
return slice(begin, end)
def __getitem__(self, index: PurePosixPath | tuple[PurePosixPath, int]):
with self.lock:
return self._listing[self._slice(index)]
def __setitem__(
self, index: tuple[PurePosixPath, int], value: list[FileEntry]
) -> None:
rel, isfile = index
with self.lock:
if rel.parts:
parent = self._slice(rel.parent)
if parent.start == parent.stop:
raise ValueError(
f"Parent folder {rel.as_posix()} is missing for {rel.name}"
)
self._listing[self._slice(index)] = value
def __delitem__(self, relpath: PurePosixPath):
with self.lock:
del self._listing[self._slice(relpath)]
def _index(self, rel: PurePosixPath):
idx = 0
ret = []
def _dir(self, idx: int):
level = self._listing[idx].level + 1
end = len(self._listing)
idx += 1
ret = []
while idx < end and (r := self._listing[idx]).level >= level:
if r.level == level:
ret.append(idx)
return ret, idx
def update(self, rel: PurePosixPath, value: FileEntry):
begin = 0
parents = []
while self._listing[begin].level < len(rel.parts):
parents.append(begin)
state = State()
rootpath: Path = None # type: ignore
quit = False
modified_flags = (
@@ -26,23 +143,22 @@ modified_flags = (
"IN_MOVED_FROM",
"IN_MOVED_TO",
)
disk_usage = None
def watcher_thread(loop):
global disk_usage, rootpath
global rootpath
import inotify.adapters
while True:
rootpath = config.config.path
i = inotify.adapters.InotifyTree(rootpath.as_posix())
old = format_tree() if tree[""] else None
with tree_lock:
# Initialize the tree from filesystem
tree[""] = walk(rootpath)
msg = format_tree()
if msg != old:
asyncio.run_coroutine_threadsafe(broadcast(msg), loop)
# Initialize the tree from filesystem
new = walk()
with state.lock:
old = state.root
if old != new:
state.root = new
broadcast(format_update(old, new), loop)
# The watching is not entirely reliable, so do a full refresh every minute
refreshdl = time.monotonic() + 60.0
@@ -52,9 +168,10 @@ def watcher_thread(loop):
return
# Disk usage update
du = shutil.disk_usage(rootpath)
if du != disk_usage:
disk_usage = du
asyncio.run_coroutine_threadsafe(broadcast(format_du()), loop)
space = Space(*du, storage=state.root[0].size)
if space != state.space:
state.space = space
broadcast(format_space(space), loop)
break
# Do a full refresh?
if time.monotonic() > refreshdl:
@@ -75,144 +192,141 @@ def watcher_thread(loop):
def watcher_thread_poll(loop):
global disk_usage, rootpath
global rootpath
while not quit:
rootpath = config.config.path
old = format_tree() if tree[""] else None
with tree_lock:
# Initialize the tree from filesystem
tree[""] = walk(rootpath)
msg = format_tree()
if msg != old:
asyncio.run_coroutine_threadsafe(broadcast(msg), loop)
new = walk()
with state.lock:
old = state.root
if old != new:
state.root = new
broadcast(format_update(old, new), loop)
# Disk usage update
du = shutil.disk_usage(rootpath)
if du != disk_usage:
disk_usage = du
asyncio.run_coroutine_threadsafe(broadcast(format_du()), loop)
space = Space(*du, storage=state.root[0].size)
if space != state.space:
state.space = space
broadcast(format_space(space), loop)
time.sleep(1.0)
time.sleep(2.0)
def format_du():
return msgspec.json.encode(
{
"space": {
"disk": disk_usage.total,
"used": disk_usage.used,
"free": disk_usage.free,
"storage": tree[""].size,
},
},
).decode()
def format_tree():
root = tree[""]
return msgspec.json.encode({"root": root}).decode()
def walk(path: Path) -> DirEntry | FileEntry | None:
def walk(rel=PurePosixPath()) -> list[FileEntry]: # noqa: B008
path = rootpath / rel
try:
s = path.stat()
key = fuid(s)
assert key, repr(key)
mtime = int(s.st_mtime)
if path.is_file():
return FileEntry(key, s.st_size, mtime)
st = path.stat()
except OSError:
return []
return _walk(rel, int(not stat.S_ISDIR(st.st_mode)), st)
tree = {
p.name: v
for p in path.iterdir()
if not p.name.startswith(".")
if (v := walk(p)) is not None
}
if tree:
size = sum(v.size for v in tree.values())
mtime = max(mtime, *(v.mtime for v in tree.values()))
else:
size = 0
return DirEntry(key, size, mtime, tree)
def _walk(rel: PurePosixPath, isfile: int, st: stat_result) -> list[FileEntry]:
entry = FileEntry(
level=len(rel.parts),
name=rel.name,
key=fuid(st),
mtime=int(st.st_mtime),
size=st.st_size if isfile else 0,
isfile=isfile,
)
if isfile:
return [entry]
ret = [entry]
path = rootpath / rel
try:
li = []
for f in path.iterdir():
if f.name.startswith("."):
continue # No dotfiles
s = f.stat()
li.append((int(not stat.S_ISDIR(s.st_mode)), f.name, s))
for [isfile, name, s] in humansorted(li):
subtree = _walk(rel / name, isfile, s)
child = subtree[0]
entry.mtime = max(entry.mtime, child.mtime)
entry.size += child.size
ret.extend(subtree)
except FileNotFoundError:
return None
pass # Things may be rapidly in motion
except OSError as e:
print("OS error walking path", path, e)
return None
return ret
def update(relpath: PurePosixPath, loop):
"""Called by inotify updates, check the filesystem and broadcast any changes."""
if rootpath is None or relpath is None:
print("ERROR", rootpath, relpath)
new = walk(rootpath / relpath)
with tree_lock:
update = update_internal(relpath, new)
if not update:
return # No changes
msg = msgspec.json.encode({"update": update}).decode()
asyncio.run_coroutine_threadsafe(broadcast(msg), loop)
new = walk(relpath)
with state.lock:
old = state[relpath]
if old == new:
return
old = state.root
if new:
state[relpath, new[0].isfile] = new
else:
del state[relpath]
broadcast(format_update(old, state.root), loop)
def update_internal(
relpath: PurePosixPath,
new: DirEntry | FileEntry | None,
) -> list[UpdateEntry]:
path = "", *relpath.parts
old = tree
elems = []
for name in path:
if name not in old:
# File or folder created
old = None
elems.append((name, None))
if len(elems) < len(path):
# We got a notify for an item whose parent is not in tree
print("Tree out of sync DEBUG", relpath)
print(elems)
print("Current tree:")
print(tree[""])
print("Walking all:")
print(walk(rootpath))
raise ValueError("Tree out of sync")
break
old = old[name]
elems.append((name, old))
if old == new:
return []
mt = new.mtime if new else 0
szdiff = (new.size if new else 0) - (old.size if old else 0)
# Update parents
def format_update(old, new):
# Make keep/del/insert diff until one of the lists ends
oidx, nidx = 0, 0
update = []
for name, entry in elems[:-1]:
u = UpdateEntry(name, entry.key)
if szdiff:
entry.size += szdiff
u.size = entry.size
if mt > entry.mtime:
u.mtime = entry.mtime = mt
update.append(u)
# The last element is the one that changed
name, entry = elems[-1]
parent = elems[-2][1] if len(elems) > 1 else tree
u = UpdateEntry(name, new.key if new else entry.key)
if new:
parent[name] = new
if u.size != new.size:
u.size = new.size
if u.mtime != new.mtime:
u.mtime = new.mtime
if isinstance(new, DirEntry) and u.dir != new.dir:
u.dir = new.dir
else:
del parent[name]
u.deleted = True
update.append(u)
return update
keep_count = 0
while oidx < len(old) and nidx < len(new):
if old[oidx] == new[nidx]:
keep_count += 1
oidx += 1
nidx += 1
continue
if keep_count > 0:
update.append(UpdKeep(keep_count))
keep_count = 0
del_count = 0
rest = new[nidx:]
while oidx < len(old) and old[oidx] not in rest:
del_count += 1
oidx += 1
if del_count:
update.append(UpdDel(del_count))
continue
insert_items = []
rest = old[oidx:]
while nidx < len(new) and new[nidx] not in rest:
insert_items.append(new[nidx])
nidx += 1
update.append(UpdIns(insert_items))
# Diff any remaining
if keep_count > 0:
update.append(UpdKeep(keep_count))
if oidx < len(old):
update.append(UpdDel(len(old) - oidx))
elif nidx < len(new):
update.append(UpdIns(new[nidx:]))
return msgspec.json.encode({"update": update}).decode()
async def broadcast(msg):
def format_space(usage):
return msgspec.json.encode({"space": usage}).decode()
def format_root(root):
return msgspec.json.encode({"root": root}).decode()
def broadcast(msg, loop):
return asyncio.run_coroutine_threadsafe(abroadcast(msg), loop).result()
async def abroadcast(msg):
try:
for queue in pubsub.values():
queue.put_nowait(msg)
@@ -223,8 +337,9 @@ async def broadcast(msg):
async def start(app, loop):
config.load_config()
use_inotify = False and sys.platform == "linux"
app.ctx.watcher = threading.Thread(
target=watcher_thread if sys.platform == "linux" else watcher_thread_poll,
target=watcher_thread if use_inotify else watcher_thread_poll,
args=[loop],
)
app.ctx.watcher.start()