cista-storage/cista/app.py

import asyncio
import mimetypes
from concurrent.futures import ThreadPoolExecutor
from importlib.resources import files
from urllib.parse import unquote
from wsgiref.handlers import format_date_time

import brotli
import sanic.helpers
from blake3 import blake3
from sanic import Blueprint, Sanic, empty, raw
from sanic.exceptions import Forbidden, NotFound
from sanic.log import logging

from cista import auth, config, session, watching
from cista.api import bp
from cista.protocol import DirEntry
from cista.util.apphelpers import handle_sanic_exception

# Workaround until Sanic PR #2824 is merged
# This replaces Sanic's private _ENTITY_HEADERS set with an empty one.
# NOTE(review): presumably this stops Sanic from stripping entity headers
# (such as content-type) from some responses -- confirm against the PR.
sanic.helpers._ENTITY_HEADERS = frozenset()

# The application object; strict_slashes is enabled for all routes.
app = Sanic("cista", strict_slashes=True)
# Register the blueprints exported by cista.auth and cista.api.
app.blueprint(auth.bp)
app.blueprint(bp)
# Route every exception type through the shared handler.
app.exception(Exception)(handle_sanic_exception)


@app.before_server_start
async def main_start(app, loop):
    """Prepare shared state before the server starts accepting requests.

    Loads the configuration, starts the ``watching`` module and creates the
    thread pool that is stored on ``app.ctx.threadexec``.
    """
    config.load_config()
    await watching.start(app, loop)
    # Pool used for blocking work that must not run on the event loop
    # (zip_download hands its worker to this executor).
    app.ctx.threadexec = ThreadPoolExecutor(max_workers=8)


@app.after_server_stop
async def main_stop(app, loop):
    """Tear down what main_start set up once the server has stopped.

    Stops the ``watching`` module and shuts down the thread pool held in
    ``app.ctx.threadexec``.
    """
    await watching.stop(app, loop)
    app.ctx.threadexec.shutdown()


@app.on_request
async def use_session(req):
    """Attach session/user info to the request and apply CSRF protection.

    Sets ``req.ctx.session`` from ``session.get(req)`` and, when the session
    names a configured user, ``req.ctx.username`` and ``req.ctx.user``;
    otherwise both are set to ``None``.

    Raises:
        Forbidden: for non-GET requests and websocket upgrades whose Origin
            header is present but does not match the request host.
    """
    req.ctx.session = session.get(req)
    try:
        req.ctx.username = req.ctx.session["username"]
        req.ctx.user = config.config.users[req.ctx.session["username"]]  # type: ignore
    except (AttributeError, KeyError, TypeError):
        # No session, no username in it, or the user is not configured.
        req.ctx.username = None
        req.ctx.user = None
    # CSRF protection
    if req.method == "GET" and req.headers.upgrade != "websocket":
        return  # Ordinary GET requests are fine
    # Check that origin matches host, for browsers which should all send Origin.
    # Curl doesn't send any Origin header, so we allow it anyway.
    origin = req.headers.origin
    if origin:
        # partition() never raises: an Origin without "//" (e.g. the literal
        # "null" that browsers send from opaque origins) yields an empty
        # separator and is rejected instead of crashing with IndexError.
        _scheme, sep, origin_host = origin.partition("//")
        if not sep or origin_host != req.host:
            raise Forbidden("Invalid origin: Cross-Site requests not permitted")


@app.before_server_start
def http_fileserver(app, _):
    """Mount the configured storage path under /files/ behind auth.verify."""
    fileserver = Blueprint("fileserver")
    # Every request to this blueprint goes through auth.verify first.
    fileserver.on_request(auth.verify)
    static_options = {
        "use_content_range": True,
        "stream_large_files": True,
        "directory_view": True,
    }
    fileserver.static("/files/", config.config.path, **static_options)
    app.blueprint(fileserver)


# In-memory cache of frontend files, replaced wholesale by load_wwwroot().
# Maps the URL-relative name ("" for index.html) to a tuple
# (raw bytes, brotli bytes or False, response headers dict).
www = {}


@app.before_server_start
async def load_wwwroot(*_ignored):
    """Rebuild the global ``www`` cache in the loop's default executor.

    Accepts and ignores any positional arguments so it can serve both as a
    server-start listener and as a plain coroutine call.
    """
    global www
    loop = asyncio.get_event_loop()
    # The scan reads and compresses files, so keep it off the event loop.
    www = await loop.run_in_executor(None, _load_wwwroot, www)


def _load_wwwroot(www):
    """Scan the packaged ``wwwroot`` directory and return a new cache dict.

    Args:
        www: The previous cache; entries whose etag is unchanged are reused
            as-is instead of being rebuilt and recompressed.

    Returns:
        A dict mapping each file's name relative to wwwroot ("" for
        index.html) to ``(data, br, headers)`` where ``br`` is the Brotli
        compressed bytes, or ``False`` when compression does not shrink
        the file.
    """
    root = files("cista") / "wwwroot"
    fresh = {}
    pending = ["."]
    while pending:
        folder = root / pending.pop(0)
        for entry in folder.iterdir():
            if entry.is_dir():
                # Queue subdirectories for a later pass.
                pending.append(folder / entry.parts[-1])
                continue
            name = entry.relative_to(root).as_posix()
            mime = mimetypes.guess_type(name)[0] or "application/octet-stream"
            mtime = entry.stat().st_mtime
            data = entry.read_bytes()
            etag = blake3(data).hexdigest(length=8)
            # The index page is served at the root path.
            if name == "index.html":
                name = ""
            # Keep the previous entry when the content hash is unchanged.
            previous = www.get(name)
            if previous is not None and previous[2]["etag"] == etag:
                fresh[name] = previous
                continue
            # Text types get an explicit charset.
            content_type = f"{mime}; charset=UTF-8" if mime.startswith("text/") else mime
            # Asset file names change whenever their content changes, so
            # they may be cached indefinitely; everything else revalidates.
            if name.startswith("assets/"):
                cache_control = "max-age=31536000, immutable"
            else:
                cache_control = "no-cache"
            headers = {
                "etag": etag,
                "last-modified": format_date_time(mtime),
                "cache-control": cache_control,
                "content-type": content_type,
            }
            # Precompress with Brotli, keeping it only when it is smaller.
            compressed = brotli.compress(data)
            br = compressed if len(compressed) < len(data) else False
            fresh[name] = data, br, headers
    return fresh


@app.add_task
async def refresh_wwwroot():
    """Reload the wwwroot cache and print a summary of what changed.

    Runs once when ``app.debug`` is false; otherwise repeats every half
    second. Any exception raised while reloading is printed, not raised.
    """
    while True:
        try:
            previous = www
            await load_wwwroot()
            report = []
            for name in sorted(www):
                entry = www[name]
                if previous.get(name) != entry:
                    meta = entry[2]
                    report.append(f"{meta['last-modified']} {meta['etag']} /{name}\n")
            removed = sorted(set(previous) - set(www))
            report.extend(f"Deleted /{name}\n" for name in removed)
            changes = "".join(report)
            if changes:
                print(f"Updated wwwroot:\n{changes}", end="", flush=True)
        except Exception as e:
            print("Error loading wwwroot", e)
        if not app.debug:
            return
        await asyncio.sleep(0.5)


def _accepts_brotli(accept_encoding):
    """Return True if an Accept-Encoding header value permits ``br``.

    Entries are split on "," with optional surrounding whitespace, and any
    ";q=..." parameter is honoured: a zero or unparsable weight counts as
    not accepted.
    """
    for item in accept_encoding.split(","):
        coding, _, params = item.partition(";")
        if coding.strip().lower() != "br":
            continue
        for param in params.split(";"):
            key, _, value = param.partition("=")
            if key.strip().lower() == "q":
                try:
                    return float(value) > 0
                except ValueError:
                    return False
        return True
    return False


@app.route("/<path:path>", methods=["GET", "HEAD"])
async def wwwroot(req, path=""):
    """Frontend files only

    Serves a file from the in-memory ``www`` cache. Responds 304 when the
    request's If-None-Match equals the stored etag, and sends the Brotli
    precompressed body when one exists and the client accepts ``br``.

    Raises:
        NotFound: when the unquoted path is not in the cache.
    """
    name = unquote(path)
    if name not in www:
        raise NotFound(f"File not found: /{path}", extra={"name": name})
    data, br, headers = www[name]
    if req.headers.if_none_match == headers["etag"]:
        # The client has it cached, respond 304 Not Modified
        return empty(304, headers=headers)
    # Brotli compressed? Parse the header properly: clients may send
    # "gzip,br" without spaces or attach weights such as "br;q=1.0".
    if br and _accepts_brotli(req.headers.accept_encoding):
        # Copy so the cached headers dict is not mutated.
        headers = {
            **headers,
            "content-encoding": "br",
        }
        data = br
    return raw(data, headers=headers)


import datetime
from collections import deque
from pathlib import Path
from stat import S_IFREG

from stream_zip import ZIP_AUTO, stream_zip


@app.get("/zip/<keys>/<zipfile:ext=zip>")
async def zip_download(req, keys, zipfile, ext):
    """Download a zip archive of the given keys.

    Args:
        req: The incoming request; the archive is streamed on its response.
        keys: "+"-separated entry keys. A key matching a directory selects
            everything below it.
        zipfile: Archive name from the URL (only used in error context).
        ext: File extension captured by the route (unused).

    Raises:
        NotFound: when no files match, when some requested keys are not in
            the tree, or when a matched path is not a regular file on disk.
    """
    wanted = set(keys.split("+"))
    with watching.tree_lock:
        # Each queue item is (path parts from the root, path parts inside the
        # archive or None when no ancestor was selected, directory mapping).
        q = deque([([], None, watching.tree[""].dir)])
        files = []
        while q:
            locpar, relpar, d = q.pop()
            for name, attr in d.items():
                loc = [*locpar, name]
                rel = None
                if relpar or attr.key in wanted:
                    rel = [*relpar, name] if relpar else [name]
                    # discard, not remove: children of a selected directory
                    # reach this branch via relpar without their own key
                    # being in `wanted`, and remove() would raise KeyError.
                    wanted.discard(attr.key)
                if isinstance(attr, DirEntry):
                    q.append((loc, rel, attr.dir))
                elif rel:
                    files.append(
                        (
                            "/".join(rel),
                            Path(watching.rootpath.joinpath(*loc)),
                            attr.mtime,
                            attr.size,
                        )
                    )

    if not files:
        raise NotFound(
            "No files found",
            context={"keys": keys, "zipfile": zipfile, "wanted": wanted},
        )
    # Anything still in `wanted` was requested but never seen in the tree.
    if wanted:
        raise NotFound("Files not found", context={"missing": wanted})

    # Fail before any response bytes are sent if a file has disappeared.
    for rel, p, _mtime, _size in files:
        if not p.is_file():
            raise NotFound(f"File not found {rel}")

    def local_files(files):
        """Yield one stream_zip member tuple per collected file."""
        for rel, p, mtime, size in files:
            modified = datetime.datetime.fromtimestamp(mtime, datetime.UTC)
            yield rel, modified, S_IFREG | 0o644, ZIP_AUTO(size), contents(p)

    def contents(name):
        """Yield the file's bytes in 64 KiB chunks."""
        with name.open("rb") as f:
            while chunk := f.read(65536):
                yield chunk

    def worker():
        """Produce the archive in a thread, handing chunks to the event loop."""
        try:
            for chunk in stream_zip(local_files(files)):
                # NOTE(review): the future returned here is not waited on, so
                # the worker does not block until the put completes -- confirm
                # whether the maxsize=1 queue was meant to throttle it.
                asyncio.run_coroutine_threadsafe(queue.put(chunk), loop)
        except Exception:
            logging.exception("Error streaming ZIP")
            raise
        finally:
            # None marks the end of the stream for the consumer loop below.
            asyncio.run_coroutine_threadsafe(queue.put(None), loop)

    # Don't block the event loop: run in a thread
    queue = asyncio.Queue(maxsize=1)
    loop = asyncio.get_event_loop()
    thread = loop.run_in_executor(app.ctx.threadexec, worker)

    # Stream the response
    res = await req.respond(content_type="application/zip")
    while chunk := await queue.get():
        await res.send(chunk)

    await thread  # If it raises, the response will fail download
Ruff 2023-11-01 19:36:10 +00:00			`import asyncio`
Cleanup, bugfixes. Added access control on files and API. 2023-10-23 02:51:39 +01:00			`import mimetypes`
Zip download support, streaming. Needs cleanup 2023-11-07 23:30:35 +00:00			`from concurrent.futures import ThreadPoolExecutor`
Cleanup 2023-10-15 05:31:54 +01:00			`from importlib.resources import files`
Cleanup, bugfixes. Added access control on files and API. 2023-10-23 02:51:39 +01:00			`from urllib.parse import unquote`
Ruff 2023-11-01 19:36:10 +00:00			`from wsgiref.handlers import format_date_time`
Restructuring as a Python package. 2023-10-14 23:29:50 +01:00
Merged something 2023-11-01 14:03:17 +00:00			`import brotli`
Ruff 2023-11-01 19:36:10 +00:00			`import sanic.helpers`
Implemented HTTP caching and updates on wwwroot, much faster page loads. 2023-11-01 17:08:05 +00:00			`from blake3 import blake3`
Ruff 2023-11-01 19:36:10 +00:00			`from sanic import Blueprint, Sanic, empty, raw`
Cleanup, bugfixes. Added access control on files and API. 2023-10-23 02:51:39 +01:00			`from sanic.exceptions import Forbidden, NotFound`
Zip download support, streaming. Needs cleanup 2023-11-07 23:30:35 +00:00			`from sanic.log import logging`
Restructuring as a Python package. 2023-10-14 23:29:50 +01:00
Cleanup, bugfixes. Added access control on files and API. 2023-10-23 02:51:39 +01:00			`from cista import auth, config, session, watching`
			`from cista.api import bp`
Zip download support, streaming. Needs cleanup 2023-11-07 23:30:35 +00:00			`from cista.protocol import DirEntry`
Cleanup, bugfixes. Added access control on files and API. 2023-10-23 02:51:39 +01:00			`from cista.util.apphelpers import handle_sanic_exception`

Implemented HTTP caching and updates on wwwroot, much faster page loads. 2023-11-01 17:08:05 +00:00			`# Workaround until Sanic PR #2824 is merged`
			`sanic.helpers._ENTITY_HEADERS = frozenset()`

Cleanup, bugfixes. Added access control on files and API. 2023-10-23 02:51:39 +01:00			`app = Sanic("cista", strict_slashes=True)`
			`app.blueprint(auth.bp)`
			`app.blueprint(bp)`
			`app.exception(Exception)(handle_sanic_exception)`
Restructuring as a Python package. 2023-10-14 23:29:50 +01:00
Formatting and fix Internal Server Error on upload 2023-10-28 21:20:34 +01:00
Cleanup, bugfixes. Added access control on files and API. 2023-10-23 02:51:39 +01:00			`@app.before_server_start`
			`async def main_start(app, loop):`
			`config.load_config()`
			`await watching.start(app, loop)`
Zip download support, streaming. Needs cleanup 2023-11-07 23:30:35 +00:00			`app.ctx.threadexec = ThreadPoolExecutor(max_workers=8)`
Cleanup, bugfixes. Added access control on files and API. 2023-10-23 02:51:39 +01:00
Formatting and fix Internal Server Error on upload 2023-10-28 21:20:34 +01:00
Cleanup, bugfixes. Added access control on files and API. 2023-10-23 02:51:39 +01:00			`@app.after_server_stop`
			`async def main_stop(app, loop):`
			`await watching.stop(app, loop)`
Zip download support, streaming. Needs cleanup 2023-11-07 23:30:35 +00:00			`app.ctx.threadexec.shutdown()`
Restructuring as a Python package. 2023-10-14 23:29:50 +01:00
Formatting and fix Internal Server Error on upload 2023-10-28 21:20:34 +01:00
Implemented control commands and tests. Rewritten error and session/flash handling. 2023-10-21 02:44:43 +01:00			`@app.on_request`
Cleanup, bugfixes. Added access control on files and API. 2023-10-23 02:51:39 +01:00			`async def use_session(req):`
			`req.ctx.session = session.get(req)`
			`try:`
Connection status/error messages 2023-11-07 18:01:34 +00:00			`req.ctx.username = req.ctx.session["username"]`
Fix field name in session cookie; prevented logged in useds authenticating. 2023-10-23 23:47:57 +01:00			`req.ctx.user = config.config.users[req.ctx.session["username"]] # type: ignore`
Cleanup, bugfixes. Added access control on files and API. 2023-10-23 02:51:39 +01:00			`except (AttributeError, KeyError, TypeError):`
Connection status/error messages 2023-11-07 18:01:34 +00:00			`req.ctx.username = None`
Cleanup, bugfixes. Added access control on files and API. 2023-10-23 02:51:39 +01:00			`req.ctx.user = None`
Implemented control commands and tests. Rewritten error and session/flash handling. 2023-10-21 02:44:43 +01:00			`# CSRF protection`
Cleanup, bugfixes. Added access control on files and API. 2023-10-23 02:51:39 +01:00			`if req.method == "GET" and req.headers.upgrade != "websocket":`
Implemented control commands and tests. Rewritten error and session/flash handling. 2023-10-21 02:44:43 +01:00			`return # Ordinary GET requests are fine`
Cleanup, bugfixes. Added access control on files and API. 2023-10-23 02:51:39 +01:00			`# Check that origin matches host, for browsers which should all send Origin.`
			`# Curl doesn't send any Origin header, so we allow it anyway.`
			`origin = req.headers.origin`
			`if origin and origin.split("//", 1)[1] != req.host:`
Implemented control commands and tests. Rewritten error and session/flash handling. 2023-10-21 02:44:43 +01:00			`raise Forbidden("Invalid origin: Cross-Site requests not permitted")`

Formatting and fix Internal Server Error on upload 2023-10-28 21:20:34 +01:00
Restructuring as a Python package. 2023-10-14 23:29:50 +01:00			`@app.before_server_start`
Cleanup, bugfixes. Added access control on files and API. 2023-10-23 02:51:39 +01:00			`def http_fileserver(app, _):`
			`bp = Blueprint("fileserver")`
			`bp.on_request(auth.verify)`
Formatting and fix Internal Server Error on upload 2023-10-28 21:20:34 +01:00			`bp.static(`
			`"/files/",`
			`config.config.path,`
			`use_content_range=True,`
			`stream_large_files=True,`
			`directory_view=True,`
			`)`
Cleanup, bugfixes. Added access control on files and API. 2023-10-23 02:51:39 +01:00			`app.blueprint(bp)`

Formatting and fix Internal Server Error on upload 2023-10-28 21:20:34 +01:00
Merged something 2023-11-01 14:03:17 +00:00			`www = {}`


			`@app.before_server_start`
Implemented HTTP caching and updates on wwwroot, much faster page loads. 2023-11-01 17:08:05 +00:00			`async def load_wwwroot(*_ignored):`
Realtime updates of wwwroot files when --dev is used. 2023-11-01 14:53:57 +00:00			`global www`
Implemented HTTP caching and updates on wwwroot, much faster page loads. 2023-11-01 17:08:05 +00:00			`www = await asyncio.get_event_loop().run_in_executor(None, _load_wwwroot, www)`


			`def _load_wwwroot(www):`
Realtime updates of wwwroot files when --dev is used. 2023-11-01 14:53:57 +00:00			`wwwnew = {}`
Faster wwwroot serving, uses RAM cache of brotli compressed data for all assets. 2023-11-01 14:40:08 +00:00			`base = files("cista") / "wwwroot"`
			`paths = ["."]`
			`while paths:`
			`path = paths.pop(0)`
			`current = base / path`
Merged something 2023-11-01 14:03:17 +00:00			`for p in current.iterdir():`
			`if p.is_dir():`
			`paths.append(current / p.parts[-1])`
			`continue`
			`name = p.relative_to(base).as_posix()`
			`mime = mimetypes.guess_type(name)[0] or "application/octet-stream"`
Implemented HTTP caching and updates on wwwroot, much faster page loads. 2023-11-01 17:08:05 +00:00			`mtime = p.stat().st_mtime`
Merged something 2023-11-01 14:03:17 +00:00			`data = p.read_bytes()`
Implemented HTTP caching and updates on wwwroot, much faster page loads. 2023-11-01 17:08:05 +00:00			`etag = blake3(data).hexdigest(length=8)`
			`if name == "index.html":`
			`name = ""`
Realtime updates of wwwroot files when --dev is used. 2023-11-01 14:53:57 +00:00			`# Use old data if not changed`
Implemented HTTP caching and updates on wwwroot, much faster page loads. 2023-11-01 17:08:05 +00:00			`if name in www and www[name][2]["etag"] == etag:`
Realtime updates of wwwroot files when --dev is used. 2023-11-01 14:53:57 +00:00			`wwwnew[name] = www[name]`
			`continue`
Add charset=UTF-8 2023-11-01 17:32:48 +00:00			`# Add charset definition`
			`if mime.startswith("text/"):`
			`mime = f"{mime}; charset=UTF-8"`
			`# Asset files names will change whenever the content changes`
Implemented HTTP caching and updates on wwwroot, much faster page loads. 2023-11-01 17:08:05 +00:00			`cached = name.startswith("assets/")`
			`headers = {`
			`"etag": etag,`
			`"last-modified": format_date_time(mtime),`
			`"cache-control": "max-age=31536000, immutable"`
			`if cached`
			`else "no-cache",`
			`"content-type": mime,`
			`}`
Realtime updates of wwwroot files when --dev is used. 2023-11-01 14:53:57 +00:00			`# Precompress with Brotli`
Merged something 2023-11-01 14:03:17 +00:00			`br = brotli.compress(data)`
			`if len(br) >= len(data):`
			`br = False`
Implemented HTTP caching and updates on wwwroot, much faster page loads. 2023-11-01 17:08:05 +00:00			`wwwnew[name] = data, br, headers`
			`return wwwnew`
Realtime updates of wwwroot files when --dev is used. 2023-11-01 14:53:57 +00:00
Allow multiple commands on control socket without disconnecting. 2023-11-01 14:57:54 +00:00
Realtime updates of wwwroot files when --dev is used. 2023-11-01 14:53:57 +00:00			`@app.add_task`
			`async def refresh_wwwroot():`
Implemented HTTP caching and updates on wwwroot, much faster page loads. 2023-11-01 17:08:05 +00:00			`while True:`
			`try:`
			`wwwold = www`
			`await load_wwwroot()`
			`changes = ""`
			`for name in sorted(www):`
			`attr = www[name]`
			`if wwwold.get(name) == attr:`
			`continue`
			`headers = attr[2]`
			`changes += f"{headers['last-modified']} {headers['etag']} /{name}\n"`
			`for name in sorted(set(wwwold) - set(www)):`
			`changes += f"Deleted /{name}\n"`
			`if changes:`
			`print(f"Updated wwwroot:\n{changes}", end="", flush=True)`
			`except Exception as e:`
			`print("Error loading wwwroot", e)`
			`if not app.debug:`
			`return`
Realtime updates of wwwroot files when --dev is used. 2023-11-01 14:53:57 +00:00			`await asyncio.sleep(0.5)`
Merged something 2023-11-01 14:03:17 +00:00
Allow multiple commands on control socket without disconnecting. 2023-11-01 14:57:54 +00:00
Implemented HTTP caching and updates on wwwroot, much faster page loads. 2023-11-01 17:08:05 +00:00			`@app.route("/<path:path>", methods=["GET", "HEAD"])`
Cleanup, bugfixes. Added access control on files and API. 2023-10-23 02:51:39 +01:00			`async def wwwroot(req, path=""):`
			`"""Frontend files only"""`
Implemented HTTP caching and updates on wwwroot, much faster page loads. 2023-11-01 17:08:05 +00:00			`name = unquote(path)`
Faster wwwroot serving, uses RAM cache of brotli compressed data for all assets. 2023-11-01 14:40:08 +00:00			`if name not in www:`
			`raise NotFound(f"File not found: /{path}", extra={"name": name})`
Implemented HTTP caching and updates on wwwroot, much faster page loads. 2023-11-01 17:08:05 +00:00			`data, br, headers = www[name]`
			`if req.headers.if_none_match == headers["etag"]:`
			`# The client has it cached, respond 304 Not Modified`
			`return empty(304, headers=headers)`
Faster wwwroot serving, uses RAM cache of brotli compressed data for all assets. 2023-11-01 14:40:08 +00:00			`# Brotli compressed?`
			`if br and "br" in req.headers.accept_encoding.split(", "):`
Implemented HTTP caching and updates on wwwroot, much faster page loads. 2023-11-01 17:08:05 +00:00			`headers = {`
			`**headers,`
			`"content-encoding": "br",`
			`}`
Faster wwwroot serving, uses RAM cache of brotli compressed data for all assets. 2023-11-01 14:40:08 +00:00			`data = br`
Implemented HTTP caching and updates on wwwroot, much faster page loads. 2023-11-01 17:08:05 +00:00			`return raw(data, headers=headers)`
Zip download support, streaming. Needs cleanup 2023-11-07 23:30:35 +00:00

			`import datetime`
			`from collections import deque`
			`from pathlib import Path`
			`from stat import S_IFREG`

			`from stream_zip import ZIP_AUTO, stream_zip`


			`@app.get("/zip/<keys>/<zipfile:ext=zip>")`
			`async def zip_download(req, keys, zipfile, ext):`
			`"""Download a zip archive of the given keys"""`
			`wanted = set(keys.split("+"))`
			`with watching.tree_lock:`
			`q = deque([([], None, watching.tree[""].dir)])`
			`files = []`
			`while q:`
			`locpar, relpar, d = q.pop()`
			`for name, attr in d.items():`
			`loc = [*locpar, name]`
			`rel = None`
			`if relpar or attr.key in wanted:`
			`rel = [*relpar, name] if relpar else [name]`
			`wanted.remove(attr.key)`
			`if isinstance(attr, DirEntry):`
			`q.append((loc, rel, attr.dir))`
			`elif rel:`
			`files.append(`
			`(`
			`"/".join(rel),`
			`Path(watching.rootpath.joinpath(*loc)),`
			`attr.mtime,`
			`attr.size,`
			`)`
			`)`

			`if not files:`
			`raise NotFound(`
			`"No files found",`
			`context={"keys": keys, "zipfile": zipfile, "wanted": wanted},`
			`)`
			`if wanted:`
			`raise NotFound("Files not found", context={"missing": wanted})`

			`for rel, p, mtime, size in files:`
			`if not p.is_file():`
			`raise NotFound(f"File not found {rel}")`

			`def local_files(files):`
			`for rel, p, mtime, size in files:`
			`modified = datetime.datetime.fromtimestamp(mtime, datetime.UTC)`
			`yield rel, modified, S_IFREG \| 0o644, ZIP_AUTO(size), contents(p)`

			`def contents(name):`
			`with name.open("rb") as f:`
			`while chunk := f.read(65536):`
			`yield chunk`

			`def worker():`
			`try:`
			`for chunk in stream_zip(local_files(files)):`
			`asyncio.run_coroutine_threadsafe(queue.put(chunk), loop)`
			`except Exception:`
			`logging.exception("Error streaming ZIP")`
			`raise`
			`finally:`
			`asyncio.run_coroutine_threadsafe(queue.put(None), loop)`

			`# Don't block the event loop: run in a thread`
			`queue = asyncio.Queue(maxsize=1)`
			`loop = asyncio.get_event_loop()`
			`thread = loop.run_in_executor(app.ctx.threadexec, worker)`

			`# Stream the response`
			`res = await req.respond(content_type="application/zip")`
			`while chunk := await queue.get():`
			`await res.send(chunk)`

			`await thread # If it raises, the response will fail download`