Conversion of User Guide to the SHH stack (#2781)

2023-09-06 15:44:00 +03:00
parent 47215d4635
commit d255d1aae1
332 changed files with 51495 additions and 2013 deletions
--- a/guide/webapp/display/search/init.py
+++ b/guide/webapp/display/search/init.py
--- a/guide/webapp/display/search/renderer.py
+++ b/guide/webapp/display/search/renderer.py
@@ -0,0 +1,68 @@
+from contextlib import contextmanager
+from urllib.parse import unquote
+
+from webapp.display.search.search import Searcher
+
+from html5tagger import Builder, E  # type: ignore
+from sanic import Request
+
+from ..base import BaseRenderer
+from ..layouts.main import MainLayout
+
+
+class SearchRenderer(BaseRenderer):
+    def render(
+        self, request: Request, language: str, searcher: Searcher, full: bool
+    ) -> Builder:
+        builder = self.get_builder(
+            full=request.headers.get("HX-Request") is None,
+            language=language,
+        )
+        self._body(request, builder, language, searcher, full)
+        return builder
+
+    def _body(
+        self,
+        request: Request,
+        builder: Builder,
+        language: str,
+        searcher: Searcher,
+        full: bool,
+    ):
+        with self._base(request, builder, full):
+            builder.h1("Search")
+            self._results(request, builder, searcher, language)
+
+    @contextmanager
+    def _base(self, request: Request, builder: Builder, full: bool):
+        layout = MainLayout(builder)
+        with layout(request, full):
+            yield
+
+    def _results(
+        self,
+        request: Request,
+        builder: Builder,
+        searcher: Searcher,
+        language: str,
+    ):
+        query = unquote(request.args.get("q", ""))
+        results = searcher.search(query, language)
+        if not query or not results:
+            builder.p("No results found")
+            return
+
+        with builder.div(class_="container"):
+            with builder.ul():
+                for _, doc in results:
+                    builder.li(
+                        E.a(
+                            doc.title,
+                            href=f"/{doc.page.relative_path}",
+                            hx_get=f"/{doc.page.relative_path}",
+                            hx_target="#content",
+                            hx_swap="innerHTML",
+                            hx_push_url="true",
+                        ),
+                        f" - {doc.page.relative_path}",
+                    )
--- a/guide/webapp/display/search/search.py
+++ b/guide/webapp/display/search/search.py
@@ -0,0 +1,175 @@
+from __future__ import annotations
+
+from collections import Counter
+from pathlib import Path
+from typing import ClassVar
+
+from msgspec import Struct
+from webapp.display.page import Page
+
+
+class Stemmer:
+    STOP_WORDS: ClassVar[set[str]] = set(
+        "a about above after again against all am an and any are aren't as at be because been before being below between both but by can't cannot could couldn't did didn't do does doesn't doing don't down during each few for from further had hadn't has hasn't have haven't having he he'd he'll he's her here here's hers herself him himself his how how's i i'd i'll i'm i've if in into is isn't it it's its itself let's me more most mustn't my myself no nor not of off on once only or other ought our ours ourselves out over own same shan't she she'd she'll she's should shouldn't so some such than that that's the their theirs them themselves then there there's these they they'd they'll they're they've this those through to too under until up very was wasn't we we'd we'll we're we've were weren't what what's when when's where where's which while who who's whom why why's with won't would wouldn't you you'd you'll you're you've your yours yourself yourselves".split()  # noqa: E501
+    )
+    PREFIXES = set("auto be fore over re un under".split())
+    SUFFIXES = set(
+        "able al ance ant ate ed en er ful hood ing ion ish ity ive ize less ly ment ness ous ship sion tion y".split()  # noqa: E501
+    )
+    VOWELS = set("aeiou")
+    PLURALIZATION = set("s es ies".split())
+
+    def stem(self, word: str) -> str:
+        if word in self.STOP_WORDS:
+            return word
+        if word in self.PREFIXES:
+            return word
+        for suffix in self.SUFFIXES | self.PLURALIZATION:
+            if word.endswith(suffix):
+                return self._stem(word[: -len(suffix)])
+        return word
+
+    def _stem(self, word: str) -> str:
+        if word.endswith("e"):
+            return word[:-1]
+        if word.endswith("y") and word[-2] not in self.VOWELS:
+            return word[:-1]
+        return word
+
+    def __call__(self, word: str) -> str:
+        return self.stem(word)
+
+
+class Document(Struct, kw_only=True):
+    TITLE_WEIGHT: ClassVar[int] = 3
+    BODY_WEIGHT: ClassVar[int] = 1
+
+    page: Page
+    language: str
+    term_frequency: dict[str, float] = {}
+
+    @property
+    def title(self) -> str:
+        return self.page.meta.title
+
+    @property
+    def text(self) -> str:
+        return self.page.content
+
+    @property
+    def weighted_text(self) -> str:
+        """Return the text with the title repeated."""
+        return " ".join(
+            [self.title] * self.TITLE_WEIGHT + [self.text] * self.BODY_WEIGHT
+        )
+
+    def _term_frequency(self, stemmer: Stemmer) -> None:
+        """Count the number of times each word appears in the document."""
+        words = [
+            stemmer(word)
+            for word in self.weighted_text.lower().split()
+            if word not in Stemmer.STOP_WORDS
+        ]
+        num_words = len(words)
+        word_count = Counter(words)
+        self.term_frequency = {
+            word: count / num_words for word, count in word_count.items()
+        }
+
+    def process(self, stemmer: Stemmer) -> Document:
+        """Process the document."""
+        self._term_frequency(stemmer)
+        return self
+
+
+def _inverse_document_frequency(docs: list[Document]) -> dict[str, float]:
+    """Count the number of documents each word appears in."""
+    num_docs = len(docs)
+    word_count: Counter[str] = Counter()
+    for doc in docs:
+        word_count.update(doc.term_frequency.keys())
+    return {word: num_docs / count for word, count in word_count.items()}
+
+
+def _tf_idf_vector(
+    document: Document, idf: dict[str, float]
+) -> dict[str, float]:
+    """Calculate the TF-IDF vector for a document."""
+    return {
+        word: tf * idf[word]
+        for word, tf in document.term_frequency.items()
+        if word in idf
+    }
+
+
+def _cosine_similarity(
+    vec1: dict[str, float], vec2: dict[str, float]
+) -> float:
+    """Calculate the cosine similarity between two vectors."""
+    if not vec1 or not vec2:
+        return 0.0
+    dot_product = sum(vec1.get(word, 0) * vec2.get(word, 0) for word in vec1)
+    magnitude1 = sum(value**2 for value in vec1.values()) ** 0.5
+    magnitude2 = sum(value**2 for value in vec2.values()) ** 0.5
+    return dot_product / (magnitude1 * magnitude2)
+
+
+def _search(
+    query: str,
+    language: str,
+    vectors: list[dict[str, float]],
+    idf: dict[str, float],
+    documents: list[Document],
+    stemmer: Stemmer,
+) -> list[tuple[float, Document]]:
+    dummy_page = Page(Path(), query)
+    tf_idf_query = _tf_idf_vector(
+        Document(page=dummy_page, language=language).process(stemmer), idf
+    )
+    similarities = [
+        _cosine_similarity(tf_idf_query, vector) for vector in vectors
+    ]
+    return [
+        (similarity, document)
+        for similarity, document in sorted(
+            zip(similarities, documents),
+            reverse=True,
+            key=lambda pair: pair[0],
+        )[:10]
+        if similarity > 0
+    ]
+
+
+class Searcher:
+    def __init__(
+        self,
+        stemmer: Stemmer,
+        documents: list[Document],
+    ):
+        self._documents: dict[str, list[Document]] = {}
+        for document in documents:
+            self._documents.setdefault(document.language, []).append(document)
+        self._idf = {
+            language: _inverse_document_frequency(documents)
+            for language, documents in self._documents.items()
+        }
+        self._vectors = {
+            language: [
+                _tf_idf_vector(document, self._idf[language])
+                for document in documents
+            ]
+            for language, documents in self._documents.items()
+        }
+        self._stemmer = stemmer
+
+    def search(
+        self, query: str, language: str
+    ) -> list[tuple[float, Document]]:
+        return _search(
+            query,
+            language,
+            self._vectors[language],
+            self._idf[language],
+            self._documents[language],
+            self._stemmer,
+        )