Conversion of User Guide to the SHH stack (#2781)

This commit is contained in:
Adam Hopkins
2023-09-06 15:44:00 +03:00
committed by GitHub
parent 47215d4635
commit d255d1aae1
332 changed files with 51495 additions and 2013 deletions

View File

View File

@@ -0,0 +1,68 @@
from contextlib import contextmanager
from urllib.parse import unquote
from webapp.display.search.search import Searcher
from html5tagger import Builder, E # type: ignore
from sanic import Request
from ..base import BaseRenderer
from ..layouts.main import MainLayout
class SearchRenderer(BaseRenderer):
    """Renderer for the search page: a heading plus the list of hits."""

    def render(
        self, request: Request, language: str, searcher: Searcher, full: bool
    ) -> Builder:
        """Build the search page for ``request`` and return the builder.

        The builder itself is created in full mode only when the request has
        no ``HX-Request`` header; the ``full`` argument is passed on to the
        page body / layout.
        """
        has_no_hx_header = request.headers.get("HX-Request") is None
        builder = self.get_builder(full=has_no_hx_header, language=language)
        self._body(request, builder, language, searcher, full)
        return builder

    def _body(
        self,
        request: Request,
        builder: Builder,
        language: str,
        searcher: Searcher,
        full: bool,
    ):
        """Emit the heading and the results inside the base layout."""
        with self._base(request, builder, full):
            builder.h1("Search")
            self._results(request, builder, searcher, language)

    @contextmanager
    def _base(self, request: Request, builder: Builder, full: bool):
        """Context manager that wraps the page content in ``MainLayout``."""
        with MainLayout(builder)(request, full):
            yield

    def _results(
        self,
        request: Request,
        builder: Builder,
        searcher: Searcher,
        language: str,
    ):
        """Run the search for the ``q`` request argument and list the hits.

        Writes a "No results found" paragraph when the query is empty or the
        searcher returns nothing.
        """
        query = unquote(request.args.get("q", ""))
        results = searcher.search(query, language)
        if not (query and results):
            builder.p("No results found")
            return

        with builder.div(class_="container"), builder.ul():
            # Each result is a (score, document) pair; the score is not shown.
            for _, doc in results:
                target = f"/{doc.page.relative_path}"
                link = E.a(
                    doc.title,
                    href=target,
                    hx_get=target,
                    hx_target="#content",
                    hx_swap="innerHTML",
                    hx_push_url="true",
                )
                builder.li(link, f" - {doc.page.relative_path}")

View File

@@ -0,0 +1,175 @@
from __future__ import annotations
from collections import Counter
from pathlib import Path
from typing import ClassVar
from msgspec import Struct
from webapp.display.page import Page
class Stemmer:
    """Very small suffix-stripping stemmer used to normalise search terms.

    Stop words and bare prefixes are returned untouched. For any other word
    the longest known suffix is removed (as long as something is left), and
    the remainder is trimmed of a trailing "e" or consonant + "y".
    """

    STOP_WORDS: ClassVar[set[str]] = set(
        "a about above after again against all am an and any are aren't as at be because been before being below between both but by can't cannot could couldn't did didn't do does doesn't doing don't down during each few for from further had hadn't has hasn't have haven't having he he'd he'll he's her here here's hers herself him himself his how how's i i'd i'll i'm i've if in into is isn't it it's its itself let's me more most mustn't my myself no nor not of off on once only or other ought our ours ourselves out over own same shan't she she'd she'll she's should shouldn't so some such than that that's the their theirs them themselves then there there's these they they'd they'll they're they've this those through to too under until up very was wasn't we we'd we'll we're we've were weren't what what's when when's where where's which while who who's whom why why's with won't would wouldn't you you'd you'll you're you've your yours yourself yourselves".split()  # noqa: E501
    )
    PREFIXES = set("auto be fore over re un under".split())
    SUFFIXES = set(
        "able al ance ant ate ed en er ful hood ing ion ish ity ive ize less ly ment ness ous ship sion tion y".split()  # noqa: E501
    )
    VOWELS = set("aeiou")
    PLURALIZATION = set("s es ies".split())

    def stem(self, word: str) -> str:
        """Return the stem of ``word``.

        Args:
            word: A single token (callers in this file pass lower-cased text).

        Returns:
            ``word`` unchanged if it is a stop word, a bare prefix, or has no
            strippable suffix; otherwise the trimmed remainder.
        """
        if word in self.STOP_WORDS or word in self.PREFIXES:
            return word

        # Collect every suffix that matches while leaving a non-empty
        # remainder, so words such as "ship" or "able" are not stemmed to "".
        matches = [
            suffix
            for suffix in self.SUFFIXES | self.PLURALIZATION
            if len(word) > len(suffix) and word.endswith(suffix)
        ]
        if not matches:
            return word

        # Set iteration order varies between processes, so pick the longest
        # match explicitly. Two distinct suffixes of equal length can never
        # both match, which makes the choice fully deterministic.
        longest = max(matches, key=len)
        return self._stem(word[: -len(longest)])

    def _stem(self, word: str) -> str:
        """Trim a trailing "e" or consonant + "y" from a stripped remainder."""
        # Nothing sensible to trim from an empty or one-letter remainder;
        # this also keeps the ``word[-2]`` lookup below in range.
        if len(word) < 2:
            return word
        if word.endswith("e"):
            return word[:-1]
        if word.endswith("y") and word[-2] not in self.VOWELS:
            return word[:-1]
        return word

    def __call__(self, word: str) -> str:
        """Alias for :meth:`stem` so an instance can be used as a function."""
        return self.stem(word)
class Document(Struct, kw_only=True):
    """A page prepared for searching, with its relative term frequencies."""

    # How many times the title / body are repeated in ``weighted_text``.
    TITLE_WEIGHT: ClassVar[int] = 3
    BODY_WEIGHT: ClassVar[int] = 1

    page: Page
    language: str
    # Filled in by ``process``: stemmed term -> share of all counted terms.
    term_frequency: dict[str, float] = {}

    @property
    def title(self) -> str:
        """Title taken from the page metadata."""
        return self.page.meta.title

    @property
    def text(self) -> str:
        """Content of the underlying page."""
        return self.page.content

    @property
    def weighted_text(self) -> str:
        """Title and body joined by spaces, each repeated by its weight."""
        title_copies = [self.title] * self.TITLE_WEIGHT
        body_copies = [self.text] * self.BODY_WEIGHT
        return " ".join(title_copies + body_copies)

    def _term_frequency(self, stemmer: Stemmer) -> None:
        """Store each stemmed term's share of the non-stop-word tokens."""
        tokens = self.weighted_text.lower().split()
        tally = Counter(
            stemmer(token)
            for token in tokens
            if token not in Stemmer.STOP_WORDS
        )
        # Equals the number of tokens that survived the stop-word filter.
        total = sum(tally.values())
        self.term_frequency = {
            term: occurrences / total for term, occurrences in tally.items()
        }

    def process(self, stemmer: Stemmer) -> Document:
        """Compute the term frequencies and return this document."""
        self._term_frequency(stemmer)
        return self
def _inverse_document_frequency(docs: list[Document]) -> dict[str, float]:
    """Map each term to ``len(docs)`` divided by the number of docs using it.

    A document counts once per term, regardless of how often the term occurs
    in it. No logarithm is applied to the ratio.
    """
    total_docs = len(docs)
    docs_per_term: Counter[str] = Counter(
        term for doc in docs for term in doc.term_frequency
    )
    return {
        term: total_docs / doc_hits for term, doc_hits in docs_per_term.items()
    }
def _tf_idf_vector(
    document: Document, idf: dict[str, float]
) -> dict[str, float]:
    """Weight the document's term frequencies by ``idf``.

    Terms that have no entry in ``idf`` are left out of the result.
    """
    weighted: dict[str, float] = {}
    for term, frequency in document.term_frequency.items():
        if term in idf:
            weighted[term] = frequency * idf[term]
    return weighted
def _cosine_similarity(
    vec1: dict[str, float], vec2: dict[str, float]
) -> float:
    """Return the cosine similarity of two sparse term -> weight vectors.

    Terms missing from a vector count as weight 0. The result is ``0.0`` when
    either vector is empty or has zero magnitude, so the function never
    divides by zero.
    """
    if not vec1 or not vec2:
        return 0.0

    magnitude1 = sum(value**2 for value in vec1.values()) ** 0.5
    magnitude2 = sum(value**2 for value in vec2.values()) ** 0.5
    denominator = magnitude1 * magnitude2
    # Non-empty vectors can still have zero length (all-zero weights or
    # underflow); treat them as having nothing in common.
    if denominator == 0:
        return 0.0

    dot_product = sum(
        weight * vec2.get(word, 0) for word, weight in vec1.items()
    )
    return dot_product / denominator
def _search(
    query: str,
    language: str,
    vectors: list[dict[str, float]],
    idf: dict[str, float],
    documents: list[Document],
    stemmer: Stemmer,
) -> list[tuple[float, Document]]:
    """Rank ``documents`` against ``query`` by cosine similarity.

    ``vectors`` is paired positionally with ``documents``. At most the ten
    highest-scoring pairs are considered, and of those only the ones with a
    score above zero are returned, best first.
    """
    # Run the query through the same pipeline as an indexed page by wrapping
    # it in a throwaway Page/Document.
    # NOTE(review): assumes Page's second positional argument is the content.
    query_document = Document(page=Page(Path(), query), language=language)
    query_vector = _tf_idf_vector(query_document.process(stemmer), idf)

    scores = [_cosine_similarity(query_vector, vector) for vector in vectors]
    ranked = sorted(
        zip(scores, documents),
        key=lambda pair: pair[0],
        reverse=True,
    )
    top_ten = ranked[:10]
    return [(score, document) for score, document in top_ten if score > 0]
class Searcher:
    """Per-language TF-IDF index over a fixed collection of documents."""

    def __init__(
        self,
        stemmer: Stemmer,
        documents: list[Document],
    ):
        """Group ``documents`` by language and precompute their vectors.

        Args:
            stemmer: Stemmer handed to ``_search`` for processing queries.
            documents: Documents to index; their ``term_frequency`` is read
                as-is, so they are expected to be processed already.
        """
        self._documents: dict[str, list[Document]] = {}
        for document in documents:
            self._documents.setdefault(document.language, []).append(document)

        # One IDF table per language, built from that language's documents.
        self._idf = {
            language: _inverse_document_frequency(language_documents)
            for language, language_documents in self._documents.items()
        }
        # Vectors are stored in the same order as ``self._documents[language]``
        # so that ``_search`` can pair them positionally.
        self._vectors = {
            language: [
                _tf_idf_vector(document, self._idf[language])
                for document in language_documents
            ]
            for language, language_documents in self._documents.items()
        }
        self._stemmer = stemmer

    def search(
        self, query: str, language: str
    ) -> list[tuple[float, Document]]:
        """Return ``(score, document)`` pairs for ``query`` in ``language``.

        A language with no indexed documents yields an empty list instead of
        raising ``KeyError``.
        """
        documents = self._documents.get(language)
        if not documents:
            return []
        return _search(
            query,
            language,
            self._vectors[language],
            self._idf[language],
            documents,
            self._stemmer,
        )