Initial commit

This commit is contained in:
Leo Vasanko 2025-08-17 22:57:52 -06:00
commit dd3d0c63c4
7 changed files with 616 additions and 0 deletions

19
README.md Normal file
View File

@ -0,0 +1,19 @@
# Data Compression Benchmark
This project benchmarks the performance of various compression algorithms (zstd, brotli, gzip) using different compression levels on all files in a given directory.
You can easily see which algorithm and setting is best for your use case, benchmarking on the actual files you'll be compressing.
## Demo on a NodeJS project's static client files
![Compression](compression.png)
![Decompression](decompression.png)
## How to Run
The recommended way is to use [uv](https://github.com/astral-sh/uv) to run from git:
```fish
uvx --from git+https://git.zi.fi/LeoVasanko/compression-benchmark.git compression-benchmark /path/to/folder
```

329
bench.ipynb Normal file
View File

@ -0,0 +1,329 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "1883540a",
"metadata": {},
"outputs": [],
"source": [
"# /// script\n",
"# dependencies = [\n",
"# \"zstandard\",\n",
"# \"brotli\",\n",
"# \"matplotlib\",\n",
"# \"tqdm\",\n",
"# \"numpy\",\n",
"# ]\n",
"# ///\n",
"\n",
"import gc\n",
"import gzip\n",
"import time\n",
"from dataclasses import dataclass\n",
"from pathlib import Path\n",
"from typing import Callable\n",
"\n",
"import brotli\n",
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
"from tqdm.auto import tqdm\n",
"from zstandard import ZstdCompressor, ZstdDecompressor\n",
"\n",
"# What should we benchmark?\n",
"FOLDER = Path(\".\")\n",
"ROUNDS = 5\n",
"ALGS = {\n",
" \"zstd\": {\n",
" \"levels\": [1, 2, 3, 4, 5, 6, 7, 9, 10, 12, 14, 16, 22],\n",
" \"c\": lambda b, q: ZstdCompressor(level=q).compress(b),\n",
" \"d\": lambda b: ZstdDecompressor().decompress(b),\n",
" },\n",
" \"brotli\": {\n",
" \"levels\": [1, 2, 3, 4, 5, 7, 9, 11],\n",
" \"c\": lambda b, q: brotli.compress(b, quality=q),\n",
" \"d\": lambda b: brotli.decompress(b),\n",
" },\n",
" \"gzip\": {\n",
" \"levels\": [1, 3, 4, 5, 9],\n",
" \"c\": lambda b, q: gzip.compress(b, compresslevel=q),\n",
" \"d\": lambda b: gzip.decompress(b),\n",
" },\n",
"}\n",
"\n",
"# In case you were wondering:\n",
"# Sharing the ZstdCompressor across operations or using threads makes a little difference (with small files)\n",
"\n",
"files = [p.read_bytes() for p in FOLDER.rglob(\"*\") if p.is_file()]\n",
"orig_total = sum(len(b) for b in files)\n",
"print(f\"Loaded {len(files)} files, total {orig_total / 1e6:.2f} MB\")\n",
"assert files, \"No files found in the specified folder\"\n",
"\n",
"\n",
"@dataclass\n",
"class BenchPoint:\n",
" name: str\n",
" lvl: int\n",
" compress: Callable\n",
" decompress: Callable\n",
" compdata: list[bytes]\n",
" comp_time_ns: int = 0 # median compression time (ns)\n",
" decomp_time_ns: int = 0 # median decompression time (ns)\n",
"\n",
"\n",
"bench_points = [\n",
" BenchPoint(alg, lvl, spec[\"c\"], spec[\"d\"], [])\n",
" for alg, spec in ALGS.items()\n",
" for lvl in spec[\"levels\"]\n",
"]\n",
"P = len(bench_points)\n",
"C_ns = np.zeros((P, ROUNDS), dtype=np.int64) # compression time (ns)\n",
"D_ns = np.zeros((P, ROUNDS), dtype=np.int64) # decompression time (ns)\n",
"\n",
"# Benchmark\n",
"gc.disable()\n",
"time.time = lambda: 0 # make gzip deterministic (timestamp header)\n",
"progress = tqdm(range(ROUNDS), desc=\"Rounds\")\n",
"for r in progress:\n",
" for i, bp in enumerate(bench_points):\n",
" progress.set_postfix_str(f\"Compress {bp.name}/{bp.lvl}\")\n",
" for f, data in enumerate(files):\n",
" t0 = time.perf_counter_ns()\n",
" out = bp.compress(data, bp.lvl)\n",
" t1 = time.perf_counter_ns()\n",
" C_ns[i, r] += t1 - t0\n",
" if r == 0:\n",
" bp.compdata.append(out)\n",
" else:\n",
" assert out == bp.compdata[f], (\n",
" f\"Compressed data changed between rounds for {bp.name}, level={bp.lvl}\"\n",
" )\n",
" progress.set_postfix_str(f\"Decompress {bp.name}/{bp.lvl}\")\n",
" for file in bp.compdata:\n",
" t0 = time.perf_counter_ns()\n",
" _ = bp.decompress(file)\n",
" t1 = time.perf_counter_ns()\n",
" D_ns[i, r] += t1 - t0\n",
" progress.set_postfix_str(\"Benchmark Done\")\n",
"gc.enable()\n",
"\n",
"# Store median times directly in bench points\n",
"Cns, Dns = np.median(C_ns, 1), np.median(D_ns, 1)\n",
"for i, bp in enumerate(bench_points):\n",
" bp.comp_time_ns = int(Cns[i])\n",
" bp.decomp_time_ns = int(Dns[i])"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "12a0cf49",
"metadata": {},
"outputs": [],
"source": [
"def plot_xy(bench_points, time_attr, title, xlabel, ylabel, ylabel2):\n",
" fig, ax1 = plt.subplots(figsize=(9, 5), dpi=300)\n",
"\n",
" # Collect all y values (Mbit/s) and their corresponding x values for overlap detection\n",
" all_data_points = []\n",
"\n",
" for alg_name in ALGS:\n",
" alg_points = [bp for bp in bench_points if bp.name == alg_name]\n",
" comp_totals = np.array([sum(len(b) for b in bp.compdata) for bp in alg_points])\n",
" times_ns = np.array([getattr(bp, time_attr) for bp in alg_points])\n",
" levels = [bp.lvl for bp in alg_points]\n",
"\n",
" x = 100.0 * comp_totals / orig_total # compression percentages\n",
" y_mbps = orig_total * 8000.0 / times_ns # mbps - this is what we plot\n",
"\n",
" # Store all data points for overlap detection\n",
" for xx, yy in zip(x, y_mbps):\n",
" all_data_points.append((xx, yy))\n",
"\n",
" # Plot using Mbit/s (so faster operations are at the top)\n",
" ax1.plot(x, y_mbps, marker=\"s\", linewidth=1.25, label=alg_name, zorder=2)\n",
" levelstyle = {\n",
" \"ha\": \"center\",\n",
" \"va\": \"center\",\n",
" \"fontsize\": 7,\n",
" \"zorder\": 3,\n",
" \"bbox\": dict(\n",
" boxstyle=\"round,pad=0.2\",\n",
" facecolor=\"white\",\n",
" edgecolor=\"black\",\n",
" lw=0.4,\n",
" ),\n",
" }\n",
" for xx, yy, L in zip(x, y_mbps, levels):\n",
" ax1.text(xx, yy, str(L), **levelstyle)\n",
"\n",
" # Set up the axis\n",
" ax1.set_xlabel(xlabel)\n",
" ax1.set_ylabel(ylabel) # Compression/Decompression speed (Mbit/s)\n",
"\n",
" # Get plot dimensions\n",
" y_min, y_max = ax1.get_ylim()\n",
" x_min, x_max = ax1.get_xlim()\n",
"\n",
" # Extract unique y values and sort them\n",
" unique_y_values = sorted(set(point[1] for point in all_data_points))\n",
"\n",
" # Filter to y values within the plot range\n",
" visible_y_values = [y for y in unique_y_values if y_min <= y <= y_max]\n",
"\n",
" # Determine minimum spacing to avoid overlap (as a percentage of the y-range)\n",
" y_range = y_max - y_min\n",
" min_spacing = y_range * 0.05 # 5% of the y-range as minimum spacing\n",
"\n",
" # Select y values with sufficient spacing\n",
" selected_y_values = []\n",
" for y in visible_y_values:\n",
" # Check if this y value is far enough from already selected ones\n",
" if not selected_y_values or all(\n",
" abs(y - selected) >= min_spacing for selected in selected_y_values\n",
" ):\n",
" selected_y_values.append(y)\n",
"\n",
" # Function to format time labels\n",
" def format_time_label(ms):\n",
" if ms < 10:\n",
" return f\"{ms:.1f} ms\" # One decimal for <10ms\n",
" elif ms < 1000:\n",
" return f\"{ms:.0f} ms\" # No decimal for <1000ms\n",
" else:\n",
" seconds = ms / 1000\n",
" return f\"{seconds:.1f} s\" # Seconds with one decimal above 1000ms\n",
"\n",
" # Draw black lines from data points to right margin with labels\n",
" for y_mbps in selected_y_values:\n",
" # Find the rightmost x position for this y value\n",
" rightmost_x = max(\n",
" [point[0] for point in all_data_points if abs(point[1] - y_mbps) < 1e-6],\n",
" default=x_min,\n",
" )\n",
"\n",
" # Convert Mbit/s to milliseconds for the label\n",
" ms = orig_total * 8000.0 / (y_mbps * 1e6)\n",
"\n",
" # Draw a black line from the rightmost data point to the right margin\n",
" ax1.plot(\n",
" [rightmost_x, x_max],\n",
" [y_mbps, y_mbps],\n",
" color=\"black\",\n",
" linestyle=\"-\",\n",
" alpha=0.3,\n",
" linewidth=0.8,\n",
" zorder=1,\n",
" )\n",
"\n",
" # Add text label on the right side with improved formatting\n",
" ms_label = format_time_label(ms)\n",
"\n",
" ax1.text(\n",
" x_max,\n",
" y_mbps,\n",
" ms_label,\n",
" verticalalignment=\"center\",\n",
" horizontalalignment=\"left\",\n",
" fontsize=7,\n",
" color=\"black\",\n",
" alpha=0.8,\n",
" bbox=dict(\n",
" boxstyle=\"round,pad=0.1\", facecolor=\"white\", alpha=0.9, edgecolor=\"none\"\n",
" ),\n",
" )\n",
"\n",
" # Add the original fixed millisecond reference lines (only if they don't conflict)\n",
" ms_reference_values = [0.1, 0.2, 0.5, 1, 2, 5, 10, 20, 50, 100, 200, 500, 1000]\n",
"\n",
" for ms in ms_reference_values:\n",
" # Convert milliseconds to Mbit/s: mbps = orig_total * 8000.0 / (ms * 1e6)\n",
" mbps = orig_total * 8000.0 / (ms * 1e6)\n",
"\n",
" # Only draw the line if it falls within the current y-axis range and isn't too close to data points\n",
" if y_min <= mbps <= y_max:\n",
" # Check if this reference line is far enough from data points\n",
" if not any(abs(mbps - y) < min_spacing for y in selected_y_values):\n",
" ax1.axhline(\n",
" y=mbps,\n",
" color=\"gray\",\n",
" linestyle=\"--\",\n",
" alpha=0.3,\n",
" linewidth=0.5,\n",
" zorder=1,\n",
" )\n",
" # Add text label on the right side with improved formatting\n",
" ms_label = format_time_label(ms)\n",
" ax1.text(\n",
" x_max,\n",
" mbps,\n",
" ms_label,\n",
" verticalalignment=\"center\",\n",
" horizontalalignment=\"left\",\n",
" fontsize=7,\n",
" color=\"gray\",\n",
" alpha=0.6,\n",
" bbox=dict(\n",
" boxstyle=\"round,pad=0.1\",\n",
" facecolor=\"white\",\n",
" alpha=0.8,\n",
" edgecolor=\"none\",\n",
" ),\n",
" )\n",
"\n",
" ax1.set_title(title)\n",
" ax1.grid(True, linestyle=\":\", linewidth=0.7, alpha=0.6, zorder=0)\n",
" ax1.legend()\n",
" plt.tight_layout()\n",
" plt.show()\n",
"\n",
"\n",
"plot_xy(\n",
" bench_points,\n",
" \"comp_time_ns\",\n",
" f\"Compression — {len(files)} files, total {orig_total / 1_048_576:.2f} MiB (median of {ROUNDS} rounds)\",\n",
" \"Compressed size (% of original)\",\n",
" \"Compression speed (Mbit/s)\",\n",
" \"Time (ms)\",\n",
")\n",
"plot_xy(\n",
" bench_points,\n",
" \"decomp_time_ns\",\n",
" f\"Decompression — {len(files)} files, total {orig_total / 1_048_576:.2f} MiB (median of {ROUNDS} rounds)\",\n",
" \"Compressed size (% of original)\",\n",
" \"Decompression speed (Mbit/s)\",\n",
" \"Time (ms)\",\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a8a2bdcd",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.3"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

BIN
compression.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 234 KiB

View File

@ -0,0 +1,4 @@
from .bench import main
# Run the benchmark CLI only when this module is executed as a script,
# not when it is merely imported.
if __name__ == "__main__":
    main()

View File

@ -0,0 +1,245 @@
import argparse
import gc
import gzip
import time
from dataclasses import dataclass
from pathlib import Path
from typing import Callable
import brotli
import matplotlib.pyplot as plt
import numpy as np
from tqdm.auto import tqdm
from zstandard import ZstdCompressor, ZstdDecompressor
# Registry of the algorithms to benchmark, keyed by display name.
# Each entry provides:
#   "levels": the compression levels to measure,
#   "c":      compress(data, level) -> compressed bytes,
#   "d":      decompress(compressed) -> original bytes.
# The callables are lambdas so that a fresh compressor/decompressor is used
# for every call.
ALGS = {
    "zstd": {
        "levels": [1, 2, 3, 4, 5, 6, 7, 9, 10, 12, 14, 16, 22],
        "c": lambda b, q: ZstdCompressor(level=q).compress(b),
        "d": lambda b: ZstdDecompressor().decompress(b),
    },
    "brotli": {
        "levels": [1, 2, 3, 4, 5, 7, 9, 11],
        "c": lambda b, q: brotli.compress(b, quality=q),
        "d": lambda b: brotli.decompress(b),
    },
    "gzip": {
        "levels": [1, 3, 4, 5, 9],
        # mtime=0 pins the timestamp stored in the gzip header, so the output
        # is byte-for-byte reproducible between rounds without relying on a
        # patched time.time().
        "c": lambda b, q: gzip.compress(b, compresslevel=q, mtime=0),
        "d": lambda b: gzip.decompress(b),
    },
}
@dataclass
class BenchPoint:
    """One (algorithm, level) combination together with its measured results."""

    name: str  # algorithm key from ALGS, e.g. "zstd"
    lvl: int  # compression level, passed as the second argument to `compress`
    compress: Callable  # called as compress(data, lvl)
    decompress: Callable  # called as decompress(compressed_data)
    compdata: list  # compressed output of each input file, collected in round 0
    comp_time_ns: int = 0  # median compression time over all rounds (ns)
    decomp_time_ns: int = 0  # median decompression time over all rounds (ns)
def _format_time_label(ms):
    """Format a duration given in milliseconds as a short label."""
    if ms < 10:
        return f"{ms:.1f} ms"
    if ms < 1000:
        return f"{ms:.0f} ms"
    return f"{ms / 1000:.1f} s"


def plot_xy(
    bench_points, time_attr, title, xlabel, ylabel, ylabel2, orig_total, out_path
):
    """Plot speed versus compressed size for every algorithm and save it.

    One line is drawn per algorithm in ALGS, with one marker per level
    (annotated with the level number). Horizontal guide lines on the right
    translate selected speeds back into the total time they correspond to.

    Args:
        bench_points: Benchmark results (BenchPoint objects).
        time_attr: Name of the BenchPoint attribute holding the time in
            nanoseconds ("comp_time_ns" or "decomp_time_ns").
        title: Figure title.
        xlabel: X axis label (x is compressed size in % of the original).
        ylabel: Y axis label (y is throughput in Mbit/s).
        ylabel2: Currently unused; accepted for call compatibility.
        orig_total: Total uncompressed size of all inputs, in bytes.
        out_path: Destination passed to ``Figure.savefig``.
    """
    fig, ax1 = plt.subplots(figsize=(9, 5), dpi=300)
    try:
        # Style of the small level-number boxes; identical for every marker.
        level_style = {
            "ha": "center",
            "va": "center",
            "fontsize": 7,
            "zorder": 3,
            "bbox": dict(
                boxstyle="round,pad=0.2",
                facecolor="white",
                edgecolor="black",
                lw=0.4,
            ),
        }

        # (x, y) of every marker across all algorithms, used for guide lines.
        all_data_points = []
        for alg_name in ALGS:
            alg_points = [bp for bp in bench_points if bp.name == alg_name]
            comp_totals = np.array(
                [sum(len(b) for b in bp.compdata) for bp in alg_points]
            )
            times_ns = np.array([getattr(bp, time_attr) for bp in alg_points])
            x = 100.0 * comp_totals / orig_total
            # bytes * 8 bits / ns, scaled by 1000, gives Mbit/s.
            y_mbps = orig_total * 8000.0 / times_ns
            all_data_points.extend(zip(x, y_mbps))
            ax1.plot(x, y_mbps, marker="s", linewidth=1.25, label=alg_name, zorder=2)
            for xx, yy, bp in zip(x, y_mbps, alg_points):
                ax1.text(xx, yy, str(bp.lvl), **level_style)

        ax1.set_xlabel(xlabel)
        ax1.set_ylabel(ylabel)
        y_min, y_max = ax1.get_ylim()
        x_min, x_max = ax1.get_xlim()

        # Pick data speeds (ascending) that are at least 5% of the y-range
        # apart so their time labels do not overlap.
        min_spacing = (y_max - y_min) * 0.05
        selected_y_values = []
        for y in sorted({point[1] for point in all_data_points}):
            if not y_min <= y <= y_max:
                continue
            if all(abs(y - chosen) >= min_spacing for chosen in selected_y_values):
                selected_y_values.append(y)

        # Solid guide from the rightmost marker at each selected speed to the
        # right-hand side, labelled with the corresponding total time.
        for speed in selected_y_values:
            rightmost_x = max(
                [px for px, py in all_data_points if abs(py - speed) < 1e-6],
                default=x_min,
            )
            ms = orig_total * 8000.0 / (speed * 1e6)
            ax1.plot(
                [rightmost_x, x_max],
                [speed, speed],
                color="black",
                linestyle="-",
                alpha=0.3,
                linewidth=0.8,
                zorder=1,
            )
            ax1.text(
                x_max,
                speed,
                _format_time_label(ms),
                verticalalignment="center",
                horizontalalignment="left",
                fontsize=7,
                color="black",
                alpha=0.8,
                bbox=dict(
                    boxstyle="round,pad=0.1",
                    facecolor="white",
                    alpha=0.9,
                    edgecolor="none",
                ),
            )

        # Dashed reference lines at round time values, skipped when they would
        # sit too close to one of the data-derived guides above.
        ms_reference_values = [0.1, 0.2, 0.5, 1, 2, 5, 10, 20, 50, 100, 200, 500, 1000]
        for ms in ms_reference_values:
            mbps = orig_total * 8000.0 / (ms * 1e6)
            if not y_min <= mbps <= y_max:
                continue
            if any(abs(mbps - y) < min_spacing for y in selected_y_values):
                continue
            ax1.axhline(
                y=mbps,
                color="gray",
                linestyle="--",
                alpha=0.3,
                linewidth=0.5,
                zorder=1,
            )
            ax1.text(
                x_max,
                mbps,
                _format_time_label(ms),
                verticalalignment="center",
                horizontalalignment="left",
                fontsize=7,
                color="gray",
                alpha=0.6,
                bbox=dict(
                    boxstyle="round,pad=0.1",
                    facecolor="white",
                    alpha=0.8,
                    edgecolor="none",
                ),
            )

        ax1.set_title(title)
        ax1.grid(True, linestyle=":", linewidth=0.7, alpha=0.6, zorder=0)
        ax1.legend()
        fig.tight_layout()
        fig.savefig(out_path)
    finally:
        # Always release the figure, even if plotting or saving failed;
        # pyplot keeps every open figure alive until it is closed.
        plt.close(fig)
def run_benchmark(folder: Path, rounds: int):
    """Benchmark every algorithm/level in ALGS on all files under *folder*.

    All files are read into memory first. In each round every file is
    compressed and then decompressed with every (algorithm, level) pair; the
    per-round totals are accumulated in nanoseconds and the median over the
    rounds is stored on each BenchPoint.

    Args:
        folder: Directory that is searched recursively for input files.
        rounds: Number of benchmark rounds; must be at least 1.

    Returns:
        A tuple ``(bench_points, files, orig_total)``: the populated
        BenchPoint list, the list of file contents, and the total input size
        in bytes.

    Raises:
        ValueError: If *rounds* is less than 1.
        AssertionError: If no files are found, or if an algorithm produces
            different output for the same input in a later round.
    """
    if rounds < 1:
        raise ValueError(f"rounds must be at least 1, got {rounds}")

    files = [p.read_bytes() for p in folder.rglob("*") if p.is_file()]
    orig_total = sum(len(b) for b in files)
    print(f"Loaded {len(files)} files, total {orig_total / 1e6:.2f} MB")
    if not files:
        # Raised explicitly so the check survives `python -O`; the exception
        # type is kept for backward compatibility.
        raise AssertionError("No files found in the specified folder")

    bench_points = [
        BenchPoint(alg, lvl, spec["c"], spec["d"], [])
        for alg, spec in ALGS.items()
        for lvl in spec["levels"]
    ]
    num_points = len(bench_points)
    comp_ns = np.zeros((num_points, rounds), dtype=np.int64)
    decomp_ns = np.zeros((num_points, rounds), dtype=np.int64)

    gc_was_enabled = gc.isenabled()
    real_time = time.time
    gc.disable()  # keep collector pauses out of the measurements
    time.time = lambda: 0  # make gzip deterministic (timestamp header)
    try:
        progress = tqdm(range(rounds), desc="Rounds")
        for r in progress:
            for i, bp in enumerate(bench_points):
                progress.set_postfix_str(f"Compress {bp.name}/{bp.lvl}")
                for f, data in enumerate(files):
                    t0 = time.perf_counter_ns()
                    out = bp.compress(data, bp.lvl)
                    t1 = time.perf_counter_ns()
                    comp_ns[i, r] += t1 - t0
                    if r == 0:
                        bp.compdata.append(out)
                    else:
                        assert out == bp.compdata[f], (
                            f"Compressed data changed between rounds for {bp.name}, level={bp.lvl}"
                        )
                progress.set_postfix_str(f"Decompress {bp.name}/{bp.lvl}")
                for blob in bp.compdata:
                    t0 = time.perf_counter_ns()
                    _ = bp.decompress(blob)
                    t1 = time.perf_counter_ns()
                    decomp_ns[i, r] += t1 - t0
            progress.set_postfix_str("Benchmark Done")
    finally:
        # Undo the process-wide changes even when a round fails.
        time.time = real_time
        if gc_was_enabled:
            gc.enable()

    comp_median = np.median(comp_ns, 1)
    decomp_median = np.median(decomp_ns, 1)
    for i, bp in enumerate(bench_points):
        bp.comp_time_ns = int(comp_median[i])
        bp.decomp_time_ns = int(decomp_median[i])
    return bench_points, files, orig_total
def main(argv=None):
    """Command-line entry point: run the benchmark and save both plots.

    Args:
        argv: Argument list to parse instead of ``sys.argv[1:]``; ``None``
            (the default) uses the process arguments.

    Writes ``compression.png`` and ``decompression.png`` into the output
    directory. Exits with a usage error when the folder is not a directory
    or ``--rounds`` is less than 1.
    """
    parser = argparse.ArgumentParser(description="Compression benchmark.")
    parser.add_argument("folder", type=str, help="Folder to benchmark")
    parser.add_argument("--rounds", type=int, default=5, help="Number of rounds")
    parser.add_argument(
        "--outdir", type=str, default=".", help="Output directory for PNGs"
    )
    args = parser.parse_args(argv)

    folder = Path(args.folder)
    rounds = args.rounds
    # Report bad input as a usage error instead of a traceback further down.
    if not folder.is_dir():
        parser.error(f"not a directory: {folder}")
    if rounds < 1:
        parser.error("--rounds must be at least 1")

    outdir = Path(args.outdir)
    outdir.mkdir(parents=True, exist_ok=True)

    bench_points, files, orig_total = run_benchmark(folder, rounds)

    summary = (
        f"{len(files)} files, total {orig_total / 1_048_576:.2f} MiB "
        f"(median of {rounds} rounds)"
    )
    # The two plots differ only in the operation name and the time attribute.
    for operation, time_attr in (
        ("Compression", "comp_time_ns"),
        ("Decompression", "decomp_time_ns"),
    ):
        plot_xy(
            bench_points,
            time_attr,
            f"{operation} — {summary}",
            "Compressed size (% of original)",
            f"{operation} speed (Mbit/s)",
            "Time (ms)",
            orig_total,
            outdir / f"{operation.lower()}.png",
        )
    print(f"Saved plots to {outdir}")

BIN
decompression.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 216 KiB

19
pyproject.toml Normal file
View File

@ -0,0 +1,19 @@
[project]
name = "compression-benchmark"
version = "0.1.0"
description = "Benchmark compression algorithms (zstd, brotli, gzip) on your own files."
authors = [
{ name = "Leo Vasanko" }
]
license = { text = "MIT" }
readme = "README.md"
requires-python = ">=3.11"
dependencies = [
"zstandard",
"brotli",
"matplotlib",
"tqdm",
"numpy"
]
[project.scripts]
compression-benchmark = "compression_benchmark.bench:main"