Initial commit
commit dd3d0c63c4

README.md (new file)
@@ -0,0 +1,19 @@

# Data Compression Benchmark

This project benchmarks the performance of various compression algorithms (zstd, brotli, gzip) at different compression levels on all files in a given directory.

You can easily see which algorithm and setting is best for your use case by benchmarking on the actual files you'll be compressing.

## Demo on a NodeJS project's static client files

![Compression](compression.png)
![Decompression](decompression.png)

## How to Run

The recommended way is to use [uv](https://github.com/astral-sh/uv) to run from git:

```fish
uvx --from git+https://git.zi.fi/LeoVasanko/compression-benchmark.git compression-benchmark /path/to/folder
```
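The entry point also accepts `--rounds` (number of timing rounds, default 5) and `--outdir` (where the PNG plots are written), as defined in compression_benchmark/bench.py below. The same benchmark can be driven from Python as well; a minimal sketch, assuming the package and its dependencies are installed and using a placeholder folder path:

```python
# Minimal sketch (not part of this commit): programmatic use of the benchmark.
from pathlib import Path

from compression_benchmark.bench import run_benchmark

# run_benchmark reads every file under the folder, times each algorithm/level
# combination, and returns the per-level results plus the loaded files and
# their total size in bytes.
bench_points, files, orig_total = run_benchmark(Path("/path/to/folder"), rounds=3)

for bp in bench_points:
    compressed = sum(len(b) for b in bp.compdata)
    print(f"{bp.name} level {bp.lvl}: {100 * compressed / orig_total:.1f}% of original")
```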
bench.ipynb (new file)
@@ -0,0 +1,329 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1883540a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# /// script\n",
    "# dependencies = [\n",
    "#     \"zstandard\",\n",
    "#     \"brotli\",\n",
    "#     \"matplotlib\",\n",
    "#     \"tqdm\",\n",
    "#     \"numpy\",\n",
    "# ]\n",
    "# ///\n",
    "\n",
    "import gc\n",
    "import gzip\n",
    "import time\n",
    "from dataclasses import dataclass\n",
    "from pathlib import Path\n",
    "from typing import Callable\n",
    "\n",
    "import brotli\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "from tqdm.auto import tqdm\n",
    "from zstandard import ZstdCompressor, ZstdDecompressor\n",
    "\n",
    "# What should we benchmark?\n",
    "FOLDER = Path(\".\")\n",
    "ROUNDS = 5\n",
    "ALGS = {\n",
    "    \"zstd\": {\n",
    "        \"levels\": [1, 2, 3, 4, 5, 6, 7, 9, 10, 12, 14, 16, 22],\n",
    "        \"c\": lambda b, q: ZstdCompressor(level=q).compress(b),\n",
    "        \"d\": lambda b: ZstdDecompressor().decompress(b),\n",
    "    },\n",
    "    \"brotli\": {\n",
    "        \"levels\": [1, 2, 3, 4, 5, 7, 9, 11],\n",
    "        \"c\": lambda b, q: brotli.compress(b, quality=q),\n",
    "        \"d\": lambda b: brotli.decompress(b),\n",
    "    },\n",
    "    \"gzip\": {\n",
    "        \"levels\": [1, 3, 4, 5, 9],\n",
    "        \"c\": lambda b, q: gzip.compress(b, compresslevel=q),\n",
    "        \"d\": lambda b: gzip.decompress(b),\n",
    "    },\n",
    "}\n",
    "\n",
    "# In case you were wondering:\n",
    "# Sharing the ZstdCompressor across operations or using threads makes only a little difference (with small files)\n",
    "\n",
    "files = [p.read_bytes() for p in FOLDER.rglob(\"*\") if p.is_file()]\n",
    "orig_total = sum(len(b) for b in files)\n",
    "print(f\"Loaded {len(files)} files, total {orig_total / 1e6:.2f} MB\")\n",
    "assert files, \"No files found in the specified folder\"\n",
    "\n",
    "\n",
    "@dataclass\n",
    "class BenchPoint:\n",
    "    name: str\n",
    "    lvl: int\n",
    "    compress: Callable\n",
    "    decompress: Callable\n",
    "    compdata: list[bytes]\n",
    "    comp_time_ns: int = 0  # median compression time (ns)\n",
    "    decomp_time_ns: int = 0  # median decompression time (ns)\n",
    "\n",
    "\n",
    "bench_points = [\n",
    "    BenchPoint(alg, lvl, spec[\"c\"], spec[\"d\"], [])\n",
    "    for alg, spec in ALGS.items()\n",
    "    for lvl in spec[\"levels\"]\n",
    "]\n",
    "P = len(bench_points)\n",
    "C_ns = np.zeros((P, ROUNDS), dtype=np.int64)  # compression time (ns)\n",
    "D_ns = np.zeros((P, ROUNDS), dtype=np.int64)  # decompression time (ns)\n",
    "\n",
    "# Benchmark\n",
    "gc.disable()\n",
    "time.time = lambda: 0  # make gzip deterministic (timestamp header)\n",
    "progress = tqdm(range(ROUNDS), desc=\"Rounds\")\n",
    "for r in progress:\n",
    "    for i, bp in enumerate(bench_points):\n",
    "        progress.set_postfix_str(f\"Compress {bp.name}/{bp.lvl}\")\n",
    "        for f, data in enumerate(files):\n",
    "            t0 = time.perf_counter_ns()\n",
    "            out = bp.compress(data, bp.lvl)\n",
    "            t1 = time.perf_counter_ns()\n",
    "            C_ns[i, r] += t1 - t0\n",
    "            if r == 0:\n",
    "                bp.compdata.append(out)\n",
    "            else:\n",
    "                assert out == bp.compdata[f], (\n",
    "                    f\"Compressed data changed between rounds for {bp.name}, level={bp.lvl}\"\n",
    "                )\n",
    "        progress.set_postfix_str(f\"Decompress {bp.name}/{bp.lvl}\")\n",
    "        for file in bp.compdata:\n",
    "            t0 = time.perf_counter_ns()\n",
    "            _ = bp.decompress(file)\n",
    "            t1 = time.perf_counter_ns()\n",
    "            D_ns[i, r] += t1 - t0\n",
    "    progress.set_postfix_str(\"Benchmark Done\")\n",
    "gc.enable()\n",
    "\n",
    "# Store median times directly in bench points\n",
    "Cns, Dns = np.median(C_ns, 1), np.median(D_ns, 1)\n",
    "for i, bp in enumerate(bench_points):\n",
    "    bp.comp_time_ns = int(Cns[i])\n",
    "    bp.decomp_time_ns = int(Dns[i])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "12a0cf49",
   "metadata": {},
   "outputs": [],
   "source": [
    "def plot_xy(bench_points, time_attr, title, xlabel, ylabel, ylabel2):\n",
    "    fig, ax1 = plt.subplots(figsize=(9, 5), dpi=300)\n",
    "\n",
    "    # Collect all y values (Mbit/s) and their corresponding x values for overlap detection\n",
    "    all_data_points = []\n",
    "\n",
    "    for alg_name in ALGS:\n",
    "        alg_points = [bp for bp in bench_points if bp.name == alg_name]\n",
    "        comp_totals = np.array([sum(len(b) for b in bp.compdata) for bp in alg_points])\n",
    "        times_ns = np.array([getattr(bp, time_attr) for bp in alg_points])\n",
    "        levels = [bp.lvl for bp in alg_points]\n",
    "\n",
    "        x = 100.0 * comp_totals / orig_total  # compression percentages\n",
    "        y_mbps = orig_total * 8000.0 / times_ns  # mbps - this is what we plot\n",
    "\n",
    "        # Store all data points for overlap detection\n",
    "        for xx, yy in zip(x, y_mbps):\n",
    "            all_data_points.append((xx, yy))\n",
    "\n",
    "        # Plot using Mbit/s (so faster operations are at the top)\n",
    "        ax1.plot(x, y_mbps, marker=\"s\", linewidth=1.25, label=alg_name, zorder=2)\n",
    "        levelstyle = {\n",
    "            \"ha\": \"center\",\n",
    "            \"va\": \"center\",\n",
    "            \"fontsize\": 7,\n",
    "            \"zorder\": 3,\n",
    "            \"bbox\": dict(\n",
    "                boxstyle=\"round,pad=0.2\",\n",
    "                facecolor=\"white\",\n",
    "                edgecolor=\"black\",\n",
    "                lw=0.4,\n",
    "            ),\n",
    "        }\n",
    "        for xx, yy, L in zip(x, y_mbps, levels):\n",
    "            ax1.text(xx, yy, str(L), **levelstyle)\n",
    "\n",
    "    # Set up the axis\n",
    "    ax1.set_xlabel(xlabel)\n",
    "    ax1.set_ylabel(ylabel)  # Compression/Decompression speed (Mbit/s)\n",
    "\n",
    "    # Get plot dimensions\n",
    "    y_min, y_max = ax1.get_ylim()\n",
    "    x_min, x_max = ax1.get_xlim()\n",
    "\n",
    "    # Extract unique y values and sort them\n",
    "    unique_y_values = sorted(set(point[1] for point in all_data_points))\n",
    "\n",
    "    # Filter to y values within the plot range\n",
    "    visible_y_values = [y for y in unique_y_values if y_min <= y <= y_max]\n",
    "\n",
    "    # Determine minimum spacing to avoid overlap (as a percentage of the y-range)\n",
    "    y_range = y_max - y_min\n",
    "    min_spacing = y_range * 0.05  # 5% of the y-range as minimum spacing\n",
    "\n",
    "    # Select y values with sufficient spacing\n",
    "    selected_y_values = []\n",
    "    for y in visible_y_values:\n",
    "        # Check if this y value is far enough from already selected ones\n",
    "        if not selected_y_values or all(\n",
    "            abs(y - selected) >= min_spacing for selected in selected_y_values\n",
    "        ):\n",
    "            selected_y_values.append(y)\n",
    "\n",
    "    # Function to format time labels\n",
    "    def format_time_label(ms):\n",
    "        if ms < 10:\n",
    "            return f\"{ms:.1f} ms\"  # One decimal for <10 ms\n",
    "        elif ms < 1000:\n",
    "            return f\"{ms:.0f} ms\"  # No decimals for <1000 ms\n",
    "        else:\n",
    "            seconds = ms / 1000\n",
    "            return f\"{seconds:.1f} s\"  # Seconds with one decimal above 1000 ms\n",
    "\n",
    "    # Draw black lines from data points to the right margin with labels\n",
    "    for y_mbps in selected_y_values:\n",
    "        # Find the rightmost x position for this y value\n",
    "        rightmost_x = max(\n",
    "            [point[0] for point in all_data_points if abs(point[1] - y_mbps) < 1e-6],\n",
    "            default=x_min,\n",
    "        )\n",
    "\n",
    "        # Convert Mbit/s to milliseconds for the label\n",
    "        ms = orig_total * 8000.0 / (y_mbps * 1e6)\n",
    "\n",
    "        # Draw a black line from the rightmost data point to the right margin\n",
    "        ax1.plot(\n",
    "            [rightmost_x, x_max],\n",
    "            [y_mbps, y_mbps],\n",
    "            color=\"black\",\n",
    "            linestyle=\"-\",\n",
    "            alpha=0.3,\n",
    "            linewidth=0.8,\n",
    "            zorder=1,\n",
    "        )\n",
    "\n",
    "        # Add a text label on the right side with improved formatting\n",
    "        ms_label = format_time_label(ms)\n",
    "\n",
    "        ax1.text(\n",
    "            x_max,\n",
    "            y_mbps,\n",
    "            ms_label,\n",
    "            verticalalignment=\"center\",\n",
    "            horizontalalignment=\"left\",\n",
    "            fontsize=7,\n",
    "            color=\"black\",\n",
    "            alpha=0.8,\n",
    "            bbox=dict(\n",
    "                boxstyle=\"round,pad=0.1\", facecolor=\"white\", alpha=0.9, edgecolor=\"none\"\n",
    "            ),\n",
    "        )\n",
    "\n",
    "    # Add the original fixed millisecond reference lines (only if they don't conflict)\n",
    "    ms_reference_values = [0.1, 0.2, 0.5, 1, 2, 5, 10, 20, 50, 100, 200, 500, 1000]\n",
    "\n",
    "    for ms in ms_reference_values:\n",
    "        # Convert milliseconds to Mbit/s: mbps = orig_total * 8000.0 / (ms * 1e6)\n",
    "        mbps = orig_total * 8000.0 / (ms * 1e6)\n",
    "\n",
    "        # Only draw the line if it falls within the current y-axis range and isn't too close to data points\n",
    "        if y_min <= mbps <= y_max:\n",
    "            # Check if this reference line is far enough from data points\n",
    "            if not any(abs(mbps - y) < min_spacing for y in selected_y_values):\n",
    "                ax1.axhline(\n",
    "                    y=mbps,\n",
    "                    color=\"gray\",\n",
    "                    linestyle=\"--\",\n",
    "                    alpha=0.3,\n",
    "                    linewidth=0.5,\n",
    "                    zorder=1,\n",
    "                )\n",
    "                # Add a text label on the right side with improved formatting\n",
    "                ms_label = format_time_label(ms)\n",
    "                ax1.text(\n",
    "                    x_max,\n",
    "                    mbps,\n",
    "                    ms_label,\n",
    "                    verticalalignment=\"center\",\n",
    "                    horizontalalignment=\"left\",\n",
    "                    fontsize=7,\n",
    "                    color=\"gray\",\n",
    "                    alpha=0.6,\n",
    "                    bbox=dict(\n",
    "                        boxstyle=\"round,pad=0.1\",\n",
    "                        facecolor=\"white\",\n",
    "                        alpha=0.8,\n",
    "                        edgecolor=\"none\",\n",
    "                    ),\n",
    "                )\n",
    "\n",
    "    ax1.set_title(title)\n",
    "    ax1.grid(True, linestyle=\":\", linewidth=0.7, alpha=0.6, zorder=0)\n",
    "    ax1.legend()\n",
    "    plt.tight_layout()\n",
    "    plt.show()\n",
    "\n",
    "\n",
    "plot_xy(\n",
    "    bench_points,\n",
    "    \"comp_time_ns\",\n",
    "    f\"Compression — {len(files)} files, total {orig_total / 1_048_576:.2f} MiB (median of {ROUNDS} rounds)\",\n",
    "    \"Compressed size (% of original)\",\n",
    "    \"Compression speed (Mbit/s)\",\n",
    "    \"Time (ms)\",\n",
    ")\n",
    "plot_xy(\n",
    "    bench_points,\n",
    "    \"decomp_time_ns\",\n",
    "    f\"Decompression — {len(files)} files, total {orig_total / 1_048_576:.2f} MiB (median of {ROUNDS} rounds)\",\n",
    "    \"Compressed size (% of original)\",\n",
    "    \"Decompression speed (Mbit/s)\",\n",
    "    \"Time (ms)\",\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a8a2bdcd",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
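The plotting cell converts total input bytes and summed nanoseconds into Mbit/s through the constant 8000 (8 bits per byte times 1e9 ns per second, divided by 1e6 bits per Mbit), and inverts the same factor to place the millisecond labels on the right margin. A quick sanity check with made-up numbers:

```python
# Sanity check of the Mbit/s conversion used in plot_xy (illustrative numbers only).
orig_total = 10_000_000  # 10 MB of input data
time_ns = 50_000_000     # processed in 50 ms

mbps = orig_total * 8000.0 / time_ns                 # shortcut used in the notebook
bits_per_second = orig_total * 8 / (time_ns / 1e9)   # the long way around

print(mbps)                                # 1600.0
print(bits_per_second / 1e6)               # 1600.0 Mbit/s, same value
print(orig_total * 8000.0 / (mbps * 1e6))  # 50.0 ms, the inverse used for the labels
```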
compression.png (new binary file, 234 KiB, not shown)
compression_benchmark/__main__.py (new file)
@@ -0,0 +1,4 @@
from .bench import main

if __name__ == "__main__":
    main()
compression_benchmark/bench.py (new file)
@@ -0,0 +1,245 @@
import argparse
import gc
import gzip
import time
from dataclasses import dataclass
from pathlib import Path
from typing import Callable

import brotli
import matplotlib.pyplot as plt
import numpy as np
from tqdm.auto import tqdm
from zstandard import ZstdCompressor, ZstdDecompressor

ALGS = {
    "zstd": {
        "levels": [1, 2, 3, 4, 5, 6, 7, 9, 10, 12, 14, 16, 22],
        "c": lambda b, q: ZstdCompressor(level=q).compress(b),
        "d": lambda b: ZstdDecompressor().decompress(b),
    },
    "brotli": {
        "levels": [1, 2, 3, 4, 5, 7, 9, 11],
        "c": lambda b, q: brotli.compress(b, quality=q),
        "d": lambda b: brotli.decompress(b),
    },
    "gzip": {
        "levels": [1, 3, 4, 5, 9],
        "c": lambda b, q: gzip.compress(b, compresslevel=q),
        "d": lambda b: gzip.decompress(b),
    },
}


@dataclass
class BenchPoint:
    name: str
    lvl: int
    compress: Callable
    decompress: Callable
    compdata: list
    comp_time_ns: int = 0
    decomp_time_ns: int = 0


def plot_xy(
    bench_points, time_attr, title, xlabel, ylabel, ylabel2, orig_total, out_path
):
    fig, ax1 = plt.subplots(figsize=(9, 5), dpi=300)
    all_data_points = []
    for alg_name in ALGS:
        alg_points = [bp for bp in bench_points if bp.name == alg_name]
        comp_totals = np.array([sum(len(b) for b in bp.compdata) for bp in alg_points])
        times_ns = np.array([getattr(bp, time_attr) for bp in alg_points])
        levels = [bp.lvl for bp in alg_points]
        x = 100.0 * comp_totals / orig_total
        y_mbps = orig_total * 8000.0 / times_ns
        for xx, yy in zip(x, y_mbps):
            all_data_points.append((xx, yy))
        ax1.plot(x, y_mbps, marker="s", linewidth=1.25, label=alg_name, zorder=2)
        levelstyle = {
            "ha": "center",
            "va": "center",
            "fontsize": 7,
            "zorder": 3,
            "bbox": dict(
                boxstyle="round,pad=0.2",
                facecolor="white",
                edgecolor="black",
                lw=0.4,
            ),
        }
        for xx, yy, L in zip(x, y_mbps, levels):
            ax1.text(xx, yy, str(L), **levelstyle)
    ax1.set_xlabel(xlabel)
    ax1.set_ylabel(ylabel)
    y_min, y_max = ax1.get_ylim()
    x_min, x_max = ax1.get_xlim()
    unique_y_values = sorted(set(point[1] for point in all_data_points))
    visible_y_values = [y for y in unique_y_values if y_min <= y <= y_max]
    y_range = y_max - y_min
    min_spacing = y_range * 0.05
    selected_y_values = []
    for y in visible_y_values:
        if not selected_y_values or all(
            abs(y - selected) >= min_spacing for selected in selected_y_values
        ):
            selected_y_values.append(y)

    def format_time_label(ms):
        if ms < 10:
            return f"{ms:.1f} ms"
        elif ms < 1000:
            return f"{ms:.0f} ms"
        else:
            seconds = ms / 1000
            return f"{seconds:.1f} s"

    for y_mbps in selected_y_values:
        rightmost_x = max(
            [point[0] for point in all_data_points if abs(point[1] - y_mbps) < 1e-6],
            default=x_min,
        )
        ms = orig_total * 8000.0 / (y_mbps * 1e6)
        ax1.plot(
            [rightmost_x, x_max],
            [y_mbps, y_mbps],
            color="black",
            linestyle="-",
            alpha=0.3,
            linewidth=0.8,
            zorder=1,
        )
        ms_label = format_time_label(ms)
        ax1.text(
            x_max,
            y_mbps,
            ms_label,
            verticalalignment="center",
            horizontalalignment="left",
            fontsize=7,
            color="black",
            alpha=0.8,
            bbox=dict(
                boxstyle="round,pad=0.1", facecolor="white", alpha=0.9, edgecolor="none"
            ),
        )
    ms_reference_values = [0.1, 0.2, 0.5, 1, 2, 5, 10, 20, 50, 100, 200, 500, 1000]
    for ms in ms_reference_values:
        mbps = orig_total * 8000.0 / (ms * 1e6)
        if y_min <= mbps <= y_max:
            if not any(abs(mbps - y) < min_spacing for y in selected_y_values):
                ax1.axhline(
                    y=mbps,
                    color="gray",
                    linestyle="--",
                    alpha=0.3,
                    linewidth=0.5,
                    zorder=1,
                )
                ms_label = format_time_label(ms)
                ax1.text(
                    x_max,
                    mbps,
                    ms_label,
                    verticalalignment="center",
                    horizontalalignment="left",
                    fontsize=7,
                    color="gray",
                    alpha=0.6,
                    bbox=dict(
                        boxstyle="round,pad=0.1",
                        facecolor="white",
                        alpha=0.8,
                        edgecolor="none",
                    ),
                )
    ax1.set_title(title)
    ax1.grid(True, linestyle=":", linewidth=0.7, alpha=0.6, zorder=0)
    ax1.legend()
    plt.tight_layout()
    plt.savefig(out_path)
    plt.close(fig)


def run_benchmark(folder: Path, rounds: int):
    files = [p.read_bytes() for p in folder.rglob("*") if p.is_file()]
    orig_total = sum(len(b) for b in files)
    print(f"Loaded {len(files)} files, total {orig_total / 1e6:.2f} MB")
    assert files, "No files found in the specified folder"
    bench_points = [
        BenchPoint(alg, lvl, spec["c"], spec["d"], [])
        for alg, spec in ALGS.items()
        for lvl in spec["levels"]
    ]
    P = len(bench_points)
    C_ns = np.zeros((P, rounds), dtype=np.int64)
    D_ns = np.zeros((P, rounds), dtype=np.int64)
    gc.disable()
    time.time = lambda: 0
    progress = tqdm(range(rounds), desc="Rounds")
    for r in progress:
        for i, bp in enumerate(bench_points):
            progress.set_postfix_str(f"Compress {bp.name}/{bp.lvl}")
            for f, data in enumerate(files):
                t0 = time.perf_counter_ns()
                out = bp.compress(data, bp.lvl)
                t1 = time.perf_counter_ns()
                C_ns[i, r] += t1 - t0
                if r == 0:
                    bp.compdata.append(out)
                else:
                    assert out == bp.compdata[f], (
                        f"Compressed data changed between rounds for {bp.name}, level={bp.lvl}"
                    )
            progress.set_postfix_str(f"Decompress {bp.name}/{bp.lvl}")
            for file in bp.compdata:
                t0 = time.perf_counter_ns()
                _ = bp.decompress(file)
                t1 = time.perf_counter_ns()
                D_ns[i, r] += t1 - t0
        progress.set_postfix_str("Benchmark Done")
    gc.enable()
    Cns, Dns = np.median(C_ns, 1), np.median(D_ns, 1)
    for i, bp in enumerate(bench_points):
        bp.comp_time_ns = int(Cns[i])
        bp.decomp_time_ns = int(Dns[i])
    return bench_points, files, orig_total


def main():
    parser = argparse.ArgumentParser(description="Compression benchmark.")
    parser.add_argument("folder", type=str, help="Folder to benchmark")
    parser.add_argument("--rounds", type=int, default=5, help="Number of rounds")
    parser.add_argument(
        "--outdir", type=str, default=".", help="Output directory for PNGs"
    )
    args = parser.parse_args()
    folder = Path(args.folder)
    rounds = args.rounds
    outdir = Path(args.outdir)
    outdir.mkdir(parents=True, exist_ok=True)

    bench_points, files, orig_total = run_benchmark(folder, rounds)

    plot_xy(
        bench_points,
        "comp_time_ns",
        f"Compression — {len(files)} files, total {orig_total / 1_048_576:.2f} MiB (median of {rounds} rounds)",
        "Compressed size (% of original)",
        "Compression speed (Mbit/s)",
        "Time (ms)",
        orig_total,
        outdir / "compression.png",
    )
    plot_xy(
        bench_points,
        "decomp_time_ns",
        f"Decompression — {len(files)} files, total {orig_total / 1_048_576:.2f} MiB (median of {rounds} rounds)",
        "Compressed size (% of original)",
        "Decompression speed (Mbit/s)",
        "Time (ms)",
        orig_total,
        outdir / "decompression.png",
    )
    print(f"Saved plots to {outdir}")
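Since ALGS maps each algorithm name to a list of levels and a pair of compress/decompress callables with a fixed signature, another codec can be benchmarked by simply adding an entry. A hedged sketch using the standard library's lzma module (not part of this commit):

```python
# Hypothetical extension (not in this commit): benchmark xz/lzma as well.
import lzma

from compression_benchmark.bench import ALGS

ALGS["lzma"] = {
    "levels": [0, 1, 3, 6, 9],                     # lzma presets range from 0 to 9
    "c": lambda b, q: lzma.compress(b, preset=q),  # same (bytes, level) -> bytes shape
    "d": lambda b: lzma.decompress(b),
}
```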
decompression.png (new binary file, 216 KiB, not shown)
pyproject.toml (new file)
@@ -0,0 +1,19 @@
[project]
name = "compression-benchmark"
version = "0.1.0"
description = "Benchmark compression algorithms (zstd, brotli, gzip) on your own files."
authors = [
    { name = "Leo Vasanko" }
]
license = { text = "MIT" }
readme = "README.md"
requires-python = ">=3.11"
dependencies = [
    "zstandard",
    "brotli",
    "matplotlib",
    "tqdm",
    "numpy"
]
[project.scripts]
compression-benchmark = "compression_benchmark.bench:main"