[
  {
    "id": "state-of-browser-small-medium-llms-march-2026",
    "date": "2026-04-25",
    "title": "From community ports to first-party releases: small-to-medium browser LLMs in March 2026",
    "summary": "The in-browser open LLM catalog grew up in 2025-2026: from three community ports to two dozen first-party releases in eighteen months.",
    "abstract": "In late 2024 the in-browser open-source LLM catalog was effectively three community ports: Microsoft Phi-3, Meta Llama 3.2, and Hugging Face SmolLM2. By March 2026 it runs to roughly two dozen first-party releases from a multi-vendor cohort that did not exist eighteen months earlier, including IBM Granite 4 with explicit ONNX-web variants, OpenAI's first open-weight family in years, Liquid AI's hybrid LFM2.5 line, and a 1-bit entrant from Caltech that emerged from stealth on the last day of the window. This paper maps the trajectory through its inflection points, the labs driving releases, the catalog as it stands, and what 2026-2027 has telegraphed.",
    "header": "Research",
    "topic": "local-inference",
    "tags": [
      "small models",
      "ONNX",
      "transformers.js",
      "open-source LLM",
      "on-device AI"
    ],
    "authors": [
      "Julien Borrel",
      "Claude Opus"
    ],
    "image": "/medias/research.small.models.jpg",
    "slug": "state-of-browser-small-medium-llms-march-2026",
    "wordCount": 8094,
    "readingTimeMinutes": 45
  },
  {
    "id": "three-walls-browser-llm-inference-2026",
    "date": "2026-04-25",
    "title": "The three walls of in-browser LLM inference: the state of affairs in April 2026",
    "summary": "In April 2026 browser LLM inference passed an inflection point: runtime architecture is solved, and per-tab VRAM is the new ceiling.",
    "abstract": "In-browser LLM inference has lived under three constraint walls in sequence: protobuf's 2 GB cap on `.onnx` files, WebAssembly's 4 GB linear-memory limit, and the browser-allocated WebGPU VRAM budget.\nAs of April 2026, the first two are effectively cleared — the first by ONNX's External Data format, the second by ONNX Runtime's new C++ WebGPU execution provider — while the third remains the binding constraint, with no portable spec query and substantial per-platform variance.\nThis paper traces how each wall arose, how the February 2026 transformers.js v4 / ORT C++ EP inflection collapsed two of them, and what genuinely fits in a browser tab today across Apple Silicon, NVIDIA, AMD, Intel, and mobile.",
    "header": "Research",
    "topic": "local-inference",
    "tags": [
      "WebGPU",
      "transformers.js",
      "ONNX Runtime",
      "in-browser LLM",
      "WebAssembly"
    ],
    "authors": [
      "Julien Borrel",
      "Claude Opus"
    ],
    "image": "/medias/research.webgpu.color.png",
    "slug": "three-walls-browser-llm-inference-2026",
    "wordCount": 9232,
    "readingTimeMinutes": 51
  }
]