queue runner: fix nullptr deref on build exception after releasing a machine reservation

Add metric for builds waiting for download slot
(cherry picked from commit f23ec71227911891807706b6b978836e4d80edde)
2025-02-16 13:27:26 +01:00 · 2025-02-12 10:35:17 +01:00 · 2025-02-12 10:35:17 +01:00 · 2025-02-12 10:35:17 +01:00 · 2025-02-12 10:35:17 +01:00 · 2025-02-12 10:35:17 +01:00
39 changed files with 1051 additions and 496 deletions
--- a/flake.lock
+++ b/flake.lock
@@ -1,10 +1,51 @@
 {
  "nodes": {
+    "flake-parts": {
+      "inputs": {
+        "nixpkgs-lib": [
+          "nix-eval-jobs",
+          "nixpkgs"
+        ]
+      },
+      "locked": {
+        "lastModified": 1722555600,
+        "narHash": "sha256-XOQkdLafnb/p9ij77byFQjDf5m5QYl9b2REiVClC+x4=",
+        "owner": "hercules-ci",
+        "repo": "flake-parts",
+        "rev": "8471fe90ad337a8074e957b69ca4d0089218391d",
+        "type": "github"
+      },
+      "original": {
+        "owner": "hercules-ci",
+        "repo": "flake-parts",
+        "type": "github"
+      }
+    },
+    "libgit2": {
+      "flake": false,
+      "locked": {
+        "lastModified": 1715853528,
+        "narHash": "sha256-J2rCxTecyLbbDdsyBWn9w7r3pbKRMkI9E7RvRgAqBdY=",
+        "owner": "libgit2",
+        "repo": "libgit2",
+        "rev": "36f7e21ad757a3dacc58cf7944329da6bc1d6e96",
+        "type": "github"
+      },
+      "original": {
+        "owner": "libgit2",
+        "ref": "v1.8.1",
+        "repo": "libgit2",
+        "type": "github"
+      }
+    },
    "nix": {
      "inputs": {
        "flake-compat": [],
        "flake-parts": [],
        "git-hooks-nix": [],
+        "libgit2": [
+          "libgit2"
+        ],
        "nixpkgs": [
          "nixpkgs"
        ],
@@ -12,58 +53,88 @@
        "nixpkgs-regression": []
      },
      "locked": {
-        "lastModified": 1739899400,
-        "narHash": "sha256-q/RgA4bB7zWai4oPySq9mch7qH14IEeom2P64SXdqHs=",
+        "lastModified": 1726787955,
+        "narHash": "sha256-XFznzb8L4SdUm9u+w3DPpMWJhffuv+/6+aiVl00slns=",
        "owner": "NixOS",
        "repo": "nix",
-        "rev": "e310c19a1aeb1ce1ed4d41d5ab2d02db596e0918",
+        "rev": "a7fdef6858dd45b9d7bda7c92324c63faee7f509",
        "type": "github"
      },
      "original": {
        "owner": "NixOS",
-        "ref": "2.26-maintenance",
+        "ref": "2.24-maintenance",
        "repo": "nix",
        "type": "github"
      }
    },
    "nix-eval-jobs": {
-      "flake": false,
+      "inputs": {
+        "flake-parts": "flake-parts",
+        "nix-github-actions": [],
+        "nixpkgs": [
+          "nixpkgs"
+        ],
+        "treefmt-nix": "treefmt-nix"
+      },
      "locked": {
-        "lastModified": 1739500569,
-        "narHash": "sha256-3wIReAqdTALv39gkWXLMZQvHyBOc3yPkWT2ZsItxedY=",
+        "lastModified": 1733814344,
+        "narHash": "sha256-3wwtKpS5tUBdjaGeSia7CotonbiRB6K5Kp0dsUt3nzU=",
        "owner": "nix-community",
        "repo": "nix-eval-jobs",
-        "rev": "4b392b284877d203ae262e16af269f702df036bc",
+        "rev": "889ea1406736b53cf165b6c28398aae3969418d1",
        "type": "github"
      },
      "original": {
        "owner": "nix-community",
+        "ref": "release-2.24",
        "repo": "nix-eval-jobs",
        "type": "github"
      }
    },
    "nixpkgs": {
      "locked": {
-        "lastModified": 1739461644,
-        "narHash": "sha256-1o1qR0KYozYGRrnqytSpAhVBYLNBHX+Lv6I39zGRzKM=",
+        "lastModified": 1726688310,
+        "narHash": "sha256-Xc9lEtentPCEtxc/F1e6jIZsd4MPDYv4Kugl9WtXlz0=",
        "owner": "NixOS",
        "repo": "nixpkgs",
-        "rev": "97a719c9f0a07923c957cf51b20b329f9fb9d43f",
+        "rev": "dbebdd67a6006bb145d98c8debf9140ac7e651d0",
        "type": "github"
      },
      "original": {
        "owner": "NixOS",
-        "ref": "nixos-24.11-small",
+        "ref": "nixos-24.05-small",
        "repo": "nixpkgs",
        "type": "github"
      }
    },
    "root": {
      "inputs": {
+        "libgit2": "libgit2",
        "nix": "nix",
        "nix-eval-jobs": "nix-eval-jobs",
        "nixpkgs": "nixpkgs"
      }
+    },
+    "treefmt-nix": {
+      "inputs": {
+        "nixpkgs": [
+          "nix-eval-jobs",
+          "nixpkgs"
+        ]
+      },
+      "locked": {
+        "lastModified": 1723303070,
+        "narHash": "sha256-krGNVA30yptyRonohQ+i9cnK+CfCpedg6z3qzqVJcTs=",
+        "owner": "numtide",
+        "repo": "treefmt-nix",
+        "rev": "14c092e0326de759e16b37535161b3cb9770cea3",
+        "type": "github"
+      },
+      "original": {
+        "owner": "numtide",
+        "repo": "treefmt-nix",
+        "type": "github"
+      }
    }
  },
  "root": "root",
--- a/flake.nix
+++ b/flake.nix
@@ -1,25 +1,25 @@
 {
  description = "A Nix-based continuous build system";

-  inputs.nixpkgs.url = "github:NixOS/nixpkgs/nixos-24.11-small";
+  inputs.nixpkgs.url = "github:NixOS/nixpkgs/nixos-24.05-small";

-  inputs.nix = {
-    url = "github:NixOS/nix/2.26-maintenance";
-    inputs.nixpkgs.follows = "nixpkgs";
+  inputs.libgit2 = { url = "github:libgit2/libgit2/v1.8.1"; flake = false; };
+  inputs.nix.url = "github:NixOS/nix/2.24-maintenance";
+  inputs.nix.inputs.nixpkgs.follows = "nixpkgs";
+  inputs.nix.inputs.libgit2.follows = "libgit2";

-    # hide nix dev tooling from our lock file
-    inputs.flake-parts.follows = "";
-    inputs.git-hooks-nix.follows = "";
-    inputs.nixpkgs-regression.follows = "";
-    inputs.nixpkgs-23-11.follows = "";
-    inputs.flake-compat.follows = "";
-  };
+  inputs.nix-eval-jobs.url = "github:nix-community/nix-eval-jobs/release-2.24";
+  inputs.nix-eval-jobs.inputs.nixpkgs.follows = "nixpkgs";

-  inputs.nix-eval-jobs = {
-    url = "github:nix-community/nix-eval-jobs";
-    # We want to control the deps precisely
-    flake = false;
-  };
+  # hide nix dev tooling from our lock file
+  inputs.nix.inputs.flake-parts.follows = "";
+  inputs.nix.inputs.git-hooks-nix.follows = "";
+  inputs.nix.inputs.nixpkgs-regression.follows = "";
+  inputs.nix.inputs.nixpkgs-23-11.follows = "";
+  inputs.nix.inputs.flake-compat.follows = "";
+
+  # hide nix-eval-jobs dev tooling from our lock file
+  inputs.nix-eval-jobs.inputs.nix-github-actions.follows = "";

  outputs = { self, nixpkgs, nix, nix-eval-jobs, ... }:
    let
@@ -30,9 +30,9 @@

      # A Nixpkgs overlay that provides a 'hydra' package.
      overlays.default = final: prev: {
-        nix-eval-jobs = final.callPackage nix-eval-jobs {};
        hydra = final.callPackage ./package.nix {
          inherit (nixpkgs.lib) fileset;
+          nix-eval-jobs = nix-eval-jobs.packages.${final.system}.default;
          rawSrc = self;
          nix-perl-bindings = final.nixComponents.nix-perl-bindings;
        };
@@ -74,20 +74,11 @@
      });

      packages = forEachSystem (system: {
-        nix-eval-jobs = nixpkgs.legacyPackages.${system}.callPackage nix-eval-jobs {
-          nix = nix.packages.${system}.nix;
-        };
        hydra = nixpkgs.legacyPackages.${system}.callPackage ./package.nix {
          inherit (nixpkgs.lib) fileset;
-          inherit (self.packages.${system}) nix-eval-jobs;
+          nix-eval-jobs = nix-eval-jobs.packages.${system}.default;
          rawSrc = self;
-          inherit (nix.packages.${system})
-            nix-util
-            nix-store
-            nix-main
-            nix-cmd
-            nix-cli
-            ;
+          nix = nix.packages.${system}.nix;
          nix-perl-bindings = nix.hydraJobs.perlBindings.${system};
        };
        default = self.packages.${system}.hydra;
--- a/foreman/start-manual.sh
+++ b/foreman/start-manual.sh
@@ -1,6 +1,6 @@
 #!/bin/sh

-exec mdbook serve \
+mdbook serve \
  --port 63332 \
  --dest-dir ./.hydra-data/manual \
  ./doc/manual/
--- a/meson.build
+++ b/meson.build
@@ -8,22 +8,22 @@ project('hydra', 'cpp',
  ],
 )

-nix_util_dep = dependency('nix-util', required: true)
 nix_store_dep = dependency('nix-store', required: true)
 nix_main_dep = dependency('nix-main', required: true)
+nix_expr_dep = dependency('nix-expr', required: true)
+nix_flake_dep = dependency('nix-flake', required: true)
+nix_cmd_dep = dependency('nix-cmd', required: true)

 # Nix need extra flags not provided in its pkg-config files.
 nix_dep = declare_dependency(
  dependencies: [
-    nix_util_dep,
    nix_store_dep,
    nix_main_dep,
+    nix_expr_dep,
+    nix_flake_dep,
+    nix_cmd_dep,
  ],
-  compile_args: [
-    '-include', 'nix/config-util.hh',
-    '-include', 'nix/config-store.hh',
-    '-include', 'nix/config-main.hh',
-  ],
+  compile_args: ['-include', 'nix/config.h'],
 )

 pqxx_dep = dependency('libpqxx', required: true)
--- a/package.nix
+++ b/package.nix
@@ -8,11 +8,7 @@

 , perlPackages

-, nix-util
-, nix-store
-, nix-main
-, nix-cmd
-, nix-cli
+, nix
 , nix-perl-bindings
 , git

@@ -94,6 +90,7 @@ let
        DateTime
        DBDPg
        DBDSQLite
+        DBIxClassHelpers
        DigestSHA1
        EmailMIME
        EmailSender
@@ -166,7 +163,7 @@ stdenv.mkDerivation (finalAttrs: {
    nukeReferences
    pkg-config
    mdbook
-    nix-cli
+    nix
    perlDeps
    perl
    unzip
@@ -176,10 +173,7 @@ stdenv.mkDerivation (finalAttrs: {
    libpqxx
    openssl
    libxslt
-    nix-util
-    nix-store
-    nix-main
-    nix-cmd
+    nix
    perlDeps
    perl
    boost
@@ -206,14 +200,13 @@ stdenv.mkDerivation (finalAttrs: {
    glibcLocales
    libressl.nc
    python3
-    nix-cli
  ];

  hydraPath = lib.makeBinPath (
    [
      subversion
      openssh
-      nix-cli
+      nix
      coreutils
      findutils
      pixz
@@ -274,7 +267,7 @@ stdenv.mkDerivation (finalAttrs: {
            --prefix PATH ':' $out/bin:$hydraPath \
            --set HYDRA_RELEASE ${version} \
            --set HYDRA_HOME $out/libexec/hydra \
-            --set NIX_RELEASE ${nix-cli.name or "unknown"} \
+            --set NIX_RELEASE ${nix.name or "unknown"} \
            --set NIX_EVAL_JOBS_RELEASE ${nix-eval-jobs.name or "unknown"}
    done
  '';
@@ -282,5 +275,5 @@ stdenv.mkDerivation (finalAttrs: {
  dontStrip = true;

  meta.description = "Build of Hydra on ${stdenv.system}";
-  passthru = { inherit perlDeps; };
+  passthru = { inherit perlDeps nix; };
 })
--- a/src/hydra-build-step/hydra-build-step.cc
+++ b/src/hydra-build-step/hydra-build-step.cc
@@ -1,213 +0,0 @@
-/* This is a helper program that performs a build step, i.e. a single
-   derivation. In addition to a derivation path, it takes three store
-   URLs as arguments:
-
-   * --store: The store that will hold the resulting store paths
-       (typically a binary cache).
-
-   * --eval-store: The store that holds the .drv files, as produced by
-       hydra-evaluator.
-
-   * --build-store: The store that performs the build (often a
-       SSHStore for remote builds).
-
-   The build log is written to the path indicated by --log-file.
-*/
-
-#include "util.hh"
-#include "shared.hh"
-#include "common-eval-args.hh"
-#include "store-api.hh"
-#include "build-result.hh"
-#include "derivations.hh"
-#include "worker-protocol.hh"
-
-#include <chrono>
-
-using namespace nix;
-
-// FIXME: cut&paste
-static std::string_view getS(const std::vector<Logger::Field> & fields, size_t n)
-{
-    assert(n < fields.size());
-    assert(fields[n].type == Logger::Field::tString);
-    return fields[n].s;
-}
-
-void mainWrapped(std::list<std::string> args)
-{
-    verbosity = lvlError;
-
-    struct MyArgs : MixEvalArgs, MixCommonArgs, RootArgs
-    {
-        Path drvPath;
-        std::optional<std::string> buildStoreUrl;
-        std::optional<Path> logPath;
-        std::optional<uint64_t> maxOutputSize;
-
-        MyArgs() : MixCommonArgs("hydra-build-step")
-        {
-            expectArg("drv-path", &drvPath);
-
-            addFlag({
-                .longName = "build-store",
-                .description = "The Nix store to use for building the derivation.",
-                //.category = category,
-                .labels = {"store-url"},
-                .handler = {&buildStoreUrl},
-            });
-
-            addFlag({
-                .longName = "log-file",
-                .description = "The path to the build log.",
-                .labels = {"path"},
-                .handler = {&logPath},
-            });
-
-            addFlag({
-                .longName = "max-output-size",
-                .description = "Maximum size of the outputs.",
-                .labels = {"bytes"},
-                .handler = {&maxOutputSize},
-            });
-        }
-    };
-
-    /* A logger that intercepts all build log lines and writes them to
-       the log file. */
-    MyArgs myArgs;
-    myArgs.parseCmdline(args);
-
-    struct MyLogger : public Logger
-    {
-        Logger & prev;
-        AutoCloseFD logFile;
-
-        MyLogger(Logger & prev, Path logPath) : prev(prev)
-        {
-            logFile = open(logPath.c_str(), O_CREAT | O_TRUNC | O_WRONLY, 0666);
-            if (!logFile)
-                throw SysError("creating log file '%s'", logPath);
-        }
-
-        void log(Verbosity lvl, std::string_view s) override
-        { prev.log(lvl, s); }
-
-        void logEI(const ErrorInfo & ei) override
-        { prev.logEI(ei); }
-
-        void writeToStdout(std::string_view s) override
-        { prev.writeToStdout(s); }
-
-        void result(ActivityId act, ResultType type, const Fields & fields) override
-        {
-            if (type == resBuildLogLine)
-                writeLine(logFile.get(), std::string(getS(fields, 0)));
-            else
-                prev.result(act, type, fields);
-        }
-    };
-
-    auto destStore = openStore();
-    auto evalStore = myArgs.evalStoreUrl ? openStore(*myArgs.evalStoreUrl) : destStore;
-    auto buildStore = myArgs.buildStoreUrl ? openStore(*myArgs.buildStoreUrl) : destStore;
-
-    auto drvPath = evalStore->parseStorePath(myArgs.drvPath);
-
-    auto drv = evalStore->readDerivation(drvPath);
-    BasicDerivation basicDrv(drv);
-
-    uint64_t overhead = 0;
-
-    /* Gather the inputs. */
-    StorePathSet inputs;
-
-    for (auto & p : drv.inputSrcs)
-        inputs.insert(p);
-
-    for (auto & [drvPath, node] : drv.inputDrvs.map) {
-        auto drv2 = evalStore->readDerivation(drvPath);
-        for (auto & name : node.value) {
-            if (auto i = get(drv2.outputs, name)) {
-                auto outPath = i->path(*evalStore, drv2.name, name);
-                inputs.insert(*outPath);
-                basicDrv.inputSrcs.insert(*outPath);
-            }
-        }
-    }
-
-    /* Ensure that the inputs exist in the destination store (so that
-       the builder can substitute them from the destination
-       store). This is a no-op for regular stores, but for the binary
-       cache store, this will copy the inputs to the binary cache from
-       the local store. */
-    {
-        auto now1 = std::chrono::steady_clock::now();
-
-        debug("sending closure of '%s' to '%s'",
-            evalStore->printStorePath(drvPath), destStore->getUri());
-
-        if (evalStore != destStore)
-            copyClosure(*evalStore, *destStore, drv.inputSrcs, NoRepair, NoCheckSigs);
-
-        copyClosure(*destStore, *buildStore, inputs, NoRepair, NoCheckSigs, Substitute);
-
-        auto now2 = std::chrono::steady_clock::now();
-
-        overhead += std::chrono::duration_cast<std::chrono::milliseconds>(now2 - now1).count();
-    }
-
-    /* Perform the build. */
-    if (myArgs.logPath)
-        logger = new MyLogger(*logger, *myArgs.logPath);
-
-    auto buildResult = buildStore->buildDerivation(drvPath, basicDrv);
-
-    /* Copy the output paths from the build store to the destination
-       store. */
-    size_t totalNarSize = 0;
-
-    if (buildResult.success()) {
-
-        std::map<StorePath, ValidPathInfo> infos;
-        StorePathSet outputs;
-        for (auto & [output, realisation] : buildResult.builtOutputs) {
-            auto info = buildStore->queryPathInfo(realisation.outPath);
-            totalNarSize += info->narSize;
-            infos.insert_or_assign(info->path, *info);
-            outputs.insert(info->path);
-        }
-
-        if ((!myArgs.maxOutputSize || totalNarSize <= *myArgs.maxOutputSize)
-            && buildStore != destStore)
-        {
-            debug("copying outputs of '%s' from '%s' (%d bytes)",
-                buildStore->printStorePath(drvPath), buildStore->getUri(), totalNarSize);
-
-            auto now1 = std::chrono::steady_clock::now();
-
-            copyPaths(*buildStore, *destStore, outputs, NoRepair, NoCheckSigs);
-
-            auto now2 = std::chrono::steady_clock::now();
-
-            overhead += std::chrono::duration_cast<std::chrono::milliseconds>(now2 - now1).count();
-        }
-    }
-
-    FdSink to { STDOUT_FILENO };
-    WorkerProto::WriteConn wconn {
-        .to = to,
-        // Hardcode latest version because we are deploying hydra
-        // itself atomically
-        .version = PROTOCOL_VERSION,
-    };
-    WorkerProto::write(*evalStore, wconn, buildResult);
-}
-
-int main(int argc, char * * argv)
-{
-    return handleExceptions(argv[0], [&]() {
-        initNix();
-        mainWrapped(argvToStrings(argc, argv));
-    });
-}
--- a/src/hydra-build-step/meson.build
+++ b/src/hydra-build-step/meson.build
@@ -1,14 +0,0 @@
-srcs = files(
-  'hydra-build-step.cc',
-)
-
-hydra_build_step = executable('hydra-build-step',
-  'hydra-build-step.cc',
-  srcs,
-  dependencies: [
-    libhydra_dep,
-    nix_dep,
-    dependency('nix-cmd', required: true)
-  ],
-  install: true,
-)
--- a/src/hydra-queue-runner/build-remote.cc
+++ b/src/hydra-queue-runner/build-remote.cc
@@ -1,27 +1,356 @@
-#include <math.h>
+#include <algorithm>
+#include <cmath>
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>

 #include "build-result.hh"
+#include "path.hh"
 #include "serve-protocol.hh"
+#include "serve-protocol-impl.hh"
 #include "state.hh"
 #include "current-process.hh"
 #include "processes.hh"
 #include "util.hh"
+#include "serve-protocol.hh"
+#include "serve-protocol-impl.hh"
+#include "ssh.hh"
 #include "finally.hh"
 #include "url.hh"
-#include "worker-protocol.hh"

 using namespace nix;

 namespace nix::build_remote {

-static Path createLogFileDir(const std::string & logDir, const StorePath & drvPath)
+/* Strip ssh:// URL `machine` to its authority (in place); return `--store` args if it names a remote store. */
+static Strings extraStoreArgs(std::string & machine)
+{
+    Strings result;
+    try {
+        auto parsed = parseURL(machine);
+        // Plain Error: no syscall failed, so SysError would append an unrelated errno text.
+        if (parsed.scheme != "ssh")
+            throw Error("Currently, only (legacy-)ssh stores are supported!");
+        machine = parsed.authority.value_or("");
+        auto remoteStore = parsed.query.find("remote-store");
+        if (remoteStore != parsed.query.end()) {
+            result = {"--store", shellEscape(remoteStore->second)};
+        }
+    } catch (BadURL &) {
+        // We just try to continue with `machine->sshName` here for backwards compat.
+    }
+    return result;
+}
+/* Start `nix-store --serve --write` for `machine` through `master` and return the connection. */
+static std::unique_ptr<SSHMaster::Connection> openConnection(
+    ::Machine::ptr machine, SSHMaster & master)
+{
+    Strings command = {"nix-store", "--serve", "--write"};
+    if (machine->isLocalhost()) {
+        command.push_back("--builders");
+        command.push_back("");
+    } else {
+        command.splice(command.end(), extraStoreArgs(machine->sshName));
+    }
+
+    auto ret = master.startCommand(std::move(command), {
+        "-a", "-oBatchMode=yes", "-oConnectTimeout=60", "-oTCPKeepAlive=yes"
+    });
+
+    // XXX: determine the actual max value we can use from /proc.
+    // FIXME: Should this be upstreamed into `startCommand` in Nix?
+    // F_SETPIPE_SZ takes the requested capacity as an int value, not a pointer.
+    // The result is ignored (best effort): a larger pipe is only an optimisation.
+    int pipesize = 1024 * 1024;
+
+    fcntl(ret->in.get(), F_SETPIPE_SZ, pipesize);
+    fcntl(ret->out.get(), F_SETPIPE_SZ, pipesize);
+
+    return ret;
+}
+
+/* Send the closure of `paths` (as computed by `destStore`) over `conn`, skipping paths the remote reports as valid. */
+static void copyClosureTo(
+    ::Machine::Connection & conn,
+    Store & destStore,
+    const StorePathSet & paths,
+    SubstituteFlag useSubstitutes = NoSubstitute)
+{
+    StorePathSet closure;
+    destStore.computeFSClosure(paths, closure);
+
+    /* Send the "query valid paths" command with the "lock" option
+       enabled. This prevents a race where the remote host
+       garbage-collects paths that are already there. Optionally, ask
+       the remote host to substitute missing paths. */
+    // FIXME: substitute output pollutes our build log
+    /* Get back the set of paths that are already valid on the remote
+       host. */
+    auto present = conn.queryValidPaths(
+        destStore, true, closure, useSubstitutes);
+
+    if (present.size() == closure.size()) return;
+
+    auto sorted = destStore.topoSortPaths(closure);
+
+    StorePathSet missing;
+    for (auto i = sorted.rbegin(); i != sorted.rend(); ++i)
+        if (!present.count(*i)) missing.insert(*i);
+
+    printMsg(lvlDebug, "sending %d missing paths", missing.size());
+    // NOTE(review): timed try-lock (600 s); owns_lock() is never checked afterwards — confirm intended.
+    std::unique_lock<std::timed_mutex> sendLock(conn.machine->state->sendLock,
+        std::chrono::seconds(600));
+
+    conn.to << ServeProto::Command::ImportPaths;
+    destStore.exportPaths(missing, conn.to);
+    conn.to.flush();
+
+    if (readInt(conn.from) != 1)
+        throw Error("remote machine failed to import closure");
+}
+
+/* Return the keys of `paths` in depth-first post-order over their references (references first). */
+// FIXME: use Store::topoSortPaths().
+static StorePaths reverseTopoSortPaths(const std::map<StorePath, UnkeyedValidPathInfo> & paths)
+{
+    StorePaths sorted;
+    StorePathSet visited;
+
+    std::function<void(const StorePath & path)> dfsVisit;
+
+    dfsVisit = [&](const StorePath & path) {
+        if (!visited.insert(path).second) return;
+
+        auto info = paths.find(path);
+        auto references = info == paths.end() ? StorePathSet() : info->second.references;
+
+        for (auto & i : references)
+            /* Don't traverse into paths that don't exist.  That can
+               happen due to substitutes for non-existent paths. */
+            if (i != path && paths.count(i))
+                dfsVisit(i);
+        // Post-order: emit the path only after the references visited above.
+        sorted.push_back(path);
+    };
+
+    for (auto & i : paths)
+        dfsVisit(i.first);
+
+    return sorted;
+}
+
+static std::pair<Path, AutoCloseFD> openLogFile(const std::string & logDir, const StorePath & drvPath)
 {
    std::string base(drvPath.to_string());
    auto logFile = logDir + "/" + std::string(base, 0, 2) + "/" + std::string(base, 2);

    createDirs(dirOf(logFile));

-    return logFile;
+    AutoCloseFD logFD = open(logFile.c_str(), O_CREAT | O_TRUNC | O_WRONLY, 0666);
+    if (!logFD) throw SysError("creating log file ‘%s’", logFile);
+
+    return {std::move(logFile), std::move(logFD)};
+}
+/* Resolve `step`'s derivation, copy its inputs to `destStore` and to the builder behind `conn`; adds the builder copy time (ms) to `overhead`. */
+static BasicDerivation sendInputs(
+    State & state,
+    Step & step,
+    Store & localStore,
+    Store & destStore,
+    ::Machine::Connection & conn,
+    unsigned int & overhead,
+    counter & nrStepsWaiting,
+    counter & nrStepsCopyingTo
+)
+{
+    /* Replace the input derivations by their output paths to send a
+       minimal closure to the builder.
+
+       `tryResolve` currently does *not* rewrite input addresses, so it
+       is safe to do this in all cases. (It should probably have a mode
+       to do that, however, but we would not use it here.)
+     */
+    BasicDerivation basicDrv = ({
+        auto maybeBasicDrv = step.drv->tryResolve(destStore, &localStore);
+        if (!maybeBasicDrv)
+            throw Error(
+                "the derivation '%s' can’t be resolved. It’s probably "
+                "missing some outputs",
+                localStore.printStorePath(step.drvPath));
+        *maybeBasicDrv;
+    });
+
+    /* Ensure that the inputs exist in the destination store. This is
+       a no-op for regular stores, but for the binary cache store,
+       this will copy the inputs to the binary cache from the local
+       store. */
+    if (&localStore != &destStore) {
+        copyClosure(localStore, destStore,
+            step.drv->inputSrcs,
+            NoRepair, NoCheckSigs, NoSubstitute);
+    }
+
+    {
+        auto mc1 = std::make_shared<MaintainCount<counter>>(nrStepsWaiting);
+        mc1.reset(); // NOTE(review): mc1 is dropped right after creation — presumably nrStepsWaiting is only bumped momentarily; confirm.
+        MaintainCount<counter> mc2(nrStepsCopyingTo);
+
+        printMsg(lvlDebug, "sending closure of ‘%s’ to ‘%s’",
+            localStore.printStorePath(step.drvPath), conn.machine->sshName);
+
+        auto now1 = std::chrono::steady_clock::now();
+
+        /* Copy the input closure. */
+        if (conn.machine->isLocalhost()) {
+            StorePathSet closure;
+            destStore.computeFSClosure(basicDrv.inputSrcs, closure);
+            copyPaths(destStore, localStore, closure, NoRepair, NoCheckSigs, NoSubstitute);
+        } else {
+            copyClosureTo(conn, destStore, basicDrv.inputSrcs, Substitute);
+        }
+
+        auto now2 = std::chrono::steady_clock::now();
+
+        overhead += std::chrono::duration_cast<std::chrono::milliseconds>(now2 - now1).count();
+    }
+
+    return basicDrv;
+}
+/* Send a build request for `drv` over `conn`, read back the result, and fill in timing/outputs the remote left out. */
+static BuildResult performBuild(
+    ::Machine::Connection & conn,
+    Store & localStore,
+    StorePath drvPath,
+    const BasicDerivation & drv,
+    const ServeProto::BuildOptions & options,
+    counter & nrStepsBuilding
+)
+{
+    conn.putBuildDerivationRequest(localStore, drvPath, drv, options);
+
+    BuildResult result;
+
+    time_t startTime, stopTime;
+
+    startTime = time(0);
+    {
+        MaintainCount<counter> mc(nrStepsBuilding);
+        result = ServeProto::Serialise<BuildResult>::read(localStore, conn);
+    }
+    stopTime = time(0);
+
+    if (!result.startTime) {
+        // If the builder gave `startTime = 0`, use our measurements
+        // instead of the builder's.
+        //
+        // Note: this represents the duration of a single round, rather
+        // than all rounds.
+        result.startTime = startTime;
+        result.stopTime = stopTime;
+    }
+
+    // If the protocol was too old to give us `builtOutputs`, initialize
+    // it manually by introspecting the derivation.
+    if (GET_PROTOCOL_MINOR(conn.remoteVersion) < 6)
+    {
+        // If the remote is too old to handle CA derivations, we can’t get this
+        // far anyways
+        assert(drv.type().hasKnownOutputPaths());
+        DerivationOutputsAndOptPaths drvOutputs = drv.outputsAndOptPaths(localStore);
+        // Since this is a `BasicDerivation`, `staticOutputHashes` will not
+        // do any real work.
+        auto outputHashes = staticOutputHashes(localStore, drv);
+        for (auto & [outputName, output] : drvOutputs) {
+            auto outputPath = output.second;
+            // We’ve just asserted that the output paths of the derivation
+            // were known
+            assert(outputPath);
+            auto outputHash = outputHashes.at(outputName);
+            auto drvOutput = DrvOutput { outputHash, outputName };
+            result.builtOutputs.insert_or_assign(
+                std::move(outputName),
+                Realisation { drvOutput, *outputPath });
+        }
+    }
+
+    return result;
+}
+/* Query the remote for info on `outputs`; returns it keyed by store path and adds each NAR size to `totalNarSize`. */
+static std::map<StorePath, UnkeyedValidPathInfo> queryPathInfos(
+    ::Machine::Connection & conn,
+    Store & localStore,
+    StorePathSet & outputs,
+    size_t & totalNarSize
+)
+{
+
+    /* Get info about each output path. */
+    std::map<StorePath, UnkeyedValidPathInfo> infos;
+    conn.to << ServeProto::Command::QueryPathInfos;
+    ServeProto::write(localStore, conn, outputs);
+    conn.to.flush();
+    while (true) {
+        auto storePathS = readString(conn.from);
+        if (storePathS == "") break;
+
+        auto storePath = localStore.parseStorePath(storePathS);
+        auto info = ServeProto::Serialise<UnkeyedValidPathInfo>::read(localStore, conn);
+        totalNarSize += info.narSize;
+        infos.insert_or_assign(std::move(storePath), std::move(info));
+    }
+
+    return infos;
+}
+/* Stream the NAR of `info.path` from the remote into `destStore`, collecting its member data into `narMembers`. */
+static void copyPathFromRemote(
+    ::Machine::Connection & conn,
+    NarMemberDatas & narMembers,
+    Store & localStore,
+    Store & destStore,
+    const ValidPathInfo & info
+)
+{
+      /* Receive the NAR from the remote and add it to the
+          destination store. Meanwhile, extract all the info from the
+          NAR that getBuildOutput() needs. */
+      auto source2 = sinkToSource([&](Sink & sink)
+      {
+          /* Note: we should only send the command to dump the store
+              path to the remote if the NAR is actually going to get read
+              by the destination store, which won't happen if this path
+              is already valid on the destination store. Since this
+              lambda function only gets executed if someone tries to read
+              from source2, we will send the command from here rather
+              than outside the lambda. */
+          conn.to << ServeProto::Command::DumpStorePath << localStore.printStorePath(info.path);
+          conn.to.flush();
+
+          TeeSource tee(conn.from, sink);
+          extractNarData(tee, localStore.printStorePath(info.path), narMembers);
+      });
+
+      destStore.addToStore(info, *source2, NoRepair, NoCheckSigs);
+}
+/* Copy every path in `infos` from the remote to `destStore`, in the order given by reverseTopoSortPaths(). */
+static void copyPathsFromRemote(
+    ::Machine::Connection & conn,
+    NarMemberDatas & narMembers,
+    Store & localStore,
+    Store & destStore,
+    const std::map<StorePath, UnkeyedValidPathInfo> & infos
+)
+{
+      auto pathsSorted = reverseTopoSortPaths(infos);
+
+      for (auto & path : pathsSorted) {
+          auto & info = infos.find(path)->second;
+          copyPathFromRemote(
+              conn, narMembers, localStore, destStore,
+              ValidPathInfo { path, info });
+      }
+
+}

 }
@@ -30,14 +359,11 @@ static Path createLogFileDir(const std::string & logDir, const StorePath & drvPa

 void RemoteResult::updateWithBuildResult(const nix::BuildResult & buildResult)
 {
-    // FIXME: make RemoteResult inherit BuildResult.
+    startTime = buildResult.startTime;
+    stopTime = buildResult.stopTime;
    timesBuilt = buildResult.timesBuilt;
    errorMsg = buildResult.errorMsg;
    isNonDeterministic = buildResult.isNonDeterministic;
-    if (buildResult.startTime && buildResult.stopTime) {
-        startTime = buildResult.startTime;
-        stopTime = buildResult.stopTime;
-    }

    switch ((BuildResult::Status) buildResult.status) {
        case BuildResult::Built:
@@ -86,8 +412,19 @@ void RemoteResult::updateWithBuildResult(const nix::BuildResult & buildResult)

 }

+/* Utility guard object to auto-release a semaphore on destruction. `s` is dereferenced unconditionally, so it must be non-null. */
+template <typename T>
+class SemaphoreReleaser {
+public:
+    SemaphoreReleaser(T* s) : sem(s) {}
+    ~SemaphoreReleaser() { sem->release(); }
+    // NOTE(review): the guard is copyable, so a copy would call release() a second time — confirm it is never copied.
+private:
+    T* sem;
+};

 void State::buildRemote(ref<Store> destStore,
+    MachineReservation::ptr & reservation,
    ::Machine::ptr machine, Step::ptr step,
    const ServeProto::BuildOptions & buildOptions,
    RemoteResult & result, std::shared_ptr<ActiveStep> activeStep,
@@ -96,44 +433,30 @@ void State::buildRemote(ref<Store> destStore,
 {
    assert(BuildResult::TimedOut == 8);

-    result.logFile = build_remote::createLogFileDir(logDir, step->drvPath);
+    auto [logFile, logFD] = build_remote::openLogFile(logDir, step->drvPath);
+    AutoDelete logFileDel(logFile, false);
+    result.logFile = logFile;

    try {

-        updateStep(ssBuilding);
-        result.startTime = time(0);
+        updateStep(ssConnecting);

-        auto buildStoreUrl = machine->completeStoreReference().render();
-
-        Strings args = {
-            localStore->printStorePath(step->drvPath),
-            "--store", destStore->getUri(),
-            "--eval-store", localStore->getUri(),
-            "--build-store", buildStoreUrl,
-            "--max-silent-time", std::to_string(buildOptions.maxSilentTime),
-            "--timeout", std::to_string(buildOptions.buildTimeout),
-            "--max-build-log-size", std::to_string(buildOptions.maxLogSize),
-            "--max-output-size", std::to_string(maxOutputSize),
-            "--repeat", std::to_string(buildOptions.nrRepeats),
-            "--log-file", result.logFile,
-            // FIXME: step->isDeterministic
+        SSHMaster master {
+            machine->sshName,
+            machine->sshKey,
+            machine->sshPublicHostKey,
+            false, // no SSH master yet
+            false, // no compression yet
+            logFD.get(),
        };

-        // FIXME: set pid for cancellation
+        // FIXME: rewrite to use Store.
+        auto child = build_remote::openConnection(machine, master);

-        auto [status, childStdout] = [&]() {
-            MaintainCount<counter> mc(nrStepsBuilding);
-            return runProgram({
-                .program = "hydra-build-step",
-                .args = std::move(args),
-            });
-        }();
-
-        #if 0
        {
            auto activeStepState(activeStep->state_.lock());
            if (activeStepState->cancelled) throw Error("step cancelled");
-            activeStepState->pid = conn.store->getConnectionPid();
+            activeStepState->pid = child->sshPid;
        }

        Finally clearPid([&]() {
@@ -147,32 +470,82 @@ void State::buildRemote(ref<Store> destStore,
               possibility that we end up killing another
               process. Meh. */
        });
-        #endif

-        result.stopTime = time(0);
+        ::Machine::Connection conn {
+            {
+                .to = child->in.get(),
+                .from = child->out.get(),
+                /* Handshake. */
+                .remoteVersion = 0xdadbeef, // FIXME avoid dummy initialize
+            },
+            /*.machine =*/ machine,
+        };

-        if (!statusOk(status))
-            throw ExecError(status, fmt("hydra-build-step %s with output:\n%s", statusToString(status), stdout));
+        Finally updateStats([&]() {
+            bytesReceived += conn.from.read;
+            bytesSent += conn.to.written;
+        });
+
+        constexpr ServeProto::Version our_version = 0x206;
+
+        try {
+            conn.remoteVersion = decltype(conn)::handshake(
+                conn.to,
+                conn.from,
+                our_version,
+                machine->sshName);
+        } catch (EndOfFile & e) {
+            child->sshPid.wait();
+            std::string s = chomp(readFile(result.logFile));
+            throw Error("cannot connect to ‘%1%’: %2%", machine->sshName, s);
+        }
+
+        // Do not attempt to speak a newer version of the protocol.
+        //
+        // Per https://github.com/NixOS/nix/issues/9584 should be handled as
+        // part of `handshake` in upstream nix.
+        conn.remoteVersion = std::min(conn.remoteVersion, our_version);

-        /* The build was executed successfully, so clear the failure
-           count for this machine. */
        {
            auto info(machine->state->connectInfo.lock());
            info->consecutiveFailures = 0;
        }

-        StringSource from { childStdout };
-        /* Read the BuildResult from the child. */
-        WorkerProto::ReadConn rconn {
-            .from = from,
-            // Hardcode latest version because we are deploying hydra
-            // itself atomically
-            .version = PROTOCOL_VERSION,
-        };
-        result.overhead += readNum<uint64_t>(rconn.from);
-        auto totalNarSize = readNum<uint64_t>(rconn.from);
-        auto buildResult = WorkerProto::Serialise<BuildResult>::read(*localStore, rconn);
+        /* Gather the inputs. If the remote side is Nix <= 1.9, we have to
+           copy the entire closure of ‘drvPath’, as well as the required
+           outputs of the input derivations. On Nix > 1.9, we only need to
+           copy the immediate sources of the derivation and the required
+           outputs of the input derivations. */
+        updateStep(ssSendingInputs);
+        BasicDerivation resolvedDrv = build_remote::sendInputs(*this, *step, *localStore, *destStore, conn, result.overhead, nrStepsWaiting, nrStepsCopyingTo);

+        logFileDel.cancel();
+
+        /* Truncate the log to get rid of messages about substitutions
+            etc. on the remote system. Note the lseek() argument order is
+            (fd, offset, whence): rewind to offset 0 relative to the start
+            of the file before truncating. */
+        if (lseek(logFD.get(), 0, SEEK_SET) != 0)
+            throw SysError("seeking to the start of log file ‘%s’", result.logFile);
+
+        if (ftruncate(logFD.get(), 0) == -1)
+            throw SysError("truncating log file ‘%s’", result.logFile);
+
+        logFD = -1;
+
+        /* Do the build. */
+        printMsg(lvlDebug, "building ‘%s’ on ‘%s’",
+            localStore->printStorePath(step->drvPath),
+            machine->sshName);
+
+        updateStep(ssBuilding);
+
+        BuildResult buildResult = build_remote::performBuild(
+            conn,
+            *localStore,
+            step->drvPath,
+            resolvedDrv,
+            buildOptions,
+            nrStepsBuilding
+        );

        result.updateWithBuildResult(buildResult);

@@ -180,22 +553,82 @@ void State::buildRemote(ref<Store> destStore,

        result.errorMsg = "";

-        /* If the NAR size limit was exceeded, then hydra-build-step
-           will not have copied the output paths. */
-        if (totalNarSize > maxOutputSize) {
-            result.stepStatus = bsNarSizeLimitExceeded;
-            return;
-        }
-
        /* If the path was substituted or already valid, then we didn't
           get a build log. */
        if (result.isCached) {
            printMsg(lvlInfo, "outputs of ‘%s’ substituted or already valid on ‘%s’",
-                localStore->printStorePath(step->drvPath), machine->storeUri.render());
+                localStore->printStorePath(step->drvPath), machine->sshName);
            unlink(result.logFile.c_str());
            result.logFile = "";
        }

+        /* Throttle CPU-bound work. Opportunistically skip updating the current
+         * step, since this requires a DB roundtrip. */
+        if (!localWorkThrottler.try_acquire()) {
+            MaintainCount<counter> mc(nrStepsWaitingForDownloadSlot);
+            updateStep(ssWaitingForLocalSlot);
+            localWorkThrottler.acquire();
+        }
+        SemaphoreReleaser releaser(&localWorkThrottler);
+
+        /* Once we've started copying outputs, release the machine reservation
+         * so further builds can happen. We do not release the machine earlier
+         * to avoid situations where the queue runner is bottlenecked on
+         * copying outputs and we end up building too many things that we
+         * haven't been able to allow copy slots for. */
+        assert(reservation.unique());
+        reservation = 0;
+        wakeDispatcher();
+
+        StorePathSet outputs;
+        for (auto & [_, realisation] : buildResult.builtOutputs)
+            outputs.insert(realisation.outPath);
+
+        /* Copy the output paths. */
+        if (!machine->isLocalhost() || localStore != std::shared_ptr<Store>(destStore)) {
+            updateStep(ssReceivingOutputs);
+
+            MaintainCount<counter> mc(nrStepsCopyingFrom);
+
+            auto now1 = std::chrono::steady_clock::now();
+
+            size_t totalNarSize = 0;
+            auto infos = build_remote::queryPathInfos(conn, *localStore, outputs, totalNarSize);
+
+            if (totalNarSize > maxOutputSize) {
+                result.stepStatus = bsNarSizeLimitExceeded;
+                return;
+            }
+
+            /* Copy each path. */
+            printMsg(lvlDebug, "copying outputs of ‘%s’ from ‘%s’ (%d bytes)",
+                localStore->printStorePath(step->drvPath), machine->sshName, totalNarSize);
+
+            build_remote::copyPathsFromRemote(conn, narMembers, *localStore, *destStore, infos);
+            auto now2 = std::chrono::steady_clock::now();
+
+            result.overhead += std::chrono::duration_cast<std::chrono::milliseconds>(now2 - now1).count();
+        }
+
+        /* Register the outputs of the newly built drv */
+        if (experimentalFeatureSettings.isEnabled(Xp::CaDerivations)) {
+            auto outputHashes = staticOutputHashes(*localStore, *step->drv);
+            for (auto & [outputName, realisation] : buildResult.builtOutputs) {
+                // Register the resolved drv output
+                destStore->registerDrvOutput(realisation);
+
+                // Also register the unresolved one
+                auto unresolvedRealisation = realisation;
+                unresolvedRealisation.signatures.clear();
+                unresolvedRealisation.id.drvHash = outputHashes.at(outputName);
+                destStore->registerDrvOutput(unresolvedRealisation);
+            }
+        }
+
+        /* Shut down the connection. */
+        child->in = -1;
+        child->sshPid.wait();
+
    } catch (Error & e) {
        /* Disable this machine until a certain period of time has
           passed. This period increases on every consecutive
@@ -208,7 +641,7 @@ void State::buildRemote(ref<Store> destStore,
            info->consecutiveFailures = std::min(info->consecutiveFailures + 1, (unsigned int) 4);
            info->lastFailure = now;
            int delta = retryInterval * std::pow(retryBackoff, info->consecutiveFailures - 1) + (rand() % 30);
-            printMsg(lvlInfo, "will disable machine ‘%1%’ for %2%s", machine->storeUri.render(), delta);
+            printMsg(lvlInfo, "will disable machine ‘%1%’ for %2%s", machine->sshName, delta);
            info->disabledUntil = now + std::chrono::seconds(delta);
        }
        throw;
--- a/src/hydra-queue-runner/builder.cc
+++ b/src/hydra-queue-runner/builder.cc
@@ -3,7 +3,6 @@
 #include "state.hh"
 #include "hydra-build-result.hh"
 #include "finally.hh"
-#include "terminal.hh"
 #include "binary-cache-store.hh"

 using namespace nix;
@@ -38,19 +37,22 @@ void State::builder(MachineReservation::ptr reservation)

        try {
            auto destStore = getDestStore();
+            // Might release the reservation.
            res = doBuildStep(destStore, reservation, activeStep);
        } catch (std::exception & e) {
            printMsg(lvlError, "uncaught exception building ‘%s’ on ‘%s’: %s",
-                localStore->printStorePath(reservation->step->drvPath),
-                reservation->machine->storeUri.render(),
+                localStore->printStorePath(activeStep->step->drvPath),
+                reservation ? reservation->machine->sshName : std::string("(no machine)"),
                e.what());
        }
    }

-    /* Release the machine and wake up the dispatcher. */
-    assert(reservation.unique());
-    reservation = 0;
-    wakeDispatcher();
+    /* If the machine hasn't been released yet, release and wake up the dispatcher. */
+    if (reservation) {
+        assert(reservation.unique());
+        reservation = 0;
+        wakeDispatcher();
+    }

    /* If there was a temporary failure, retry the step after an
       exponentially increasing interval. */
@@ -73,11 +75,11 @@ void State::builder(MachineReservation::ptr reservation)


 State::StepResult State::doBuildStep(nix::ref<Store> destStore,
-    MachineReservation::ptr reservation,
+    MachineReservation::ptr & reservation,
    std::shared_ptr<ActiveStep> activeStep)
 {
-    auto & step(reservation->step);
-    auto & machine(reservation->machine);
+    auto step(reservation->step);
+    auto machine(reservation->machine);

    {
        auto step_(step->state.lock());
@@ -151,7 +153,7 @@ State::StepResult State::doBuildStep(nix::ref<Store> destStore,
        buildOptions.buildTimeout = build->buildTimeout;

        printInfo("performing step ‘%s’ %d times on ‘%s’ (needed by build %d and %d others)",
-            localStore->printStorePath(step->drvPath), buildOptions.nrRepeats + 1, machine->storeUri.render(), buildId, (dependents.size() - 1));
+            localStore->printStorePath(step->drvPath), buildOptions.nrRepeats + 1, machine->sshName, buildId, (dependents.size() - 1));
    }

    if (!buildOneDone)
@@ -179,7 +181,7 @@ State::StepResult State::doBuildStep(nix::ref<Store> destStore,
                    unlink(result.logFile.c_str());
                }
            } catch (...) {
-                ignoreExceptionInDestructor();
+                ignoreException();
            }
        }
    });
@@ -197,7 +199,7 @@ State::StepResult State::doBuildStep(nix::ref<Store> destStore,
        {
            auto mc = startDbUpdate();
            pqxx::work txn(*conn);
-            stepNr = createBuildStep(txn, result.startTime, buildId, step, machine->storeUri.render(), bsBusy);
+            stepNr = createBuildStep(txn, result.startTime, buildId, step, machine->sshName, bsBusy);
            txn.commit();
        }

@@ -212,7 +214,7 @@ State::StepResult State::doBuildStep(nix::ref<Store> destStore,

        try {
            /* FIXME: referring builds may have conflicting timeouts. */
-            buildRemote(destStore, machine, step, buildOptions, result, activeStep, updateStep, narMembers);
+            buildRemote(destStore, reservation, machine, step, buildOptions, result, activeStep, updateStep, narMembers);
        } catch (Error & e) {
            if (activeStep->state_.lock()->cancelled) {
                printInfo("marking step %d of build %d as cancelled", stepNr, buildId);
@@ -220,7 +222,7 @@ State::StepResult State::doBuildStep(nix::ref<Store> destStore,
                result.canRetry = false;
            } else {
                result.stepStatus = bsAborted;
-                result.errorMsg = filterANSIEscapes(e.msg(), true);
+                result.errorMsg = e.msg();
                result.canRetry = true;
            }
        }
@@ -254,7 +256,7 @@ State::StepResult State::doBuildStep(nix::ref<Store> destStore,
    /* Finish the step in the database. */
    if (stepNr) {
        pqxx::work txn(*conn);
-        finishBuildStep(txn, result, buildId, stepNr, machine->storeUri.render());
+        finishBuildStep(txn, result, buildId, stepNr, machine->sshName);
        txn.commit();
    }

@@ -262,7 +264,7 @@ State::StepResult State::doBuildStep(nix::ref<Store> destStore,
       issue). Retry a number of times. */
    if (result.canRetry) {
        printMsg(lvlError, "possibly transient failure building ‘%s’ on ‘%s’: %s",
-            localStore->printStorePath(step->drvPath), machine->storeUri.render(), result.errorMsg);
+            localStore->printStorePath(step->drvPath), machine->sshName, result.errorMsg);
        assert(stepNr);
        bool retry;
        {
@@ -453,7 +455,7 @@ void State::failStep(
                    build->finishedInDB)
                    continue;
                createBuildStep(txn,
-                    0, build->id, step, machine ? machine->storeUri.render() : "",
+                    0, build->id, step, machine ? machine->sshName : "",
                    result.stepStatus, result.errorMsg, buildId == build->id ? 0 : buildId);
            }

--- a/src/hydra-queue-runner/dispatcher.cc
+++ b/src/hydra-queue-runner/dispatcher.cc
@@ -40,13 +40,15 @@ void State::dispatcher()
            printMsg(lvlDebug, "dispatcher woken up");
            nrDispatcherWakeups++;

-            auto now1 = std::chrono::steady_clock::now();
+            auto t_before_work = std::chrono::steady_clock::now();

            auto sleepUntil = doDispatch();

-            auto now2 = std::chrono::steady_clock::now();
+            auto t_after_work = std::chrono::steady_clock::now();

-            dispatchTimeMs += std::chrono::duration_cast<std::chrono::milliseconds>(now2 - now1).count();
+            prom.dispatcher_time_spent_running.Increment(
+                std::chrono::duration_cast<std::chrono::microseconds>(t_after_work - t_before_work).count());
+            dispatchTimeMs += std::chrono::duration_cast<std::chrono::milliseconds>(t_after_work - t_before_work).count();

            /* Sleep until we're woken up (either because a runnable build
               is added, or because a build finishes). */
@@ -60,6 +62,10 @@ void State::dispatcher()
                *dispatcherWakeup_ = false;
            }

+            auto t_after_sleep = std::chrono::steady_clock::now();
+            prom.dispatcher_time_spent_waiting.Increment(
+                std::chrono::duration_cast<std::chrono::microseconds>(t_after_sleep - t_after_work).count());
+
        } catch (std::exception & e) {
            printError("dispatcher: %s", e.what());
            sleep(1);
@@ -256,7 +262,7 @@ system_time State::doDispatch()
                /* Can this machine do this step? */
                if (!mi.machine->supportsStep(step)) {
                    debug("machine '%s' does not support step '%s' (system type '%s')",
-                        mi.machine->storeUri.render(), localStore->printStorePath(step->drvPath), step->drv->platform);
+                        mi.machine->sshName, localStore->printStorePath(step->drvPath), step->drv->platform);
                    continue;
                }

--- a/src/hydra-queue-runner/hydra-queue-runner.cc
+++ b/src/hydra-queue-runner/hydra-queue-runner.cc
@@ -70,10 +70,31 @@ State::PromMetrics::PromMetrics()
            .Register(*registry)
            .Add({})
    )
-    , queue_max_id(
-        prometheus::BuildGauge()
-            .Name("hydraqueuerunner_queue_max_build_id_info")
-            .Help("Maximum build record ID in the queue")
+    , dispatcher_time_spent_running(
+        prometheus::BuildCounter()
+            .Name("hydraqueuerunner_dispatcher_time_spent_running")
+            .Help("Time (in micros) spent running the dispatcher")
+            .Register(*registry)
+            .Add({})
+    )
+    , dispatcher_time_spent_waiting(
+        prometheus::BuildCounter()
+            .Name("hydraqueuerunner_dispatcher_time_spent_waiting")
+            .Help("Time (in micros) spent waiting for the dispatcher to obtain work")
+            .Register(*registry)
+            .Add({})
+    )
+    , queue_monitor_time_spent_running(
+        prometheus::BuildCounter()
+            .Name("hydraqueuerunner_queue_monitor_time_spent_running")
+            .Help("Time (in micros) spent running the queue monitor")
+            .Register(*registry)
+            .Add({})
+    )
+    , queue_monitor_time_spent_waiting(
+        prometheus::BuildCounter()
+            .Name("hydraqueuerunner_queue_monitor_time_spent_waiting")
+            .Help("Time (in micros) spent waiting for the queue monitor to obtain work")
            .Register(*registry)
            .Add({})
    )
@@ -85,6 +106,7 @@ State::State(std::optional<std::string> metricsAddrOpt)
    : config(std::make_unique<HydraConfig>())
    , maxUnsupportedTime(config->getIntOption("max_unsupported_time", 0))
    , dbPool(config->getIntOption("max_db_connections", 128))
+    , localWorkThrottler(config->getIntOption("max_local_worker_threads", std::min(maxSupportedLocalWorkers, std::max(4u, std::thread::hardware_concurrency()) - 2)))
    , maxOutputSize(config->getIntOption("max_output_size", 2ULL << 30))
    , maxLogSize(config->getIntOption("max_log_size", 64ULL << 20))
    , uploadLogsToBinaryCache(config->getBoolOption("upload_logs_to_binary_cache", false))
@@ -135,26 +157,65 @@ void State::parseMachines(const std::string & contents)
        oldMachines = *machines_;
    }

-    for (auto && machine_ : nix::Machine::parseConfig({}, contents)) {
-        auto machine = std::make_shared<::Machine>(std::move(machine_));
+    for (auto line : tokenizeString<Strings>(contents, "\n")) {
+        line = trim(std::string(line, 0, line.find('#')));
+        auto tokens = tokenizeString<std::vector<std::string>>(line);
+        if (tokens.size() < 3) continue;
+        tokens.resize(8);
+
+        if (tokens[5] == "-") tokens[5] = "";
+        auto supportedFeatures = tokenizeString<StringSet>(tokens[5], ",");
+
+        if (tokens[6] == "-") tokens[6] = "";
+        auto mandatoryFeatures = tokenizeString<StringSet>(tokens[6], ",");
+
+        for (auto & f : mandatoryFeatures)
+            supportedFeatures.insert(f);
+
+        using MaxJobs = std::remove_const<decltype(nix::Machine::maxJobs)>::type;
+
+        auto machine = std::make_shared<::Machine>(nix::Machine {
+            // `storeUri`, not yet used
+            "",
+            // `systemTypes`
+            tokenizeString<StringSet>(tokens[1], ","),
+            // `sshKey`
+            tokens[2] == "-" ? "" : tokens[2],
+            // `maxJobs`
+            tokens[3] != ""
+                ? string2Int<MaxJobs>(tokens[3]).value()
+                : 1,
+            // `speedFactor`
+            std::stof(tokens[4].c_str()),
+            // `supportedFeatures`
+            std::move(supportedFeatures),
+            // `mandatoryFeatures`
+            std::move(mandatoryFeatures),
+            // `sshPublicHostKey`
+            tokens[7] != "" && tokens[7] != "-"
+                ? tokens[7]
+                : "",
+        });
+
+        machine->sshName = tokens[0];

        /* Re-use the State object of the previous machine with the
           same name. */
-        auto i = oldMachines.find(machine->storeUri.variant);
+        auto i = oldMachines.find(machine->sshName);
        if (i == oldMachines.end())
-            printMsg(lvlChatty, "adding new machine ‘%1%’", machine->storeUri.render());
+            printMsg(lvlChatty, "adding new machine ‘%1%’", machine->sshName);
        else
-            printMsg(lvlChatty, "updating machine ‘%1%’", machine->storeUri.render());
+            printMsg(lvlChatty, "updating machine ‘%1%’", machine->sshName);
        machine->state = i == oldMachines.end()
            ? std::make_shared<::Machine::State>()
            : i->second->state;
-        newMachines[machine->storeUri.variant] = machine;
+        newMachines[machine->sshName] = machine;
    }

    for (auto & m : oldMachines)
        if (newMachines.find(m.first) == newMachines.end()) {
            if (m.second->enabled)
-                printInfo("removing machine ‘%1%’", m.second->storeUri.render());
+                printInfo("removing machine ‘%1%’", m.first);
            /* Add a disabled ::Machine object to make sure stats are
               maintained. */
            auto machine = std::make_shared<::Machine>(*(m.second));
@@ -550,11 +611,13 @@ void State::dumpStatus(Connection & conn)
        {"nrQueuedBuilds", builds.lock()->size()},
        {"nrActiveSteps", activeSteps_.lock()->size()},
        {"nrStepsBuilding", nrStepsBuilding.load()},
-        #if 0
        {"nrStepsCopyingTo", nrStepsCopyingTo.load()},
+        {"nrStepsWaitingForDownloadSlot", nrStepsWaitingForDownloadSlot.load()},
        {"nrStepsCopyingFrom", nrStepsCopyingFrom.load()},
-        #endif
+        {"nrStepsWaiting", nrStepsWaiting.load()},
        {"nrUnsupportedSteps", nrUnsupportedSteps.load()},
+        {"bytesSent", bytesSent.load()},
+        {"bytesReceived", bytesReceived.load()},
        {"nrBuildsRead", nrBuildsRead.load()},
        {"buildReadTimeMs", buildReadTimeMs.load()},
        {"buildReadTimeAvgMs", nrBuildsRead == 0 ? 0.0 : (float) buildReadTimeMs / nrBuildsRead},
@@ -617,7 +680,7 @@ void State::dumpStatus(Connection & conn)
                    machine["avgStepTime"] = (float) s->totalStepTime / s->nrStepsDone;
                    machine["avgStepBuildTime"] = (float) s->totalStepBuildTime / s->nrStepsDone;
                }
-                statusJson["machines"][m->storeUri.render()] = machine;
+                statusJson["machines"][m->sshName] = machine;
            }
        }

--- a/src/hydra-queue-runner/queue-monitor.cc
+++ b/src/hydra-queue-runner/queue-monitor.cc
@@ -1,6 +1,7 @@
 #include "state.hh"
 #include "hydra-build-result.hh"
 #include "globals.hh"
+#include "thread-pool.hh"

 #include <cstring>

@@ -37,16 +38,21 @@ void State::queueMonitorLoop(Connection & conn)

    auto destStore = getDestStore();

-    unsigned int lastBuildId = 0;
-
    bool quit = false;
    while (!quit) {
+        auto t_before_work = std::chrono::steady_clock::now();
+
        localStore->clearPathInfoCache();

-        bool done = getQueuedBuilds(conn, destStore, lastBuildId);
+        bool done = getQueuedBuilds(conn, destStore);

        if (buildOne && buildOneDone) quit = true;

+        auto t_after_work = std::chrono::steady_clock::now();
+
+        prom.queue_monitor_time_spent_running.Increment(
+            std::chrono::duration_cast<std::chrono::microseconds>(t_after_work - t_before_work).count());
+
        /* Sleep until we get notification from the database about an
           event. */
        if (done && !quit) {
@@ -56,12 +62,10 @@ void State::queueMonitorLoop(Connection & conn)
            conn.get_notifs();

        if (auto lowestId = buildsAdded.get()) {
-            lastBuildId = std::min(lastBuildId, static_cast<unsigned>(std::stoul(*lowestId) - 1));
            printMsg(lvlTalkative, "got notification: new builds added to the queue");
        }
        if (buildsRestarted.get()) {
            printMsg(lvlTalkative, "got notification: builds restarted");
-            lastBuildId = 0; // check all builds
        }
        if (buildsCancelled.get() || buildsDeleted.get() || buildsBumped.get()) {
            printMsg(lvlTalkative, "got notification: builds cancelled or bumped");
@@ -71,6 +75,10 @@ void State::queueMonitorLoop(Connection & conn)
            printMsg(lvlTalkative, "got notification: jobset shares changed");
            processJobsetSharesChange(conn);
        }
+
+        auto t_after_sleep = std::chrono::steady_clock::now();
+        prom.queue_monitor_time_spent_waiting.Increment(
+            std::chrono::duration_cast<std::chrono::microseconds>(t_after_sleep - t_after_work).count());
    }

    exit(0);
@@ -84,20 +92,18 @@ struct PreviousFailure : public std::exception {


 bool State::getQueuedBuilds(Connection & conn,
-    ref<Store> destStore, unsigned int & lastBuildId)
+    ref<Store> destStore)
 {
    prom.queue_checks_started.Increment();

-    printInfo("checking the queue for builds > %d...", lastBuildId);
+    printInfo("checking the queue for builds...");

    /* Grab the queued builds from the database, but don't process
       them yet (since we don't want a long-running transaction). */
    std::vector<BuildID> newIDs;
-    std::map<BuildID, Build::ptr> newBuildsByID;
+    std::unordered_map<BuildID, Build::ptr> newBuildsByID;
    std::multimap<StorePath, BuildID> newBuildsByPath;

-    unsigned int newLastBuildId = lastBuildId;
-
    {
        pqxx::work txn(conn);

@@ -106,17 +112,12 @@ bool State::getQueuedBuilds(Connection & conn,
             "jobsets.name as jobset, job, drvPath, maxsilent, timeout, timestamp, "
             "globalPriority, priority from Builds "
             "inner join jobsets on builds.jobset_id = jobsets.id "
-             "where builds.id > $1 and finished = 0 order by globalPriority desc, builds.id",
-            lastBuildId);
+             "where finished = 0 order by globalPriority desc, random()");

        for (auto const & row : res) {
            auto builds_(builds.lock());
            BuildID id = row["id"].as<BuildID>();
            if (buildOne && id != buildOne) continue;
-            if (id > newLastBuildId) {
-                newLastBuildId = id;
-                prom.queue_max_id.Set(id);
-            }
            if (builds_->count(id)) continue;

            auto build = std::make_shared<Build>(
@@ -318,15 +319,13 @@ bool State::getQueuedBuilds(Connection & conn,

        /* Stop after a certain time to allow priority bumps to be
           processed. */
-        if (std::chrono::system_clock::now() > start + std::chrono::seconds(600)) {
+        if (std::chrono::system_clock::now() > start + std::chrono::seconds(60)) {
            prom.queue_checks_early_exits.Increment();
            break;
        }
    }

    prom.queue_checks_finished.Increment();
-
-    lastBuildId = newBuildsByID.empty() ? newLastBuildId : newBuildsByID.begin()->first - 1;
    return newBuildsByID.empty();
 }

@@ -405,6 +404,34 @@ void State::processQueueChange(Connection & conn)
 }


+/* Return the entries of `paths` that are missing from `destStore`: every
+   output without a known store path, plus every output whose path
+   `destStore` does not report as valid. The validity checks are enqueued
+   on a ThreadPool and run by `process()` before the result is returned. */
+std::map<DrvOutput, std::optional<StorePath>> State::getMissingRemotePaths(
+    ref<Store> destStore,
+    const std::map<DrvOutput, std::optional<StorePath>> & paths)
+{
+    Sync<std::map<DrvOutput, std::optional<StorePath>>> missingPaths;
+    ThreadPool pool;
+
+    for (auto & [drvOutput, maybePath] : paths) {
+        /* No known path means there is nothing to query; record it as
+           missing right away. */
+        if (!maybePath) {
+            missingPaths.lock()->insert({drvOutput, maybePath});
+            continue;
+        }
+
+        /* The bindings refer to elements of `paths`, which outlives the
+           pool, so capturing them by reference is fine here. */
+        pool.enqueue([&] {
+            if (destStore->isValidPath(*maybePath))
+                return;
+            missingPaths.lock()->insert({drvOutput, maybePath});
+        });
+    }
+
+    pool.process();
+
+    return *missingPaths.lock();
+}
+
+
 Step::ptr State::createStep(ref<Store> destStore,
    Connection & conn, Build::ptr build, const StorePath & drvPath,
    Build::ptr referringBuild, Step::ptr referringStep, std::set<StorePath> & finishedDrvs,
@@ -485,16 +512,15 @@ Step::ptr State::createStep(ref<Store> destStore,

    /* Are all outputs valid? */
    auto outputHashes = staticOutputHashes(*localStore, *(step->drv));
-    bool valid = true;
-    std::map<DrvOutput, std::optional<StorePath>> missing;
+    std::map<DrvOutput, std::optional<StorePath>> paths;
    for (auto & [outputName, maybeOutputPath] : destStore->queryPartialDerivationOutputMap(drvPath, &*localStore)) {
        auto outputHash = outputHashes.at(outputName);
-        if (maybeOutputPath && destStore->isValidPath(*maybeOutputPath))
-            continue;
-        valid = false;
-        missing.insert({{outputHash, outputName}, maybeOutputPath});
+        paths.insert({{outputHash, outputName}, maybeOutputPath});
    }

+    auto missing = getMissingRemotePaths(destStore, paths);
+    bool valid = missing.empty();
+
    /* Try to copy the missing paths from the local store or from
       substitutes. */
    if (!missing.empty()) {
--- a/src/hydra-queue-runner/state.hh
+++ b/src/hydra-queue-runner/state.hh
@@ -6,6 +6,8 @@
 #include <map>
 #include <memory>
 #include <queue>
+#include <regex>
+#include <semaphore>

 #include <prometheus/counter.h>
 #include <prometheus/gauge.h>
@@ -21,6 +23,8 @@
 #include "sync.hh"
 #include "nar-extractor.hh"
 #include "serve-protocol.hh"
+#include "serve-protocol-impl.hh"
+#include "serve-protocol-connection.hh"
 #include "machines.hh"


@@ -55,6 +59,7 @@ typedef enum {
    ssConnecting = 10,
    ssSendingInputs = 20,
    ssBuilding = 30,
+    ssWaitingForLocalSlot = 35,
    ssReceivingOutputs = 40,
    ssPostProcessing = 50,
 } StepState;
@@ -238,6 +243,10 @@ struct Machine : nix::Machine
 {
    typedef std::shared_ptr<Machine> ptr;

+    /* TODO Get rid of: `nix::Machine::storeUri` is normalized in a way
+       we are not yet used to, but once we are, we don't need this. */
+    std::string sshName;
+
    struct State {
        typedef std::shared_ptr<State> ptr;
        counter currentJobs{0};
@@ -287,7 +296,17 @@ struct Machine : nix::Machine
        return true;
    }

-    bool isLocalhost() const;
+    /* Whether this machine is the local host, i.e. whether `sshName`
+       is `localhost`, optionally prefixed by `ssh://` or `ssh-ng://`.
+       The check does not modify the machine, so it stays `const` like
+       the declaration it replaces. */
+    bool isLocalhost() const
+    {
+        /* Compile the pattern once instead of on every call. */
+        static const std::regex localhostRegex("^(ssh://|ssh-ng://)?localhost$");
+        return std::regex_search(sshName, localhostRegex);
+    }
+
+    /* A connection to a machine: a
+       `nix::ServeProto::BasicClientConnection` that additionally
+       records which machine it belongs to. */
+    struct Connection : nix::ServeProto::BasicClientConnection {
+        /* Backpointer to the machine (a `Machine::ptr`, i.e. a
+           `std::shared_ptr<Machine>`). */
+        ptr machine;
+    };
 };


@@ -341,9 +360,13 @@ private:

    /* The build machines. */
    std::mutex machinesReadyLock;
-    typedef std::map<nix::StoreReference::Variant, Machine::ptr> Machines;
+    typedef std::map<std::string, Machine::ptr> Machines;
    nix::Sync<Machines> machines; // FIXME: use atomic_shared_ptr

+    /* Throttler for CPU-bound local work. */
+    static constexpr unsigned int maxSupportedLocalWorkers = 1024;
+    std::counting_semaphore<maxSupportedLocalWorkers> localWorkThrottler;
+
    /* Various stats. */
    time_t startedAt;
    counter nrBuildsRead{0};
@@ -352,10 +375,10 @@ private:
    counter nrStepsStarted{0};
    counter nrStepsDone{0};
    counter nrStepsBuilding{0};
-    #if 0
    counter nrStepsCopyingTo{0};
+    counter nrStepsWaitingForDownloadSlot{0};
    counter nrStepsCopyingFrom{0};
-    #endif
+    counter nrStepsWaiting{0};
    counter nrUnsupportedSteps{0};
    counter nrRetries{0};
    counter maxNrRetries{0};
@@ -364,6 +387,8 @@ private:
    counter nrQueueWakeups{0};
    counter nrDispatcherWakeups{0};
    counter dispatchTimeMs{0};
+    counter bytesSent{0};
+    counter bytesReceived{0};
    counter nrActiveDbUpdates{0};

    /* Specific build to do for --build-one (testing only). */
@@ -440,7 +465,12 @@ private:
        prometheus::Counter& queue_steps_created;
        prometheus::Counter& queue_checks_early_exits;
        prometheus::Counter& queue_checks_finished;
-        prometheus::Gauge& queue_max_id;
+
+        prometheus::Counter& dispatcher_time_spent_running;
+        prometheus::Counter& dispatcher_time_spent_waiting;
+
+        prometheus::Counter& queue_monitor_time_spent_running;
+        prometheus::Counter& queue_monitor_time_spent_waiting;

        PromMetrics();
    };
@@ -484,8 +514,7 @@ private:
    void queueMonitorLoop(Connection & conn);

    /* Check the queue for new builds. */
-    bool getQueuedBuilds(Connection & conn,
-        nix::ref<nix::Store> destStore, unsigned int & lastBuildId);
+    bool getQueuedBuilds(Connection & conn, nix::ref<nix::Store> destStore);

    /* Handle cancellation, deletion and priority bumps. */
    void processQueueChange(Connection & conn);
@@ -493,6 +522,12 @@ private:
    BuildOutput getBuildOutputCached(Connection & conn, nix::ref<nix::Store> destStore,
        const nix::StorePath & drvPath);

+    /* Returns paths missing from the remote store. Paths are processed in
+     * parallel to work around the possible latency of remote stores. */
+    std::map<nix::DrvOutput, std::optional<nix::StorePath>> getMissingRemotePaths(
+        nix::ref<nix::Store> destStore,
+        const std::map<nix::DrvOutput, std::optional<nix::StorePath>> & paths);
+
    Step::ptr createStep(nix::ref<nix::Store> store,
        Connection & conn, Build::ptr build, const nix::StorePath & drvPath,
        Build::ptr referringBuild, Step::ptr referringStep, std::set<nix::StorePath> & finishedDrvs,
@@ -528,10 +563,11 @@ private:
       retried. */
    enum StepResult { sDone, sRetry, sMaybeCancelled };
    StepResult doBuildStep(nix::ref<nix::Store> destStore,
-        MachineReservation::ptr reservation,
+        MachineReservation::ptr & reservation,
        std::shared_ptr<ActiveStep> activeStep);

    void buildRemote(nix::ref<nix::Store> destStore,
+        MachineReservation::ptr & reservation,
        Machine::ptr machine, Step::ptr step,
        const nix::ServeProto::BuildOptions & buildOptions,
        RemoteResult & result, std::shared_ptr<ActiveStep> activeStep,
--- a/src/lib/Hydra/Controller/Build.pm
+++ b/src/lib/Hydra/Controller/Build.pm
@@ -238,7 +238,7 @@ sub serveFile {
        # XSS hole.
        $c->response->header('Content-Security-Policy' => 'sandbox allow-scripts');

-        $c->stash->{'plain'} = { data => grab(cmd => ["nix", "--experimental-features", "nix-command",
+        $c->stash->{'plain'} = { data => readIntoSocket(cmd => ["nix", "--experimental-features", "nix-command",
                                                      "store", "cat", "--store", getStoreUri(), "$path"]) };

        # Detect MIME type.
--- a/src/lib/Hydra/Controller/Jobset.pm
+++ b/src/lib/Hydra/Controller/Jobset.pm
@@ -364,6 +364,21 @@ sub evals_GET {
    );
 }

+# REST action for the `errors` path below the jobset chain. The action body is
+# intentionally empty; the GET handling lives in errors_GET below.
+sub errors :Chained('jobsetChain') :PathPart('errors') :Args(0) :ActionClass('REST') { }
+
+# Handle GET on the jobset `errors` path: stash the jobset, including its
+# `errormsg` column, and render it with the `eval-error.tt` template.
+sub errors_GET {
+    my ($self, $c) = @_;
+
+    $c->stash->{template} = 'eval-error.tt';
+
+    # Look the jobset up again by name, explicitly requesting `errormsg`
+    # through `+columns`.
+    # NOTE(review): assumes `params->{name}` was stashed earlier in the chain
+    # -- confirm against jobsetChain.
+    my $jobsetName = $c->stash->{params}->{name};
+    $c->stash->{jobset} = $c->stash->{project}->jobsets->find(
+        { name => $jobsetName },
+        { '+columns' => { 'errormsg' => 'errormsg' } }
+    );
+
+    $self->status_ok($c, entity => $c->stash->{jobset});
+}

 # Redirect to the latest finished evaluation of this jobset.
 sub latest_eval : Chained('jobsetChain') PathPart('latest-eval') {
--- a/src/lib/Hydra/Controller/JobsetEval.pm
+++ b/src/lib/Hydra/Controller/JobsetEval.pm
@@ -86,6 +86,17 @@ sub view_GET {
    );
 }

+# REST action for the `errors` path below the eval chain. The action body is
+# intentionally empty; the GET handling lives in errors_GET below.
+sub errors :Chained('evalChain') :PathPart('errors') :Args(0) :ActionClass('REST') { }
+
+# Handle GET on the eval `errors` path: reload the stashed eval together with
+# its `evaluationerror` relation and render it with the `eval-error.tt`
+# template.
+sub errors_GET {
+    my ($self, $c) = @_;
+
+    $c->stash->{template} = 'eval-error.tt';
+
+    # Replace the stashed eval with a copy fetched by id that prefetches the
+    # `evaluationerror` relation.
+    $c->stash->{eval} = $c->model('DB::JobsetEvals')->find($c->stash->{eval}->id, { prefetch => 'evaluationerror' });
+
+    $self->status_ok($c, entity => $c->stash->{eval});
+}

 sub create_jobset : Chained('evalChain') PathPart('create-jobset') Args(0) {
    my ($self, $c) = @_;
--- a/src/lib/Hydra/Controller/Root.pm
+++ b/src/lib/Hydra/Controller/Root.pm
@@ -162,7 +162,7 @@ sub status_GET {
            { "buildsteps.busy" => { '!=', 0 } },
            { order_by => ["globalpriority DESC", "id"],
              join => "buildsteps",
-              columns => [@buildListColumns]
+              columns => [@buildListColumns, 'buildsteps.drvpath', 'buildsteps.type']
            })]
    );
 }
--- a/src/lib/Hydra/Helper/BuildDiff.pm
+++ b/src/lib/Hydra/Helper/BuildDiff.pm
@@ -37,7 +37,16 @@ sub buildDiff {

    my $n = 0;
    foreach my $build (@{$builds}) {
-        my $aborted = $build->finished != 0 && ($build->buildstatus == 3 || $build->buildstatus == 4);
+        my $aborted = $build->finished != 0 && (
+            # aborted
+            $build->buildstatus == 3
+            # cancelled
+            || $build->buildstatus == 4
+            # timeout
+            || $build->buildstatus == 7
+            # log limit exceeded
+            || $build->buildstatus == 10
+        );
        my $d;
        my $found = 0;
        while ($n < scalar(@{$builds2})) {
@@ -79,4 +88,4 @@ sub buildDiff {
    return $ret;
 }

-1;
+1;
--- a/src/lib/Hydra/Helper/Nix.pm
+++ b/src/lib/Hydra/Helper/Nix.pm
@@ -36,6 +36,7 @@ our @EXPORT = qw(
    jobsetOverview
    jobsetOverview_
    pathIsInsidePrefix
+    readIntoSocket
    readNixFile
    registerRoot
    restartBuilds
@@ -296,8 +297,7 @@ sub getEvals {

    my @evals = $evals_result_set->search(
        { hasnewbuilds => 1 },
-        { order_by => "$me.id DESC", rows => $rows, offset => $offset
-        , prefetch => { evaluationerror => [ ] } });
+        { order_by => "$me.id DESC", rows => $rows, offset => $offset });
    my @res = ();
    my $cache = {};

@@ -417,6 +417,16 @@ sub pathIsInsidePrefix {
    return $cur;
 }

+# Start the command given as `cmd` (an array ref: the program followed by its
+# arguments) and return a read handle attached to the child's stdout, so the
+# caller can stream the output instead of buffering it. The list form of
+# open() is used, so the arguments are not passed through a shell.
+#
+# Failure to start the command is not fatal: the reason is logged with warn
+# and the handle is returned as-is, so callers should be prepared for a handle
+# that cannot be read from.
+sub readIntoSocket{
+    my (%args) = @_;
+    my $sock;
+
+    eval {
+        # Interpolate the OS error; the trailing newline keeps die from
+        # appending file/line information to the message.
+        open($sock, "-|", @{$args{cmd}})
+            or die "failed to open socket from command: $!\n";
+    };
+    # Stay best-effort for callers, but do not lose the reason for a failure.
+    warn $@ if $@;
+
+    return $sock;
+}



--- a/src/lib/Hydra/Schema/Result/EvaluationErrors.pm
+++ b/src/lib/Hydra/Schema/Result/EvaluationErrors.pm
@@ -105,4 +105,6 @@ __PACKAGE__->add_column(
    "+id" => { retrieve_on_insert => 1 }
 );

+__PACKAGE__->mk_group_accessors('column' => 'has_error');
+
 1;
--- a/src/lib/Hydra/Schema/Result/Jobsets.pm
+++ b/src/lib/Hydra/Schema/Result/Jobsets.pm
@@ -386,6 +386,8 @@ __PACKAGE__->add_column(
    "+id" => { retrieve_on_insert => 1 }
 );

+__PACKAGE__->mk_group_accessors('column' => 'has_error');
+
 sub supportsDynamicRunCommand {
  my ($self) = @_;

--- a/src/lib/Hydra/Schema/ResultSet/EvaluationErrors.pm
+++ b/src/lib/Hydra/Schema/ResultSet/EvaluationErrors.pm
@@ -0,0 +1,30 @@
+package Hydra::Schema::ResultSet::EvaluationErrors;
+
+use strict;
+use utf8;
+use warnings;
+
+use parent 'DBIx::Class::ResultSet';
+
+use Storable qw(dclone);
+
+__PACKAGE__->load_components('Helper::ResultSet::RemoveColumns');
+
+# Exclude expensive error message values unless explicitly requested, and
+# replace them with a summary field describing their presence/absence.
+#
+# Takes the ($query, $attrs) pair and forwards it to the parent search_rs
+# after adjusting a private copy of $attrs:
+#   * unless the caller supplied `select` or `columns`, a `has_error` column
+#     defined as "errormsg != ''" is added through `+columns`;
+#   * unless the caller requested `errormsg` through `+columns`, `errormsg`
+#     is appended to `remove_columns`.
+sub search_rs {
+  my ( $class, $query, $attrs ) = @_;
+
+  # Work on a deep copy so the caller's attribute hash is not modified.
+  if ($attrs) {
+    $attrs = dclone($attrs);
+  }
+
+  # NOTE(review): `+columns` is treated as a hash ref below; an array-ref
+  # `+columns` from a caller would presumably fail here -- confirm.
+  unless (exists $attrs->{'select'} || exists $attrs->{'columns'}) {
+    $attrs->{'+columns'}->{'has_error'} = "errormsg != ''";
+  }
+  unless (exists $attrs->{'+columns'}->{'errormsg'}) {
+    push @{ $attrs->{'remove_columns'} }, 'errormsg';
+  }
+
+  return $class->next::method($query, $attrs);
+}
--- a/src/lib/Hydra/Schema/ResultSet/Jobsets.pm
+++ b/src/lib/Hydra/Schema/ResultSet/Jobsets.pm
@@ -0,0 +1,30 @@
+package Hydra::Schema::ResultSet::Jobsets;
+
+use strict;
+use utf8;
+use warnings;
+
+use parent 'DBIx::Class::ResultSet';
+
+use Storable qw(dclone);
+
+__PACKAGE__->load_components('Helper::ResultSet::RemoveColumns');
+
+# Exclude expensive error message values unless explicitly requested, and
+# replace them with a summary field describing their presence/absence.
+#
+# Takes the ($query, $attrs) pair and forwards it to the parent search_rs
+# after adjusting a private copy of $attrs:
+#   * unless the caller supplied `select` or `columns`, a `has_error` column
+#     defined as "errormsg != ''" is added through `+columns`;
+#   * unless the caller requested `errormsg` through `+columns`, `errormsg`
+#     is appended to `remove_columns`.
+sub search_rs {
+  my ( $class, $query, $attrs ) = @_;
+
+  # Work on a deep copy so the caller's attribute hash is not modified.
+  if ($attrs) {
+    $attrs = dclone($attrs);
+  }
+
+  # NOTE(review): `+columns` is treated as a hash ref below; an array-ref
+  # `+columns` from a caller would presumably fail here -- confirm.
+  unless (exists $attrs->{'select'} || exists $attrs->{'columns'}) {
+    $attrs->{'+columns'}->{'has_error'} = "errormsg != ''";
+  }
+  unless (exists $attrs->{'+columns'}->{'errormsg'}) {
+    push @{ $attrs->{'remove_columns'} }, 'errormsg';
+  }
+
+  return $class->next::method($query, $attrs);
+}
--- a/src/meson.build
+++ b/src/meson.build
@@ -1,6 +1,5 @@
 # Native code
 subdir('libhydra')
-subdir('hydra-build-step')
 subdir('hydra-evaluator')
 subdir('hydra-queue-runner')

--- a/src/root/build.tt
+++ b/src/root/build.tt
@@ -61,21 +61,7 @@ END;
            <td>[% IF step.busy != 0 || ((step.machine || step.starttime) && (step.status == 0 || step.status == 1 || step.status == 3 || step.status == 4 || step.status == 7)); INCLUDE renderMachineName machine=step.machine; ELSE; "<em>n/a</em>"; END %]</td>
            <td class="step-status">
              [% IF step.busy != 0 %]
-                [% IF step.busy == 1 %]
-                  <strong>Preparing</strong>
-                [% ELSIF step.busy == 10 %]
-                  <strong>Connecting</strong>
-                [% ELSIF step.busy == 20 %]
-                  <strong>Sending inputs</strong>
-                [% ELSIF step.busy == 30 %]
-                  <strong>Building</strong>
-                [% ELSIF step.busy == 40 %]
-                  <strong>Receiving outputs</strong>
-                [% ELSIF step.busy == 50 %]
-                  <strong>Post-processing</strong>
-                [% ELSE %]
-                  <strong>Unknown state</strong>
-                [% END %]
+                [% INCLUDE renderBusyStatus %]
              [% ELSIF step.status == 0 %]
                [% IF step.isnondeterministic %]
                  <span class="warn">Succeeded with non-determistic result</span>
--- a/src/root/common.tt
+++ b/src/root/common.tt
@@ -91,6 +91,17 @@ BLOCK renderDuration;
  duration % 60 %]s[%
 END;

+# Render a short note of the form (Build of NAME) or (Substitution of NAME)
+# for the given step. NAME is derived from step.drvpath; nothing is rendered
+# when NAME is empty or equal to releasename. A step.type of 0 is labelled
+# Build, any other value Substitution.
+BLOCK renderDrvInfo;
+  drvname = step.drvpath
+    .substr(11) # strip `/nix/store/`
+    .split('-').slice(1).join("-") # strip hash part
+    .substr(0, -4); # strip `.drv`
+  IF drvname != releasename;
+    IF step.type == 0; action = "Build"; ELSE; action = "Substitution"; END;
+    IF drvname; %]<em> ([% action %] of [% drvname %])</em>[% END;
+  END;
+END;
+

 BLOCK renderBuildListHeader %]
  <table class="table table-striped table-condensed clickable-rows">
@@ -131,7 +142,12 @@ BLOCK renderBuildListBody;
      [% END %]
      <td><a class="row-link" href="[% link %]">[% build.id %]</a></td>
      [% IF !hideJobName %]
-        <td><a href="[%link%]">[% IF !hideJobsetName %][%build.jobset.get_column("project")%]:[%build.jobset.get_column("name")%]:[% END %][%build.get_column("job")%]</td>
+        <td>
+          <a href="[%link%]">[% IF !hideJobsetName %][%build.jobset.get_column("project")%]:[%build.jobset.get_column("name")%]:[% END %][%build.get_column("job")%]</a>
+          [% IF showStepName %]
+            [% INCLUDE renderDrvInfo step=build.buildsteps releasename=build.nixname %]
+          [% END %]
+        </td>
      [% END %]
      <td class="nowrap">[% t = showSchedulingInfo ? build.timestamp : build.stoptime; IF t; INCLUDE renderRelativeDate timestamp=(showSchedulingInfo ? build.timestamp : build.stoptime); ELSE; "-"; END %]</td>
      <td>[% !showSchedulingInfo and build.get_column('releasename') ? build.get_column('releasename') : build.nixname %]</td>
@@ -245,6 +261,27 @@ BLOCK renderBuildStatusIcon;
 END;


+# Render a human-readable label for the numeric step.busy value. The values
+# 10, 20, 30, 35, 40 and 50 correspond to the queue runner's StepState enum
+# (35 being ssWaitingForLocalSlot); any value not listed here is rendered as
+# Unknown state.
+BLOCK renderBusyStatus;
+  IF step.busy == 1 %]
+    <strong>Preparing</strong>
+  [% ELSIF step.busy == 10 %]
+    <strong>Connecting</strong>
+  [% ELSIF step.busy == 20 %]
+    <strong>Sending inputs</strong>
+  [% ELSIF step.busy == 30 %]
+    <strong>Building</strong>
+  [% ELSIF step.busy == 35 %]
+    <strong>Waiting to receive outputs</strong>
+  [% ELSIF step.busy == 40 %]
+    <strong>Receiving outputs</strong>
+  [% ELSIF step.busy == 50 %]
+    <strong>Post-processing</strong>
+  [% ELSE %]
+    <strong>Unknown state</strong>
+  [% END;
+END;
+
+
 BLOCK renderStatus;
  IF build.finished;
    buildstatus = build.buildstatus;
@@ -476,7 +513,7 @@ BLOCK renderEvals %]
            ELSE %]
             -
            [% END %]
-            [% IF eval.evaluationerror.errormsg %]
+            [% IF eval.evaluationerror.has_error %]
              <span class="badge badge-warning">Eval Errors</span>
            [% END %]
          </td>
@@ -602,7 +639,7 @@ BLOCK renderJobsetOverview %]
        <td>[% HTML.escape(j.description) %]</td>
        <td>[% IF j.lastcheckedtime;
                 INCLUDE renderDateTime timestamp = j.lastcheckedtime;
-                 IF j.errormsg || j.fetcherrormsg; %]&nbsp;<span class = 'badge badge-warning'>Error</span>[% END;
+                 IF j.has_error || j.fetcherrormsg; %]&nbsp;<span class = 'badge badge-warning'>Error</span>[% END;
                 ELSE; "-";
               END %]</td>
        [% IF j.get_column('nrtotal') > 0 %]
--- a/src/root/eval-error.tt
+++ b/src/root/eval-error.tt
@@ -0,0 +1,26 @@
+[% PROCESS common.tt %]
+<!DOCTYPE html>
+
+<html lang="en">
+
+  <head>
+    <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
+    <meta http-equiv="X-UA-Compatible" content="IE=Edge" />
+    [% INCLUDE style.tt %]
+  </head>
+
+  <body>
+
+    <div class="tab-content tab-pane">
+        <div id="tabs-errors" class="">
+          [% IF jobset %]
+              <p>Errors occurred at [% INCLUDE renderDateTime timestamp=(jobset.errortime || jobset.lastcheckedtime) %].</p>
+              <div class="card bg-light"><div class="card-body"><pre>[% HTML.escape(jobset.fetcherrormsg || jobset.errormsg) %]</pre></div></div>
+          [% ELSIF eval %]
+              <p>Errors occurred at [% INCLUDE renderDateTime timestamp=(eval.evaluationerror.errortime || eval.timestamp) %].</p>
+              <div class="card bg-light"><div class="card-body"><pre>[% HTML.escape(eval.evaluationerror.errormsg) %]</pre></div></div>
+          [% END %]
+        </div>
+    </div>
+  </body>
+</html>
--- a/src/root/jobset-eval.tt
+++ b/src/root/jobset-eval.tt
@@ -65,7 +65,7 @@ c.uri_for(c.controller('JobsetEval').action_for('view'),
  [% END %]

  [% IF aborted.size > 0 %]
-    <li class="nav-item"><a class="nav-link" href="#tabs-aborted" data-toggle="tab"><span class="text-warning">Aborted Jobs ([% aborted.size %])</span></a></li>
+    <li class="nav-item"><a class="nav-link" href="#tabs-aborted" data-toggle="tab"><span class="text-warning">Aborted / Timed out Jobs ([% aborted.size %])</span></a></li>
  [% END %]
  [% IF nowFail.size > 0 %]
    <li class="nav-item"><a class="nav-link" href="#tabs-now-fail" data-toggle="tab"><span class="text-warning">Newly Failing Jobs ([% nowFail.size %])</span></a></li>
@@ -90,7 +90,7 @@ c.uri_for(c.controller('JobsetEval').action_for('view'),
  [% END %]
  <li class="nav-item"><a class="nav-link" href="#tabs-inputs" data-toggle="tab">Inputs</a></li>

-  [% IF eval.evaluationerror.errormsg %]
+  [% IF eval.evaluationerror.has_error %]
    <li class="nav-item"><a class="nav-link" href="#tabs-errors" data-toggle="tab"><span class="text-warning">Evaluation Errors</span></a></li>
  [% END %]
 </ul>
@@ -108,13 +108,6 @@ c.uri_for(c.controller('JobsetEval').action_for('view'),

 <div class="tab-content">

-  [% IF eval.evaluationerror.errormsg %]
-    <div id="tabs-errors" class="tab-pane">
-      <p>Errors occurred at [% INCLUDE renderDateTime timestamp=(eval.evaluationerror.errortime || eval.timestamp) %].</p>
-      <div class="card bg-light"><div class="card-body"><pre>[% HTML.escape(eval.evaluationerror.errormsg) %]</pre></div></div>
-    </div>
-  [% END %]
-
  <div id="tabs-aborted" class="tab-pane">
    [% INCLUDE renderSome builds=aborted tabname="#tabs-aborted" %]
  </div>
@@ -172,10 +165,9 @@ c.uri_for(c.controller('JobsetEval').action_for('view'),
    [% END %]
  </div>

-  [% IF eval.evaluationerror.errormsg %]
+  [% IF eval.evaluationerror.has_error %]
    <div id="tabs-errors" class="tab-pane">
-      <p>Errors occurred at [% INCLUDE renderDateTime timestamp=(eval.evaluationerror.errortime || eval.timestamp) %].</p>
-      <div class="card bg-light"><div class="card-body"><pre>[% HTML.escape(eval.evaluationerror.errormsg) %]</pre></div></div>
+      <iframe src="[% c.uri_for(c.controller('JobsetEval').action_for('errors'), [eval.id], params) %]" loading="lazy" frameBorder="0" width="100%"></iframe>
    </div>
  [% END %]
 </div>
--- a/src/root/jobset.tt
+++ b/src/root/jobset.tt
@@ -61,7 +61,7 @@
  [% END %]

  <li class="nav-item"><a class="nav-link active" href="#tabs-evaluations" data-toggle="tab">Evaluations</a></li>
-  [% IF jobset.errormsg || jobset.fetcherrormsg %]
+  [% IF jobset.has_error || jobset.fetcherrormsg %]
    <li class="nav-item"><a class="nav-link" href="#tabs-errors" data-toggle="tab"><span class="text-warning">Evaluation Errors</span></a></li>
  [% END %]
  <li class="nav-item"><a class="nav-link" href="#tabs-jobs" data-toggle="tab">Jobs</a></li>
@@ -79,7 +79,7 @@
        <th>Last checked:</th>
        <td>
          [% IF jobset.lastcheckedtime %]
-            [% INCLUDE renderDateTime timestamp = jobset.lastcheckedtime %], [% IF jobset.errormsg || jobset.fetcherrormsg %]<em class="text-warning">with errors!</em>[% ELSE %]<em>no errors</em>[% END %]
+            [% INCLUDE renderDateTime timestamp = jobset.lastcheckedtime %], [% IF jobset.has_error || jobset.fetcherrormsg %]<em class="text-warning">with errors!</em>[% ELSE %]<em>no errors</em>[% END %]
          [% ELSE %]
            <em>never</em>
          [% END %]
@@ -117,10 +117,9 @@

  </div>

-  [% IF jobset.errormsg || jobset.fetcherrormsg %]
+  [% IF jobset.has_error || jobset.fetcherrormsg %]
    <div id="tabs-errors" class="tab-pane">
-      <p>Errors occurred at [% INCLUDE renderDateTime timestamp=(jobset.errortime || jobset.lastcheckedtime) %].</p>
-      <div class="card bg-light"><div class="card-body"><pre>[% HTML.escape(jobset.fetcherrormsg || jobset.errormsg) %]</pre></div></div>
+      <iframe src="[% c.uri_for('/jobset' project.name jobset.name "errors") %]" loading="lazy" frameBorder="0" width="100%"></iframe>
    </div>
  [% END %]

--- a/src/root/layout.tt
+++ b/src/root/layout.tt
@@ -10,31 +10,7 @@

    <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
    <meta http-equiv="X-UA-Compatible" content="IE=Edge" />
-
-    <script type="text/javascript" src="[% c.uri_for("/static/js/jquery/jquery-3.4.1.min.js") %]"></script>
-    <script type="text/javascript" src="[% c.uri_for("/static/js/jquery/jquery-ui-1.10.4.min.js") %]"></script>
-    <script type="text/javascript" src="[% c.uri_for("/static/js/moment/moment-2.24.0.min.js") %]"></script>
-
-    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
-
-    <link href="[% c.uri_for("/static/fontawesome/css/all.css") %]" rel="stylesheet" />
-    <script type="text/javascript" src="[% c.uri_for("/static/js/popper.min.js") %]"></script>
-    <script type="text/javascript" src="[% c.uri_for("/static/bootstrap/js/bootstrap.min.js") %]"></script>
-    <link href="[% c.uri_for("/static/bootstrap/css/bootstrap.min.css") %]" rel="stylesheet" />
-
-    <!-- hydra.css may need to be moved to before boostrap to make the @media rule work. -->
-    <link rel="stylesheet" href="[% c.uri_for("/static/css/hydra.css") %]" type="text/css" />
-    <link rel="stylesheet" href="[% c.uri_for("/static/css/rotated-th.css") %]" type="text/css" />
-
-    <style>
-      .popover { max-width: 40%; }
-    </style>
-
-    <script type="text/javascript" src="[% c.uri_for("/static/js/bootbox.min.js") %]"></script>
-
-    <link rel="stylesheet" href="[% c.uri_for("/static/css/tree.css") %]" type="text/css" />
-
-    <script type="text/javascript" src="[% c.uri_for("/static/js/common.js") %]"></script>
+    [% INCLUDE style.tt %]

    [% IF c.config.enable_google_login %]
      <meta name="google-signin-client_id" content="[% c.config.google_client_id %]">
--- a/src/root/machine-status.tt
+++ b/src/root/machine-status.tt
@@ -6,10 +6,10 @@
  <thead>
    <tr>
      <th>Job</th>
-      <th>System</th>
      <th>Build</th>
      <th>Step</th>
      <th>What</th>
+      <th>Status</th>
      <th>Since</th>
    </tr>
  </thead>
@@ -40,10 +40,10 @@
          [% idle = 0 %]
          <tr>
            <td><tt>[% INCLUDE renderFullJobName project=step.project jobset=step.jobset job=step.job %]</tt></td>
-            <td><tt>[% step.system %]</tt></td>
            <td><a href="[% c.uri_for('/build' step.build) %]">[% step.build %]</a></td>
            <td>[% IF step.busy >= 30 %]<a class="row-link" href="[% c.uri_for('/build' step.build 'nixlog' step.stepnr 'tail') %]">[% step.stepnr %]</a>[% ELSE; step.stepnr; END %]</td>
            <td><tt>[% step.drvpath.match('-(.*)').0 %]</tt></td>
+            <td>[% INCLUDE renderBusyStatus %]</td>
            <td style="width: 10em">[% INCLUDE renderDuration duration = curTime - step.starttime %] </td>
          </tr>
        [% END %]
--- a/src/root/static/js/common.js
+++ b/src/root/static/js/common.js
@@ -129,6 +129,12 @@ $(document).ready(function() {
            el.addClass("is-local");
        }
    });
+
+    // For every iframe on the page: once its content window fires
+    // DOMContentLoaded, set the iframe's height to the scroll height of the
+    // framed document's body.
+    // NOTE(review): the listener is attached to the content window that
+    // exists when this handler runs; this presumably relies on that window
+    // surviving the (lazy) load of the same-origin frame -- confirm.
+    [...document.getElementsByTagName("iframe")].forEach((element) => {
+        element.contentWindow.addEventListener("DOMContentLoaded", (_) => {
+            element.style.height = element.contentWindow.document.body.scrollHeight + 'px';
+        })
+    })
 });

 var tabsLoaded = {};
--- a/src/root/status.tt
+++ b/src/root/status.tt
@@ -7,7 +7,7 @@

 [% ELSE %]

-  [% INCLUDE renderBuildList builds=resource showSchedulingInfo=1 hideResultInfo=1 busy=1 %]
+  [% INCLUDE renderBuildList builds=resource showSchedulingInfo=1 hideResultInfo=1 busy=1 showStepName=1 %]

 [% END %]

--- a/src/root/style.tt
+++ b/src/root/style.tt
@@ -0,0 +1,24 @@
+<script type="text/javascript" src="[% c.uri_for("/static/js/jquery/jquery-3.4.1.min.js") %]"></script>
+<script type="text/javascript" src="[% c.uri_for("/static/js/jquery/jquery-ui-1.10.4.min.js") %]"></script>
+<script type="text/javascript" src="[% c.uri_for("/static/js/moment/moment-2.24.0.min.js") %]"></script>
+
+<meta name="viewport" content="width=device-width, initial-scale=1.0" />
+
+<link href="[% c.uri_for("/static/fontawesome/css/all.css") %]" rel="stylesheet" />
+<script type="text/javascript" src="[% c.uri_for("/static/js/popper.min.js") %]"></script>
+<script type="text/javascript" src="[% c.uri_for("/static/bootstrap/js/bootstrap.min.js") %]"></script>
+<link href="[% c.uri_for("/static/bootstrap/css/bootstrap.min.css") %]" rel="stylesheet" />
+
+<!-- hydra.css may need to be moved to before bootstrap to make the @media rule work. -->
+<link rel="stylesheet" href="[% c.uri_for("/static/css/hydra.css") %]" type="text/css" />
+<link rel="stylesheet" href="[% c.uri_for("/static/css/rotated-th.css") %]" type="text/css" />
+
+<style>
+  .popover { max-width: 40%; }
+</style>
+
+<script type="text/javascript" src="[% c.uri_for("/static/js/bootbox.min.js") %]"></script>
+
+<link rel="stylesheet" href="[% c.uri_for("/static/css/tree.css") %]" type="text/css" />
+
+<script type="text/javascript" src="[% c.uri_for("/static/js/common.js") %]"></script>
--- a/t/Hydra/Controller/Jobset/evals.t
+++ b/t/Hydra/Controller/Jobset/evals.t
@@ -32,4 +32,9 @@ subtest "/jobset/PROJECT/JOBSET/evals" => sub {
    ok($jobsetevals->is_success, "The page showing the jobset evals returns 200.");
 };

+# The page listing a jobset's evaluation errors must be served successfully.
+subtest "/jobset/PROJECT/JOBSET/errors" => sub {
+    my $errorsUrl = '/jobset/' . $project->name . '/' . $jobset->name . '/errors';
+    my $response = request(GET($errorsUrl));
+    ok($response->is_success, "The page showing the jobset eval errors returns 200.");
+};
+
 done_testing;
--- a/t/Hydra/Controller/JobsetEval/fetch.t
+++ b/t/Hydra/Controller/JobsetEval/fetch.t
@@ -35,6 +35,10 @@ subtest "Fetching the eval's overview" => sub {
    is($fetch->code, 200, "channel page is 200");
 };

+# The eval's errors page must be served successfully.
+subtest "Fetching the eval's errors" => sub {
+    # Build the path by concatenation: with a comma, '/errors' would be handed
+    # to GET as an extra argument and the request would go to '/eval/ID'.
+    my $fetch = request(GET '/eval/' . $eval->id . '/errors');
+    is($fetch->code, 200, "errors page is 200");
+};


 done_testing;
--- a/t/lib/CliRunners.pm
+++ b/t/lib/CliRunners.pm
@@ -14,7 +14,7 @@ our @EXPORT = qw(
 sub evalSucceeds {
    my ($jobset) = @_;
    my ($res, $stdout, $stderr) = captureStdoutStderr(60, ("hydra-eval-jobset", $jobset->project->name, $jobset->name));
-    $jobset->discard_changes;  # refresh from DB
+    $jobset->discard_changes({ '+columns' => {'errormsg' => 'errormsg'} });  # refresh from DB
    if ($res) {
        chomp $stdout; chomp $stderr;
        utf8::decode($stdout) or die "Invalid unicode in stdout.";
@@ -29,7 +29,7 @@ sub evalSucceeds {
 sub evalFails {
    my ($jobset) = @_;
    my ($res, $stdout, $stderr) = captureStdoutStderr(60, ("hydra-eval-jobset", $jobset->project->name, $jobset->name));
-    $jobset->discard_changes;  # refresh from DB
+    $jobset->discard_changes({ '+columns' => {'errormsg' => 'errormsg'} });  # refresh from DB
    if (!$res) {
        chomp $stdout; chomp $stderr;
        utf8::decode($stdout) or die "Invalid unicode in stdout.";
--- a/t/meson.build
+++ b/t/meson.build
@@ -27,8 +27,6 @@ testenv.prepend('PERL5LIB',
  separator: ':'
 )
 testenv.prepend('PATH',
-  fs.parent(find_program('nix').full_path()),
-  fs.parent(hydra_build_step.full_path()),
  fs.parent(hydra_evaluator.full_path()),
  fs.parent(hydra_queue_runner.full_path()),
  meson.project_source_root() / 'src/script',
--- a/t/queue-runner/direct-indirect-constituents.t
+++ b/t/queue-runner/direct-indirect-constituents.t
@@ -13,7 +13,7 @@ my $constituentBuildA = $builds->{"constituentA"};
 my $constituentBuildB = $builds->{"constituentB"};

 my $eval = $constituentBuildA->jobsetevals->first();
-is($eval->evaluationerror->errormsg, "");
+is($eval->evaluationerror->has_error, 0);

 subtest "Verifying the direct aggregate" => sub {
    my $aggBuild = $builds->{"direct_aggregate"};
Author	SHA1	Message	Date
Pierre Bourdon	c3b6e7b425	queue runner: fix nullptr deref on build exception after releasing a machine reservation	2025-02-16 13:27:26 +01:00
K900	c60e7955bf	Add metric for builds waiting for download slot (cherry picked from commit f23ec71227911891807706b6b978836e4d80edde)	2025-02-12 10:35:17 +01:00
Maximilian Bosch	90399cb674	readIntoSocket: fix with store URIs containing an `&` The third argument to `open()` in `-\|` mode is passed to a shell if it's a string. In my case the store URI contains `?secret-key=${signingKey.directory}/secret&compression=zstd` For the `nix store cat` case this means that * until `&` the process will be started in the background. This fails immediately because no path to cat is specified. * `compression=zstd` is a variable assignment * the `$path` argument to `store cat` is attempted to be executed as another command Passing just the list solves the problem. (cherry picked from commit 3ee51dbe589458cc54ff753317bbc6db530bddc0)	2025-02-12 10:35:17 +01:00
git@71rd.net	d6a3ef484c	Stream files from store instead of buffering them When an artifact is requested from hydra the output is first copied from the nix store into memory and then sent as a response, delaying the download and taking up significant amounts of memory. As reported in https://github.com/NixOS/hydra/issues/1357 Instead of calling a command and blocking while reading in the entire output, this adds read_into_socket(). The function takes a command, starts a subprocess with that command, and returns a file descriptor attached to stdout. This file descriptor is then used by the response builder of Catalyst to stream the output directly (cherry picked from commit 459aa0a5983a0bd546399c08231468d6e9282f54)	2025-02-12 10:35:17 +01:00
Pierre Bourdon	685857df2e	web: replace 'errormsg' with 'errormsg IS NULL' in most cases This is implemented in an extremely hacky way due to poor DBIx feature support. Ideally, what we'd need is a way to tell DBIx to ignore the errormsg column unless explicitly requested, and to automatically add a computed 'errormsg IS NULL' column in others. Since it does not support that, this commit instead hacks some support via method overrides while taking care to not break anything obvious.	2025-02-12 10:35:17 +01:00
ajs124	f2b6e9d8ab	lazy-load evaluation errors Closes #1362	2025-02-12 10:35:17 +01:00
Maximilian Bosch	b04847335f	Only show stepname if it doesn't equal the name of the drv When building e.g. nixpkgs, the "Running builds" view will mostly look like this hello.x86_64-linux (Build of hello-X.Y) exa.x86_64-linux (Build of exa-X.Y) ... This doesn't provide any useful information. Showing the step name only makes sense if it's not a child of the job's derivation. With this patch, that information will only be shown if the drv name (i.e. w/o `/nix/store/` prefix, .drv ext & hash) is not equal to the drv name of the job itself (build.nixname).	2025-02-12 10:35:17 +01:00
Maximilian Bosch	7e61000172	Running builds view: show build step names When using Hydra to build machine configurations, you'll often see "nixosConfigurations.foo" five times, i.e. for each build step being run. This isn't very helpful I think because in such a case, a single build step can also be compiling the Linux kernel. This change also fetches the `drvpath` and `type` from the `buildsteps` relation. We're already joining it, so this doesn't make much difference (confirmed via query logging that this doesn't cause extra SQL queries). Unfortunately build steps don't have a human readable name, so I'm deriving it from the drvpath by stripping away the hash (assuming that it'll never contain a `-` and that `/nix/store/` is used as prefix). I decided against using the Nix bindings for that to avoid too much overhead due to store operations for each build step.	2025-02-12 10:35:17 +01:00
Maximilian Bosch	6d04e824d5	Make "timed out" and "log limit exceeded" builds aborted In `73694087a0` I gave builds that failed because of a timeout or exceeded log limit a stop sign and I stand by that reasoning: with that it's possible to distinguish between actual build failures and rather transient things such as timeouts. Back then I considered it a feature that these are shown in a different tab, but I don't think that's a good idea anymore. When using a jobset to e.g. track the regressions from a mass rebuild (like a compiler or gcc update), "Newly failed builds" should exclusively display regressions (and flaky builds of course, not much I can do about that). Also, when a bunch of builds fail in such a jobset because of e.g. a broken connection to a builder that results in a timeout, I want to be able to restart them all w/o rebuilding actual regressions. To make it clear that we not only have "Aborted" builds in the tab, I renamed the label to "Aborted / Timed out".	2025-02-12 10:35:16 +01:00
Pierre Bourdon	36e25d8fd2	queue-runner: release machine reservation while copying outputs This allows for better builder usage when the queue runner is busy. To avoid running into uncontrollable imbalances between builder/queue runner, we only release the machine reservation after the local throttler has found a slot to start copying the outputs for that build.	2025-02-12 10:35:16 +01:00
Pierre Bourdon	d4e273f7b1	queue-runner: switch to pseudorandom ordering of builds processing We don't rely on sequential / monotonic build IDs processing anymore, so randomizing actually has the advantage of mixing builds for different systems together, to avoid only one chunk of builds for a single system getting processed while builders for other systems are starved.	2025-02-12 10:35:16 +01:00
Pierre Bourdon	23366ec10d	queue runner: introduce some parallelism for remote paths lookup Each output for a given step being ingested is looked up in parallel, which should basically multiply the speed of builds ingestion by the average number of outputs per derivation.	2025-02-12 10:35:16 +01:00
Pierre Bourdon	98759e4ff9	queue-runner: reduce the time between queue monitor restarts This will induce more DB queries (though these are fairly cheap), but at the benefit of processing bumps within 1m instead of within 10m.	2025-02-12 10:35:16 +01:00
Pierre Bourdon	d850c99883	queue-runner: remove id > X from new builds query Running the query with/without it shows that it makes no difference to postgres, since there's an index on finished=0 already. This allows a few simplifications, but also paves the way towards running multiple parallel monitor threads in the future.	2025-02-12 10:35:16 +01:00
Pierre Bourdon	1b76eec4e8	queue-runner: add prom metrics to allow detecting internal bottlenecks By looking at the ratio of running vs. waiting for the dispatcher and the queue monitor, we should get better visibility into what hydra is currently bottlenecked on. There are other side effects we can try to measure to get to the same result, but having a simple way doesn't cost us much.	2025-02-12 10:35:15 +01:00
Pierre Bourdon	3bb1a61a7d	web: include current step status on /machines	2025-02-12 10:35:15 +01:00
Pierre Bourdon	3b9045f60d	queue-runner: limit parallelism of CPU intensive operations My current theory is that running more parallel xz than available CPU cores is reducing our overall throughput by requiring more scheduling overhead and more cache thrashing.	2025-02-12 10:35:15 +01:00