Compare commits

..

1 Commit

Author SHA1 Message Date
Pierre Bourdon
6fc7a6f7c8 queue-runner: add prom metrics to allow detecting internal bottlenecks
By looking at the ratio of running vs. waiting for the dispatcher and
the queue monitor, we should get better visibility into what hydra is
currently bottlenecked on.

There are other side effects we could try to measure to reach the same
conclusion, but having a simple, direct measurement doesn't cost us much.
2025-04-13 16:25:14 +02:00
13 changed files with 102 additions and 93 deletions

31
flake.lock generated
View File

@@ -1,13 +1,22 @@
{
"nodes": {
"nix": {
"flake": false,
"inputs": {
"flake-compat": [],
"flake-parts": [],
"git-hooks-nix": [],
"nixpkgs": [
"nixpkgs"
],
"nixpkgs-23-11": [],
"nixpkgs-regression": []
},
"locked": {
"lastModified": 1745420957,
"narHash": "sha256-ZbB3IH9OlJvo14GlQZbYHzJojf/HCDT38GzYTod8DaU=",
"lastModified": 1744030329,
"narHash": "sha256-r+psCOW77vTSTNbxTVrYHeh6OgB0QukbnyUVDwg8s4I=",
"owner": "NixOS",
"repo": "nix",
"rev": "70921714cb3b5e6041b7413459541838651079f3",
"rev": "a4962f73b5fc874d4b16baef47921daf349addfc",
"type": "github"
},
"original": {
@@ -20,11 +29,11 @@
"nix-eval-jobs": {
"flake": false,
"locked": {
"lastModified": 1744370057,
"narHash": "sha256-n220U5pjzCtTtOJtbga4Xr/PyllowKw9anSevgCqJEw=",
"lastModified": 1744018595,
"narHash": "sha256-v5n6t49X7MOpqS9j0FtI6TWOXvxuZMmGsp2OfUK5QfA=",
"owner": "nix-community",
"repo": "nix-eval-jobs",
"rev": "1260c6599d22dfd8c25fea6893c3d031996b20e1",
"rev": "cba718bafe5dc1607c2b6761ecf53c641a6f3b21",
"type": "github"
},
"original": {
@@ -35,16 +44,16 @@
},
"nixpkgs": {
"locked": {
"lastModified": 1748124805,
"narHash": "sha256-8A7HjmnvCpDjmETrZY1QwzKunR63LiP7lHu1eA5q6JI=",
"lastModified": 1743987495,
"narHash": "sha256-46T2vMZ4/AfCK0Y2OjlFzJPxmdpP8GtsuEqSSJv3oe4=",
"owner": "NixOS",
"repo": "nixpkgs",
"rev": "db1aed32009f408e4048c1dd0beaf714dd34ed93",
"rev": "db8f4fe18ce772a9c8f3adf321416981c8fe9371",
"type": "github"
},
"original": {
"owner": "NixOS",
"ref": "nixos-25.05-small",
"ref": "nixos-24.11-small",
"repo": "nixpkgs",
"type": "github"
}

View File

@@ -1,12 +1,18 @@
{
description = "A Nix-based continuous build system";
inputs.nixpkgs.url = "github:NixOS/nixpkgs/nixos-25.05-small";
inputs.nixpkgs.url = "github:NixOS/nixpkgs/nixos-24.11-small";
inputs.nix = {
url = "github:NixOS/nix/2.28-maintenance";
# We want to control the deps precisely
flake = false;
inputs.nixpkgs.follows = "nixpkgs";
# hide nix dev tooling from our lock file
inputs.flake-parts.follows = "";
inputs.git-hooks-nix.follows = "";
inputs.nixpkgs-regression.follows = "";
inputs.nixpkgs-23-11.follows = "";
inputs.flake-compat.follows = "";
};
inputs.nix-eval-jobs = {
@@ -24,27 +30,10 @@
# A Nixpkgs overlay that provides a 'hydra' package.
overlays.default = final: prev: {
nixDependenciesForHydra = final.lib.makeScope final.newScope
(import (nix + "/packaging/dependencies.nix") {
pkgs = final;
inherit (final) stdenv;
inputs = {};
});
nixComponentsForHydra = final.lib.makeScope final.nixDependenciesForHydra.newScope
(import (nix + "/packaging/components.nix") {
officialRelease = true;
inherit (final) lib;
pkgs = final;
src = nix;
maintainers = [ ];
});
nix-eval-jobs = final.callPackage nix-eval-jobs {
nixComponents = final.nixComponentsForHydra;
};
nix-eval-jobs = final.callPackage nix-eval-jobs {};
hydra = final.callPackage ./package.nix {
inherit (final.lib) fileset;
inherit (nixpkgs.lib) fileset;
rawSrc = self;
nixComponents = final.nixComponentsForHydra;
};
};
@@ -84,26 +73,24 @@
});
packages = forEachSystem (system: let
inherit (nixpkgs) lib;
pkgs = nixpkgs.legacyPackages.${system};
nixDependencies = lib.makeScope pkgs.newScope
(import (nix + "/packaging/dependencies.nix") {
inherit pkgs;
inherit (pkgs) stdenv;
inputs = {};
});
nixComponents = lib.makeScope nixDependencies.newScope
(import (nix + "/packaging/components.nix") {
officialRelease = true;
inherit lib pkgs;
src = nix;
maintainers = [ ];
});
nixComponents = {
inherit (nix.packages.${system})
nix-util
nix-store
nix-expr
nix-fetchers
nix-flake
nix-main
nix-cmd
nix-cli
nix-perl-bindings
;
};
in {
nix-eval-jobs = pkgs.callPackage nix-eval-jobs {
nix-eval-jobs = nixpkgs.legacyPackages.${system}.callPackage nix-eval-jobs {
inherit nixComponents;
};
hydra = pkgs.callPackage ./package.nix {
hydra = nixpkgs.legacyPackages.${system}.callPackage ./package.nix {
inherit (nixpkgs.lib) fileset;
inherit nixComponents;
inherit (self.packages.${system}) nix-eval-jobs;

View File

@@ -228,8 +228,8 @@ in
nix.settings = {
trusted-users = [ "hydra-queue-runner" ];
keep-outputs = true;
keep-derivations = true;
gc-keep-outputs = true;
gc-keep-derivations = true;
};
services.hydra-dev.extraConfig =

View File

@@ -277,8 +277,5 @@ stdenv.mkDerivation (finalAttrs: {
dontStrip = true;
meta.description = "Build of Hydra on ${stdenv.system}";
passthru = {
inherit perlDeps;
nix = nixComponents.nix-cli;
};
passthru = { inherit perlDeps; };
})

View File

@@ -134,8 +134,6 @@ system_time State::doDispatch()
comparator is a partial ordering (see MachineInfo). */
int highestGlobalPriority;
int highestLocalPriority;
size_t numRequiredSystemFeatures;
size_t numRevDeps;
BuildID lowestBuildID;
StepInfo(Step::ptr step, Step::State & step_) : step(step)
@@ -144,8 +142,6 @@ system_time State::doDispatch()
lowestShareUsed = std::min(lowestShareUsed, jobset->shareUsed());
highestGlobalPriority = step_.highestGlobalPriority;
highestLocalPriority = step_.highestLocalPriority;
numRequiredSystemFeatures = step->requiredSystemFeatures.size();
numRevDeps = step_.rdeps.size();
lowestBuildID = step_.lowestBuildID;
}
};
@@ -198,8 +194,6 @@ system_time State::doDispatch()
a.highestGlobalPriority != b.highestGlobalPriority ? a.highestGlobalPriority > b.highestGlobalPriority :
a.lowestShareUsed != b.lowestShareUsed ? a.lowestShareUsed < b.lowestShareUsed :
a.highestLocalPriority != b.highestLocalPriority ? a.highestLocalPriority > b.highestLocalPriority :
a.numRequiredSystemFeatures != b.numRequiredSystemFeatures ? a.numRequiredSystemFeatures > b.numRequiredSystemFeatures :
a.numRevDeps != b.numRevDeps ? a.numRevDeps > b.numRevDeps :
a.lowestBuildID < b.lowestBuildID;
});

View File

@@ -98,6 +98,34 @@ State::PromMetrics::PromMetrics()
.Register(*registry)
.Add({})
)
, dispatcher_time_spent_running(
prometheus::BuildCounter()
.Name("hydraqueuerunner_dispatcher_time_spent_running")
.Help("Time (in micros) spent running the dispatcher")
.Register(*registry)
.Add({})
)
, dispatcher_time_spent_waiting(
prometheus::BuildCounter()
.Name("hydraqueuerunner_dispatcher_time_spent_waiting")
.Help("Time (in micros) spent waiting for the dispatcher to obtain work")
.Register(*registry)
.Add({})
)
, queue_monitor_time_spent_running(
prometheus::BuildCounter()
.Name("hydraqueuerunner_queue_monitor_time_spent_running")
.Help("Time (in micros) spent running the queue monitor")
.Register(*registry)
.Add({})
)
, queue_monitor_time_spent_waiting(
prometheus::BuildCounter()
.Name("hydraqueuerunner_queue_monitor_time_spent_waiting")
.Help("Time (in micros) spent waiting for the queue monitor to obtain work")
.Register(*registry)
.Add({})
)
{
}

View File

@@ -464,6 +464,12 @@ private:
prometheus::Counter& queue_monitor_time_spent_running;
prometheus::Counter& queue_monitor_time_spent_waiting;
prometheus::Counter& dispatcher_time_spent_running;
prometheus::Counter& dispatcher_time_spent_waiting;
prometheus::Counter& queue_monitor_time_spent_running;
prometheus::Counter& queue_monitor_time_spent_waiting;
PromMetrics();
};
PromMetrics prom;

View File

@@ -76,9 +76,7 @@ sub view_GET {
$c->stash->{removed} = $diff->{removed};
$c->stash->{unfinished} = $diff->{unfinished};
$c->stash->{aborted} = $diff->{aborted};
$c->stash->{totalAborted} = $diff->{totalAborted};
$c->stash->{totalFailed} = $diff->{totalFailed};
$c->stash->{totalQueued} = $diff->{totalQueued};
$c->stash->{failed} = $diff->{failed};
$c->stash->{full} = ($c->req->params->{full} || "0") eq "1";

View File

@@ -32,12 +32,7 @@ sub buildDiff {
removed => [],
unfinished => [],
aborted => [],
# These summary counters cut across the categories to determine whether
# actions such as "Restart all failed" or "Bump queue" are available.
totalAborted => 0,
totalFailed => 0,
totalQueued => 0,
failed => [],
};
my $n = 0;
@@ -85,15 +80,8 @@ sub buildDiff {
} else {
push @{$ret->{new}}, $build if !$found;
}
if ($build->finished != 0 && $build->buildstatus != 0) {
if ($aborted) {
++$ret->{totalAborted};
} else {
++$ret->{totalFailed};
}
} elsif ($build->finished == 0) {
++$ret->{totalQueued};
if (defined $build->buildstatus && $build->buildstatus != 0) {
push @{$ret->{failed}}, $build;
}
}

View File

@@ -48,16 +48,16 @@ c.uri_for(c.controller('JobsetEval').action_for('view'),
<a class="nav-link dropdown-toggle" data-toggle="dropdown" href="#">Actions</a>
<div class="dropdown-menu">
<a class="dropdown-item" href="[% c.uri_for(c.controller('JobsetEval').action_for('create_jobset'), [eval.id]) %]">Create a jobset from this evaluation</a>
[% IF totalQueued > 0 %]
[% IF unfinished.size > 0 %]
<a class="dropdown-item" href="[% c.uri_for(c.controller('JobsetEval').action_for('cancel'), [eval.id]) %]">Cancel all scheduled builds</a>
[% END %]
[% IF totalFailed > 0 %]
[% IF aborted.size > 0 || stillFail.size > 0 || nowFail.size > 0 || failed.size > 0 %]
<a class="dropdown-item" href="[% c.uri_for(c.controller('JobsetEval').action_for('restart_failed'), [eval.id]) %]">Restart all failed builds</a>
[% END %]
[% IF totalAborted > 0 %]
[% IF aborted.size > 0 %]
<a class="dropdown-item" href="[% c.uri_for(c.controller('JobsetEval').action_for('restart_aborted'), [eval.id]) %]">Restart all aborted builds</a>
[% END %]
[% IF totalQueued > 0 %]
[% IF unfinished.size > 0 %]
<a class="dropdown-item" href="[% c.uri_for(c.controller('JobsetEval').action_for('bump'), [eval.id]) %]">Bump builds to front of queue</a>
[% END %]
</div>

View File

@@ -17,7 +17,7 @@
[% name = m.key ? stripSSHUser(m.key) : "localhost" %]
<thead>
<tr>
<th colspan="7">
<th colspan="6">
<tt [% IF m.value.disabled %]style="text-decoration: line-through;"[% END %]>[% INCLUDE renderMachineName machine=m.key %]</tt>
[% IF m.value.systemTypes %]
<span class="muted" style="font-weight: normal;">

View File

@@ -372,7 +372,6 @@ sub evalJobs {
or die "cannot find the input containing the job expression\n";
@cmd = ("nix-eval-jobs",
"--option", "restrict-eval", "true",
"<" . $nixExprInputName . "/" . $nixExprPath . ">",
inputsToArgs($inputInfo));
}

View File

@@ -25,10 +25,7 @@ subtest "empty diff" => sub {
removed => [],
unfinished => [],
aborted => [],
totalAborted => 0,
totalFailed => 0,
totalQueued => 0,
failed => [],
},
"empty list of jobs returns empty diff"
);
@@ -51,7 +48,12 @@ subtest "2 different jobs" => sub {
"succeed_with_failed is a new job"
);
is($ret->{totalFailed}, 1, "total failed jobs is 1");
is(scalar(@{$ret->{failed}}), 1, "list of failed jobs is 1 element long");
is(
$ret->{failed}[0]->get_column('id'),
$builds->{"succeed_with_failed"}->get_column('id'),
"succeed_with_failed is a failed job"
);
is(
$ret->{removed},
@@ -68,9 +70,9 @@ subtest "2 different jobs" => sub {
subtest "failed job with no previous history" => sub {
my $ret = buildDiff([$builds->{"fails"}], []);
is($ret->{totalFailed}, 1, "total failed jobs is 1");
is(scalar(@{$ret->{failed}}), 1, "list of failed jobs is 1 element long");
is(
$ret->{new}[0]->get_column('id'),
$ret->{failed}[0]->get_column('id'),
$builds->{"fails"}->get_column('id'),
"fails is a failed job"
);
@@ -91,6 +93,7 @@ subtest "not-yet-built job with no previous history" => sub {
is($ret->{removed}, [], "removed");
is($ret->{unfinished}, [], "unfinished");
is($ret->{aborted}, [], "aborted");
is($ret->{failed}, [], "failed");
is(scalar(@{$ret->{new}}), 1, "list of new jobs is 1 element long");
is(