9 This is the implementation of
Meadow for a Swarm of Docker Engines
13 Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
14 Copyright [2016-2023] EMBL-European Bioinformatics Institute
16 Licensed under the Apache License,
Version 2.0 (the
"License"); you may not use
this file except in compliance with the License.
17 You may obtain a copy of the License at
21 Unless required by applicable law or agreed to in writing, software distributed under the License
22 is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
23 See the License
for the specific language governing permissions and limitations under the License.
27 Please subscribe to the
Hive mailing list: http:
32 package Bio::EnsEMBL::Hive::Meadow::DockerSwarm;
39 use base (
'Bio::EnsEMBL::Hive::Meadow',
'Bio::EnsEMBL::Hive::Utils::RESTclient');
42 our $VERSION =
'5.2'; # Semantic version of the
Meadow interface:
43 # change the Major version whenever an incompatible change is introduced,
44 # change the Minor version whenever the interface is extended, but compatibility is retained.
# Builds the base URL of the REST API (fixed to the "v1.30" path) from the
# DOCKER_MASTER_ADDR environment variable.
#
# Returns the URL string when the variable holds a true value. Otherwise the
# false value of the variable (undef, '' or '0') is handed back unchanged, so
# callers can simply test the result for truth.
sub construct_base_url {
    my $manager_addr = $ENV{'DOCKER_MASTER_ADDR'};

    # Nothing usable to build a URL from: pass the false value through as-is
    return $manager_addr unless $manager_addr;

    return "http://$manager_addr/v1.30";
}
55 my $self = $class->SUPER::new( @_ ); # First construct a
Meadow
56 $self->base_url( $class->construct_base_url
57 $self->{_DOCKER_MASTER_ADDR} = $ENV{
'DOCKER_MASTER_ADDR'}; # saves the location of the manager node
63 sub name { # also called to check
for availability
68 # Object instances have defined the base URL in the parent class
69 $url = $self->construct_base_url;
70 return undef unless $url;
74 my $swarm_attribs = $self->GET( $url ) || {};
76 return $swarm_attribs->{
'ID'};
# Finds the entry of the '/tasks' listing that corresponds to the container
# this process runs in, and caches it in $self->{_task_attribs}.
#
# Returns:
#   - the cached / freshly found task structure (a hashref),
#   - nothing (empty list / undef) when we do not appear to run in a Docker
#     container, i.e. /proc/self/cgroup cannot be read or has no
#     ':/docker/<id>' line,
#   - undef when no listed task reports our container ID.
sub _get_our_task_attribs {
    my ($self) = @_;

    return $self->{_task_attribs} if $self->{_task_attribs};

    # Get the container ID. Although in simple cases, the hostname is the same as
    # the container ID, it is not always true. So we need to dig into cgroup stuff
    #
    # Sample listing of the Swarm nodes:
    # ID HOSTNAME STATUS AVAILABILITY MANAGER STATUS ENGINE VERSION
    # lcprncbmd0z1523t0ft8ej9uy * head-node Ready Active Leader 18.09.0
    # ksactwapa4nxaaokcj1xw62pr worker-1 Ready Active 18.09.0
    # wcms6zxgq0hocoutznggs9r0u worker-2 Ready Active 18.09.0
    # ior43tdjz9x7n4bzzmr5njvcr worker-3 Ready Active 18.09.0
    # l6khe63f71z3ntv4abii3n9o1 worker-4 Ready Active 18.09.0
    # nvmy341e4a3sqtt9k3cfdmc7w worker-5 Ready Active 18.09.0
    # w42pyk5wvoa0qrzbnjhn1yyt7 worker-6 Ready Active 18.09.0
    # 28h1sk9zwkw53bkv4bi95q2f1 worker-7 Ready Active 18.09.0
    # rldg2wm4h19oo9cxxrdbpx5n4 worker-8 Ready Active 18.09.0
    # 72cv6frnei4gjdv3p3l8bmd3c worker-9 Ready Active 18.09.0
    # u21fny9eapmh09sflk45zzscz worker-10 Ready Active 18.09.0
    #
    # # cat /proc/self/cgroup
    #13:name=systemd:/docker/c8ecf8b2f3f2a26543971b57fd37205164a19908871d7bd43405914fcd054bfd
    #12:pids:/docker/c8ecf8b2f3f2a26543971b57fd37205164a19908871d7bd43405914fcd054bfd
    #11:hugetlb:/docker/c8ecf8b2f3f2a26543971b57fd37205164a19908871d7bd43405914fcd054bfd
    #10:net_prio:/docker/c8ecf8b2f3f2a26543971b57fd37205164a19908871d7bd43405914fcd054bfd
    #9:perf_event:/docker/c8ecf8b2f3f2a26543971b57fd37205164a19908871d7bd43405914fcd054bfd
    #8:net_cls:/docker/c8ecf8b2f3f2a26543971b57fd37205164a19908871d7bd43405914fcd054bfd
    #7:freezer:/docker/c8ecf8b2f3f2a26543971b57fd37205164a19908871d7bd43405914fcd054bfd
    #6:devices:/docker/c8ecf8b2f3f2a26543971b57fd37205164a19908871d7bd43405914fcd054bfd
    #5:memory:/docker/c8ecf8b2f3f2a26543971b57fd37205164a19908871d7bd43405914fcd054bfd
    #4:blkio:/docker/c8ecf8b2f3f2a26543971b57fd37205164a19908871d7bd43405914fcd054bfd
    #3:cpuacct:/docker/c8ecf8b2f3f2a26543971b57fd37205164a19908871d7bd43405914fcd054bfd
    #2:cpu:/docker/c8ecf8b2f3f2a26543971b57fd37205164a19908871d7bd43405914fcd054bfd
    #1:cpuset:/docker/c8ecf8b2f3f2a26543971b57fd37205164a19908871d7bd43405914fcd054bfd

    # If the cgroup file cannot be opened, treat it the same way as
    # "no docker line found" instead of reading from an unopened handle.
    open(my $fh, '<', '/proc/self/cgroup') or return;

    my $container_prefix;
    while (my $cgroup_line = <$fh>) {
        if ($cgroup_line =~ m{:/docker/(.*)$}) {
            $container_prefix = $1;
            last;   # every line carries the same container ID, the first one is enough
        }
    }
    close($fh);

    # Not running in a container
    return unless $container_prefix;

    # GET may yield a false value on failure; fall back to an empty list then
    my $tasks_list = $self->GET( '/tasks' ) || [];

    # \Q..\E: the ID comes from a file and must be matched literally
    my ($our_task_attribs) = grep { ($_->{'Status'}{'ContainerStatus'}{'ContainerID'} || '') =~ /^\Q$container_prefix\E/ } @$tasks_list;
    $self->{_task_attribs} = $our_task_attribs;

    return $self->{_task_attribs};
}
136 sub get_current_hostname {
139 my $nodes_list = $self->GET(
'/nodes' );
140 my %node_id_2_ip =
map { ($_->{
'ID'} => $_->{
'Status'}{
'Addr'}) } @$nodes_list;
141 my $our_node_ip = $node_id_2_ip{ $self->_get_our_task_attribs()->{
'NodeID'} };
147 sub get_current_worker_process_id {
150 my $our_task_id = $self->_get_our_task_attribs()->{
'ID'};
156 sub deregister_local_process {
158 # so that the LOCAL child processes don't think they belong to the DockerSwarm meadow
159 delete $ENV{
'DOCKER_MASTER_ADDR'};
163 sub status_of_all_our_workers { # returns an arrayref
166 # my $service_tasks_struct = $self->GET( '/tasks?filters={"name":["' . $service_name . '"]}' );
167 my $service_tasks_struct = $self->GET(
'/tasks' );
169 my @status_list = ();
170 foreach my $task_entry (@$service_tasks_struct) {
171 my $slot = $task_entry->{
'Slot'}; # an index within the given service
172 my $task_id = $task_entry->{
'ID'};
173 my $prestatus = lc $task_entry->{
'Status'}{
'State'};
175 # Some statuses are explained at https://docs.docker.com/datacenter/ucp/2.2/guides/admin/monitor-and-troubleshoot/troubleshoot-task-state/
179 'assigned' =>
'PEND',
180 'accepted' =>
'PEND',
181 'preparing' =>
'RUN',
184 'complete' =>
'DONE',
185 'shutdown' =>
'DONE',
187 'rejected' =>
'EXIT',
188 'orphaned' =>
'EXIT',
189 }->{$prestatus} || $prestatus;
191 push @status_list, [ $task_id,
'docker_user', $status ];
194 return \@status_list;
198 #sub check_worker_is_alive_and_mine {
199 # my ($self, $worker) = @_;
201 # my $wpid = $worker->process_id();
202 # my $is_alive_and_mine = kill 0, $wpid;
204 # return $is_alive_and_mine;
209 # my ($self, $worker, $fast) = @_;
211 # system('kill', '-9', $worker->process_id());
# Forces the CPU and memory figures of a 'Resources' structure to be numeric,
# modifying the structure in place.
#
# In Perl, large numbers may be held as strings after stringification, and
# would then be emitted as strings in JSON too. Adding 0 makes Perl store
# them as numbers. Example of a structure before the conversion:
#
#   {
#       'Reservations' => {
#           'NanoCPUs'    => 1000000000,
#           'MemoryBytes' => '34359738368'
#       },
#       'Limits' => {
#           'NanoCPUs'    => 1000000000,
#           'MemoryBytes' => '34359738368'
#       }
#   }
#
# Only sections and fields that already exist are touched; nothing is added.
# The return value is not meaningful.
sub type_resources_as_numeric {
    my $resources = shift;

    foreach my $section ('Reservations', 'Limits') {
        next unless exists $resources->{$section};

        foreach my $quantity ('NanoCPUs', 'MemoryBytes') {
            next unless exists $resources->{$section}->{$quantity};

            # Numeric context turns a numeric string into a real number
            $resources->{$section}->{$quantity} += 0;
        }
    }
}
244 sub submit_workers_return_meadow_pids {
245 my ($self, $worker_cmd, $required_worker_count, $iteration, $rc_name, $rc_specific_submission_cmd_args, $submit_log_subdir) = @_;
247 my $worker_cmd_components = [ split_for_bash($worker_cmd) ];
249 my $job_array_common_name = $self->job_array_common_name($rc_name, $iteration);
251 # Name collision detection
252 my $extra_suffix = 0;
253 my $service_name = $job_array_common_name;
254 while (scalar(@{ $self->GET(
'/tasks?filters={"name":["' . $service_name .
'"]}' ) })) {
256 $service_name =
"$job_array_common_name-$extra_suffix";
259 warn
"'$job_array_common_name' already used to name a service. Using '$service_name' instead.\n";
260 $job_array_common_name = $service_name;
263 die
"The image name for the ".$self->name.
" DockerSwarm meadow is not configured. Cannot submit jobs !" unless $self->config_get(
'ImageName');
265 # If the resource description is missing, use 1 core
266 my $default_resources = {
268 'NanoCPUs' => 1_000_000_000,
271 my $resources = destringify($rc_specific_submission_cmd_args);
273 my $service_create_data = {
274 'Name' => $job_array_common_name, # NB: service names in DockerSwarm have to be unique!
277 'Image' => $self->config_get(
'ImageName'),
278 'Args' => $worker_cmd_components,
279 'Mounts' => $self->config_get(
'Mounts'),
281 # Propagate these to the workers
282 "DOCKER_MASTER_ADDR=$self->{'_DOCKER_MASTER_ADDR'}",
283 "_EHIVE_HIDDEN_PASS=$ENV{'_EHIVE_HIDDEN_PASS'}",
286 # NOTE: By default, docker always keeps logs. Should we disable them here
287 # unless $submit_log_subdir has been set? There are no options to redirect
288 # the logs, so the option's value would be ignored.
292 'Resources' => $resources || $default_resources,
294 'Condition' =>
'none',
299 'Replicas' => int($required_worker_count),
303 type_resources_as_numeric($service_create_data->{
'TaskTemplate'}->{
'Resources'});
305 my $service_created_struct = $self->POST(
'/services/create', $service_create_data );
306 unless (exists $service_created_struct->{
'ID'}) {
307 die
"Submission unsuccessful: " . ($service_created_struct->{
'message'}
310 # Give some time to the Docker daemon to process the request
313 my $service_id = $service_created_struct->{
'ID'};
314 my $service_tasks_list = $self->GET( qq{/tasks?filters={
"service":[
"$service_id"]}} );
315 if (scalar(@$service_tasks_list) !=
int($required_worker_count)) {
316 die
"Submission unsuccessful: found " . scalar(@$service_tasks_list) .
" tasks instead of " . int($required_worker_count) .
"\n";
319 my @children_task_ids =
map { $_->{
'ID'} } @$service_tasks_list;
321 return \@children_task_ids;
325 sub run_on_host { # Overrides Meadow::run_on_host ; not supported yet - it
's just a placeholder to block the base class' functionality
326 my ($self, $meadow_host, $meadow_user, $command) = @_;