#!/usr/bin/env expect
############################################################################
# Purpose: Establish global state information for Slurm federation tests
#
# To define site-specific state information, set the values in a file
# named 'globals.local'. Those values will override any specified here.
# for example:
#
# $ cat globals.local
# set slurm_dir "/usr/local"
# set mpicc     "/usr/local/bin/mpicc"
#
############################################################################
# Copyright (C) SchedMD LLC.

# This file is part of Slurm, a resource management program.
# For details, see <https://slurm.schedmd.com/>.
# Please also read the supplied file: DISCLAIMER.
#
# Slurm is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 2 of the License, or (at your option)
# any later version.
#
# Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along
# with SLURM; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301  USA.
############################################################################

source ./globals

# Federation test settings. They are left empty here; set them (for example
# in 'globals.local', as described in the header of this file) when testing
# federations:
#   fed_slurm_base - prefix under which each cluster's binaries are looked up
#                    (check_cluster_up runs <fed_slurm_base>/<cluster>/bin/scontrol)
#   fedc1..fedc3   - names of the three clusters used by the federation tests
# NOTE(review): cset is not defined in this file -- presumably it comes from
# ./globals and only assigns a value when none has been set yet; confirm.
cset fed_slurm_base 	""
cset fedc1 		""
cset fedc2 		""
cset fedc3 		""

#
# Report whether every federation test setting has been filled in.
# RET: true if fed_slurm_base, fedc1, fedc2 and fedc3 are all non-empty,
#      false as soon as one of them is the empty string.
#
proc check_federation_setup { } {
	global fed_slurm_base fedc1 fedc2 fedc3

	if {$fed_slurm_base ne "" && $fedc1 ne "" && $fedc2 ne "" && $fedc3 ne ""} {
		return true
	}
	return false
}


#
# Create a federation with sacctmgr and move fedc1, fedc2 and fedc3 into it,
# clearing each cluster's features.
# IN:  fed_name - name of the federation to create.
# RET: RETURN_SUCCESS, or RETURN_ERROR when sacctmgr's output does not
#      contain the expected confirmation lines. Calls fail if sacctmgr
#      stops responding.
#
proc setup_federation { fed_name } {
	global sacctmgr fedc1 fedc2 fedc3 eol

	set rc $::RETURN_SUCCESS

	# Step 1: create the federation. Two lines are expected back: the
	# "Adding Federation(s)" banner and the federation name itself.
	set hits 0
	spawn $sacctmgr -i add federation $fed_name
	expect {
		-re "Adding Federation\\(s\\)$eol" {
			incr hits
			exp_continue
		}
		-re "$fed_name$eol" {
			incr hits
			exp_continue
		}
		timeout {
			fail "sacctmgr add not responding"
		}
		eof {
			wait
		}
	}
	if {!$rc && $hits != 2} {
		log_error "Failed to create federation"
		return $::RETURN_ERROR
	}

	# Step 2: attach each cluster in turn. Five confirmation lines are
	# expected per cluster.
	set position 0
	foreach cluster [list $fedc1 $fedc2 $fedc3] {
		incr position
		set hits 0
		spawn $sacctmgr -i mod cluster $cluster set federation=$fed_name features=
		expect {
			-re "Setting$eol" {
				incr hits
				exp_continue
			}
			-re "^\\s+Feature\\s+=\\s+$eol" {
				incr hits
				exp_continue
			}
			-re "^\\s+Federation\\s+=\\s+$fed_name$eol" {
				incr hits
				exp_continue
			}
			-re "Modified cluster...$eol" {
				incr hits
				exp_continue
			}
			-re "^\\s+$cluster$eol" {
				incr hits
				exp_continue
			}
			timeout {
				fail "sacctmgr add not responding"
			}
			eof {
				wait
			}
		}
		if {!$rc && $hits != 5} {
			log_error "Failed to add $cluster to federation"
			set rc $::RETURN_ERROR
			break
		}

		# Pause after every cluster except the first.
		# NOTE(review): presumably gives the federation time to sync
		# between additions -- confirm.
		if {$position > 1} {
			sleep 5
		}
	}
	return $rc
}


#
# Check that a cluster's controller answers "scontrol show config".
# IN:  cname - cluster name; its scontrol is looked up at
#              <fed_slurm_base>/<cname>/bin/scontrol.
# RET: true if the configuration banner was seen, false otherwise.
#
proc check_cluster_up { cname } {
	global fed_slurm_base

	# expect picks up this local "timeout" (seconds) for the wait below
	set timeout 2
	set rc true
	set matches 0
	set my_scontrol "${fed_slurm_base}/$cname/bin/scontrol"

	# Silence the spawned command's output while probing
	log_user 0
	set my_pid [spawn $my_scontrol show config]
	expect {
		"Configuration data as of" {
			incr matches
		}
		timeout {
			log_warn "$cname not responding"
			slow_kill $my_pid
			set rc false
		}
		eof {
			wait
		}
	}
	if {$matches != 1} {
		log_error "$cname not responding"
		set rc false
	}
	log_user 1

	return $rc
}


#
# Check that fedc1, fedc2 and fedc3 all respond, probing them in that order
# and stopping at the first cluster that is down.
# RET: true if every cluster is up, false (after a warning) otherwise.
#
proc check_federation_up {} {
	global fedc1 fedc2 fedc3

	set rc true
	foreach cluster [list $fedc1 $fedc2 $fedc3] {
		if {[check_cluster_up $cluster]} {
			continue
		}
		log_warn "This test can't be run if any clusters--$fedc1,\
			$fedc2, or $fedc3--are down"
		set rc false
		break
	}

	return $rc
}


#
# Delete one or more federations with "sacctmgr -i delete federation".
# IN:  names - federation name(s), passed to sacctmgr as a single argument.
# RET: RETURN_SUCCESS, or RETURN_ERROR if sacctmgr printed one of the
#      recognized error messages. An unexpected number of confirmation
#      lines is only logged; it does not change the return value.
#
proc delete_federations { names } {
	global sacctmgr

	set rc $::RETURN_SUCCESS
	set entity "federation"
	set matches 0

	spawn $sacctmgr -i delete $entity $names
	expect {
		-re "privilege to perform this action" {
			set rc $::RETURN_ERROR
			log_error "Don't have privileges"
		}
		-re "(There was a problem|Unknown condition|Bad format on|Bad MaxWall|Unknown option)" {
			set rc $::RETURN_ERROR
			log_error "There was a problem with the sacctmgr command"
		}
		-re "Problem getting" {
			set rc $::RETURN_ERROR
			log_error "There was a problem getting information from the database"
		}
		-re "Problem adding" {
			set rc $::RETURN_ERROR
			log_error "There was an unknown problem"
		}
		-re "No associations" {
			set rc $::RETURN_ERROR
			log_error "Your command didn't return anything"
		}
		-re "Deleting $entity" {
			incr matches
			exp_continue
		}
		-re " Nothing deleted" {
			incr matches
			exp_continue
		}
		timeout {
			fail "sacctmgr delete not responding"
		}
		eof {
			wait
		}
	}

	# Exactly one confirmation line is expected when no error was reported
	if {!$rc && $matches != 1} {
		log_error "sacctmgr had a problem deleting $entity. Got $matches"
	}

	return $rc
}


#
# Collect per-cluster details for every cluster in a federation, parsed
# from "sacctmgr show cluster federation=<fed_name>".
# IN:  fed_name - name of the federation to query.
# RET: flat "name value ..." list (as produced by array get) mapping each
#      cluster name to a dict with keys id, host, port, features and state.
#      Calls fail if fewer than two lines (header plus cluster rows) matched
#      or if sacctmgr stops responding.
#
proc get_clusterfed_info { fed_name } {
	global sacctmgr eol

	array set clusters {}
	set matches 0

	spawn $sacctmgr show cluster federation=$fed_name \
		    format="cluster%20,federation%20,id,controlhost,controlport,features,fedstate"
	expect {
		-re "Cluster\\s+Federation\\s+ID\\s+ControlHost\\s+ControlPort\\s+Features\\s+FedState $eol" {
			# Column header line
			incr matches
			exp_continue
		}
		-re "\\s+(\\S+)\\s+$fed_name\\s+(\\d+)\\s+(\\S+)\\s+(\\d+)\\s+(\\S*)\\s+(\\S*) $eol" {
			# One row per cluster; the first capture is the cluster name
			set cluster_name $expect_out(1,string)
			set details [dict create]
			dict set details id       $expect_out(2,string)
			dict set details host     $expect_out(3,string)
			dict set details port     $expect_out(4,string)
			dict set details features $expect_out(5,string)
			dict set details state    $expect_out(6,string)
			set clusters($cluster_name) $details
			incr matches
			exp_continue
		}
		timeout {
			fail "sacctmgr add not responding"
		}
		eof {
			wait
		}
	}
	if {$matches < 2} {
		fail "Didn't match enough clusters for federation ($fed_name) ($matches < 2)"
	}

	return [array get clusters]
}


#
# Add a single cluster to the given federation.
# IN:  cname    - name of cluster to add to federation.
# IN:  fed_name - name of federation to add cluster to.
# RET: RETURN_SUCCESS, or RETURN_ERROR if sacctmgr did not print all four
#      expected confirmation lines. Calls fail if sacctmgr stops responding.
#
proc add_cluster_to_fed {cname fed_name} {
	global sacctmgr eol

	set rc $::RETURN_SUCCESS
	set matches 0
	spawn $sacctmgr -i modify federation $fed_name set clusters+=$cname
	expect {
		-re "Setting$eol" {
			incr matches
			exp_continue
		}
		-re "Cluster\\s+ \\+= $cname$eol" {
			incr matches
			exp_continue
		}
		-re "^\\s+Modified federation...$eol" {
			incr matches
			exp_continue
		}
		-re "\\s+$fed_name$eol" {
			incr matches
			exp_continue
		}
		timeout {
			fail "sacctmgr add not responding"
		}
		eof {
			wait
		}
	}
	if {$rc || $matches != 4} {
		log_error "Failed to add $cname to $fed_name ($matches != 4)"
		# Assign to rc itself: "set $rc ..." would create a variable
		# named after rc's current value and leave rc untouched, so
		# the failure would never reach the caller.
		set rc $::RETURN_ERROR
	}

	return $rc
}


#
# Remove a single cluster from the given federation.
# IN:  cname    - name of cluster to remove from the federation.
# IN:  fed_name - name of federation to remove cluster from.
# RET: RETURN_SUCCESS, or RETURN_ERROR if sacctmgr did not print all four
#      expected confirmation lines. Calls fail if sacctmgr stops responding.
#
proc remove_cluster_from_fed {cname fed_name} {
	global sacctmgr eol

	set rc $::RETURN_SUCCESS
	set matches 0
	spawn $sacctmgr -i modify federation $fed_name set clusters-=$cname
	expect {
		-re "Setting$eol" {
			incr matches
			exp_continue
		}
		-re "Cluster\\s+ -= $cname$eol" {
			incr matches
			exp_continue
		}
		-re "^\\s+Modified federation...$eol" {
			incr matches
			exp_continue
		}
		-re "\\s+$fed_name$eol" {
			incr matches
			exp_continue
		}
		timeout {
			fail "sacctmgr add not responding"
		}
		eof {
			wait
		}
	}
	if {$rc || $matches != 4} {
		log_error "Failed to remove $cname from $fed_name"
		# Assign to rc itself: "set $rc ..." would create a variable
		# named after rc's current value and leave rc untouched, so
		# the failure would never reach the caller.
		set rc $::RETURN_ERROR
	}

	return $rc
}


################################################################
#
# NAME
#	wait_for_fed_job - waits for a job to reach the desired state
#
# SYNOPSIS
#	wait_for_fed_job ?options? job_id desired_state clusters
#
# DESCRIPTION
#	Wait for a previously submitted Slurm job to reach the desired state.
#
# OPTIONS
#	-timeout <integer_number>
#		time in seconds to wait for the job to be in the desired state
#		before timing out (default is 360)
#	-pollinterval <integer_number>
#		time in seconds between each job state check (default is 1)
#
# ARGUMENTS
#	job_id
#		The Slurm job id of a job we want to wait for.
#	desired_state
#		The state you want the job to attain before
#		returning.  Currently supports:
#			DONE any terminated state
#			PENDING job is pending
#			REVOKED job is revoked
#			RUNNING job is running
#			SUSPENDED job is suspended
#			SPECIAL_EXIT job is in the special exit state
#	clusters
#		Comma-separated list of clusters to check. If empty "" the
#		default clusters of fedc1, fedc2 and fedc3 will be used.
#
# RETURN VALUE
#	The name of the first cluster on which the job is found
#	in the desired state. Empty string indicates a failure.
#
# NOTE
#	We sleep for two seconds before replying that a job is
#	done to give time for I/O completion (stdout/stderr files)
#
################################################################

proc wait_for_fed_job args {
	global scontrol fedc1 fedc2 fedc3

	# Option defaults; -timeout / -pollinterval override them
	set timeout       360
	set poll_interval 1
	while {[llength $args]} {
		switch -glob -- [lindex $args 0] {
			-time*  {set args [lassign $args - timeout]}
			-poll*  {set args [lassign $args - poll_interval]}
			-*      {fail "Unknown option: [lindex $args 0]"}
			default break
		}
	}

	# Exactly three positional arguments must remain
	set positional [llength $args]
	if {$positional == 3} {
		lassign $args job_id desired_state clusters
	} else {
		fail "Invalid number of arguments ($positional): $args"
	}

	# Reject states this procedure does not know how to wait for
	if {$desired_state ni {DONE PENDING REVOKED RUNNING SUSPENDED SPECIAL_EXIT}} {
		log_error "Invalid desired state: $desired_state"
		return ""
	}

	if {$job_id == 0} {
		log_error "Invalid job ID: $job_id"
		return ""
	}

	# Default to the three federation clusters unless a list was given
	set targets [list $fedc1 $fedc2 $fedc3]
	if {$clusters ne ""} {
		set targets [split $clusters ","]
	}
	log_debug "Checking for job '$job_id' in state '$desired_state' on [join $targets ,]"

	set elapsed 0
	while {1} {
		foreach cluster $targets {
			log_debug "Checking $cluster"

			# Only the first output line of scontrol is examined
			set fd [open "|$scontrol -M$cluster --local -a -o show job $job_id"]
			gets $fd line
			catch {close $fd}
			if {![regexp {JobState\s*=\s*(\w+)} $line -> state]} {
				log_error "$desired_state not found on cluster $cluster"
				continue
			}

			switch $state {
				"NOT_FOUND" -
				"CANCELLED" -
				"DEADLINE" -
				"FAILED" -
				"TIMEOUT" -
				"NODE_FAIL" -
				"PREEMPTED" -
				"COMPLETED" {
					# Terminated: success only when waiting for DONE,
					# otherwise the desired state can no longer be reached
					if {$desired_state eq "DONE"} {
						log_debug "Job $job_id is DONE ($state) on $cluster"
						sleep 2
						return $cluster
					}
					if {$desired_state in {RUNNING SUSPENDED}} {
						log_warn "Job $job_id is $state but we wanted $desired_state"
					}
					return ""
				}
				"PENDING" -
				"REVOKED" -
				"RUNNING" -
				"SPECIAL_EXIT" -
				"SUSPENDED" {
					# Live state: done if it is the one asked for,
					# otherwise warn and keep polling
					if {$desired_state eq $state} {
						log_debug "Job $job_id is $state on $cluster"
						return $cluster
					}
					log_warn "Job $job_id is in state $state, but desired state $desired_state"
				}
				default {
					log_debug "Job $job_id is in state $state. Desired state was $desired_state"
				}
			}
		}

		# Give up once the accumulated delay exceeds the timeout
		if {$elapsed > $timeout} {
			log_error "Timeout waiting for job state $desired_state"
			return ""
		}

		exec sleep $poll_interval
		set elapsed [expr {$elapsed + $poll_interval}]
	}
}
