#!/bin/bash
#
# Copyright (C) Microsoft Corporation.
#
# SQL Server Always-On Availability Groups resource agent
#
# Valid actions are:
#    start
#    stop
#    promote
#    demote
#    monitor
#    validate-all
#    meta-data
#

# ----------------------------------------------------------------------------------------------------------
# Location of other files used by this resource agent.
# A value already present in the environment (even an empty one) is kept; the path below is only
# used when the variable is unset.
#
AG_HELPER_BIN="${AG_HELPER_BIN-${OCF_ROOT}/lib/mssql/ag-helper}"
METADATA_FILE="${METADATA_FILE-${OCF_ROOT}/lib/mssql/ag_metadata}"
USAGE_FILE="${USAGE_FILE-${OCF_ROOT}/lib/mssql/ag_usage}"

# ----------------------------------------------------------------------------------------------------------
# Default values for optional parameters (likewise only applied when the variable is unset)
#
MONITOR_LEVEL_DEFAULT="${MONITOR_LEVEL_DEFAULT-3}"
CONNECTION_TIMEOUT_DEFAULT="${CONNECTION_TIMEOUT_DEFAULT-30}"
DISABLE_PRIMARY_ON_QUORUM_TIMEOUT_DEFAULT="${DISABLE_PRIMARY_ON_QUORUM_TIMEOUT_DEFAULT-60}"
MONITORING_CREDENTIALS_FILE_DEFAULT="${MONITORING_CREDENTIALS_FILE_DEFAULT-/var/opt/mssql/secrets/passwd}"
PORT_DEFAULT="${PORT_DEFAULT-1433}"
PROCESS_NAME_DEFAULT="${PROCESS_NAME_DEFAULT-sqlservr}"
REQUIRED_SYNCHRONIZED_SECONDARIES_TO_COMMIT_DEFAULT="${REQUIRED_SYNCHRONIZED_SECONDARIES_TO_COMMIT_DEFAULT--1}" # -1 is a sentinel that ag-helper interprets as "unset"

# Offset between ag-helper's exit code and the OCF exit code: subtract it from the former to get the latter.
# This is the inverse of the operation performed by `mssqlcommon.OcfExit` used by ag-helper.
OCF_EXIT_DIFFERENCE=10

# ----------------------------------------------------------------------------------------------------------
# Pacemaker libraries
#
# The shell function library is sourced into this process; its location can be overridden through
# OCF_FUNCTIONS. Helpers such as ocf_log and the OCF_* exit codes used below are presumably defined
# there (they are not defined in this file).
#
OCF_FUNCTIONS="${OCF_FUNCTIONS-${OCF_ROOT}/resource.d/heartbeat/.ocf-shellfuncs}"
source "$OCF_FUNCTIONS"

# Fall back to the first command line argument when the action has not been set already
__OCF_ACTION="${__OCF_ACTION-$1}"

# ----------------------------------------------------------------------------------------------------------
# function: mssql_meta_data
#
# Description:
#    Implements the OCF "meta-data" action by writing the contents of $METADATA_FILE to stdout.
#
# Returns:
#    The exit status of `cat`.
#
mssql_meta_data() {
	local metadata_path="$METADATA_FILE"

	cat "$metadata_path"
}

# ----------------------------------------------------------------------------------------------------------
# function: mssql_usage
#
# Description:
#    Writes the contents of $USAGE_FILE to stdout.
#
# Returns:
#    The exit status of `cat`.
#
mssql_usage() {
	local usage_path="$USAGE_FILE"

	cat "$usage_path"
}

# ----------------------------------------------------------------------------------------------------------
# function: mssql_start
#
# Description:
#    Implements the OCF "start" action by running ag-helper with `--action start`.
#    Every line ag-helper prints (stdout and stderr) is copied to the cluster log with a "start: " prefix.
#
# Returns:
#    OCF_ERR_GENERIC if the SQL Server process cannot be found, or if ag-helper exits with a code
#    below OCF_EXIT_DIFFERENCE. Otherwise ag-helper's exit code minus OCF_EXIT_DIFFERENCE.
#
mssql_start() {
	ocf_log info 'mssql_start'

	# Sequence numbers of all replicas, passed to ag-helper as-is
	#
	local replica_sequence_numbers
	replica_sequence_numbers="$(attrd_updater -n "$OCF_RESOURCE_INSTANCE-sequence-number" -QA)"

	local helper_output
	local helper_status

	pidof "$OCF_RESKEY_process_name" || {
		# SQL Server crashed or isn't running at all.
		ocf_exit_reason "SQL Server isn't running."
		return "$OCF_ERR_GENERIC"
	}

	# Name of the node that crm_resource reports as running this resource in the Master role (empty if none)
	local master_node
	master_node="$(crm_resource -r "$OCF_RESOURCE_INSTANCE" --locate | grep -Po 'resource [^ ]+ is running on: \K(.+)(?= Master$)')"

	local -a helper_args=(
		--port "$OCF_RESKEY_port"
		--credentials-file "$OCF_RESKEY_monitoring_credentials_file"
		--ag-name "$OCF_RESKEY_ag_name"
		--application-name "monitor-$OCF_RESOURCE_INSTANCE-start"
		--connection-timeout "$OCF_RESKEY_connection_timeout"
		--health-threshold "$OCF_RESKEY_monitor_policy"
		--action start
		--sequence-numbers "$replica_sequence_numbers"
		--required-synchronized-secondaries-to-commit "$OCF_RESKEY_required_synchronized_secondaries_to_commit"
		--current-master "$master_node"
		--disable-primary-on-quorum-timeout-after "$OCF_RESKEY_disable_primary_on_quorum_timeout_after"
		--primary-write-lease-duration "$OCF_RESKEY_primary_write_lease_duration"
	)

	# The subshell exits with ag-helper's own status rather than that of the logging loop
	helper_output="$(
		"$AG_HELPER_BIN" "${helper_args[@]}" 2>&1 |
			while read -r output_line; do
				ocf_log info "start: $output_line"
				echo "$output_line"
			done
		exit "${PIPESTATUS[0]}"
	)"
	helper_status="$?"

	set_exit_reason "$helper_output"

	if [[ "$helper_status" -lt "$OCF_EXIT_DIFFERENCE" ]]; then
		# ag-helper failed in an unexpected way
		#
		return "$OCF_ERR_GENERIC"
	fi

	set_promotion_score "$helper_output"

	return "$(( helper_status - OCF_EXIT_DIFFERENCE ))"
}

# ----------------------------------------------------------------------------------------------------------
# function: mssql_stop
#
# Description:
#    Implements the OCF "stop" action. Runs ag-helper with `--action stop` under a time limit and,
#    if that yields OCF_ERR_GENERIC (or is skipped because the action timeout is too short),
#    falls back to killing the SQL Server process.
#
# Returns:
#    OCF_SUCCESS if SQL Server is not running, if ag-helper succeeded, or if the fallback kill succeeded.
#    OCF_ERR_GENERIC if ag-helper exits with a code below OCF_EXIT_DIFFERENCE, or if the fallback kill fails.
#    Otherwise ag-helper's exit code minus OCF_EXIT_DIFFERENCE.
#
mssql_stop() {
	ocf_log info 'mssql_stop'

	local helper_output
	local status

	pidof "$OCF_RESKEY_process_name" || {
		# SQL Server crashed. This is as good as stopped.
		# Even if it was a PRIMARY before it crashed and automatically restarts, it will come back as RESOLVING.
		ocf_exit_reason "SQL Server isn't running."
		return "$OCF_SUCCESS"
	}

	# Reserve 5s for killing the SQL Server process if ag-helper fails
	# and use the rest to run ag-helper
	local helper_budget="$(( OCF_RESKEY_CRM_meta_timeout / 1000 - 5 ))"
	if (( helper_budget >= 5 )); then
		local -a helper_args=(
			--port "$OCF_RESKEY_port"
			--credentials-file "$OCF_RESKEY_monitoring_credentials_file"
			--ag-name "$OCF_RESKEY_ag_name"
			--application-name "monitor-$OCF_RESOURCE_INSTANCE-stop"
			--connection-timeout "$OCF_RESKEY_connection_timeout"
			--health-threshold "$OCF_RESKEY_monitor_policy"
			--action stop
		)

		# The subshell exits with the status of `timeout` rather than that of the logging loop
		helper_output="$(
			timeout "$helper_budget" "$AG_HELPER_BIN" "${helper_args[@]}" 2>&1 |
				while read -r output_line; do
					ocf_log info "stop: $output_line"
					echo "$output_line"
				done
			exit "${PIPESTATUS[0]}"
		)"
		status="$?"

		if [[ "$status" -eq 124 ]]; then
			# `timeout` exits with 124 on timeout. Map it to $OCF_ERR_GENERIC
			ocf_log error 'ag-helper timed out'
			status="$(( OCF_ERR_GENERIC + OCF_EXIT_DIFFERENCE ))"
		fi

		set_exit_reason "$helper_output"

		if [[ "$status" -lt "$OCF_EXIT_DIFFERENCE" ]]; then
			# ag-helper failed in an unexpected way
			#
			return "$OCF_ERR_GENERIC"
		fi

		status="$(( status - OCF_EXIT_DIFFERENCE ))"
	else
		# The stop timeout should be at least 10 seconds so that there's enough time to change the AG replica's
		# role, and to kill the SQL Server process if that fails. If the user set it shorter, there doesn't seem
		# to be any way to tell them that while still having the `stop` action succeed (failure would cause the
		# node to be fenced).
		#
		# So print an error to the cluster log and then pretend the role change failed without actually trying it.
		# This will kill the process and hopefully prompt the user to read the cluster log and discover this error.
		#
		# Note that Pacemaker documentation recommends both that the default timeout specified in a resource's
		# metadata (10s in this case) is the minimum that the user should set, and further recommends not setting
		# *any* timeout below 10 seconds anyway. So a user who sets this timeout to less than 10 seconds is going
		# out of their way to do it.
		ocf_log error 'The stop action timeout should be at least 10 seconds'
		status="$OCF_ERR_GENERIC"
	fi

	# On OCF_SUCCESS there is nothing else to do; any status other than OCF_ERR_GENERIC is returned unchanged.
	if [[ "$status" == "$OCF_ERR_GENERIC" ]]; then
		# ag-helper failed to set the AG replica to SECONDARY role.
		# Kill the instance to ensure that it stops being in PRIMARY role.
		#
		ocf_log info 'Killing SQL Server process...'
		if ocf_stop_processes KILL 9 "$(pidof "$OCF_RESKEY_process_name")"; then
			# Once the SQL Server instance has been killed, it will not come back as PRIMARY.
			# If it was PRIMARY before, it will come back as RESOLVING.
			# So by killing SQL Server, the stop action has succeeded.
			status="$OCF_SUCCESS"
		fi
	fi

	return "$status"
}

# ----------------------------------------------------------------------------------------------------------
# function: mssql_monitor
#
# Description:
#    Implements the OCF "monitor" action by running ag-helper with `--action monitor`.
#    Every line ag-helper prints (stdout and stderr) is copied to the cluster log with a "monitor: " prefix.
#
# Returns:
#    OCF_NOT_RUNNING if the SQL Server process cannot be found.
#    OCF_ERR_GENERIC if ag-helper exits with a code below OCF_EXIT_DIFFERENCE.
#    Otherwise ag-helper's exit code minus OCF_EXIT_DIFFERENCE.
#
mssql_monitor() {
	ocf_log info 'mssql_monitor'

	local helper_output
	local helper_status

	pidof "$OCF_RESKEY_process_name" || {
		# SQL Server crashed or isn't running at all.
		ocf_exit_reason "SQL Server isn't running."
		return "$OCF_NOT_RUNNING"
	}

	# Name of the node that crm_resource reports as running this resource in the Master role (empty if none)
	local master_node
	master_node="$(crm_resource -r "$OCF_RESOURCE_INSTANCE" --locate | grep -Po 'resource [^ ]+ is running on: \K(.+)(?= Master$)')"

	local -a helper_args=(
		--port "$OCF_RESKEY_port"
		--credentials-file "$OCF_RESKEY_monitoring_credentials_file"
		--ag-name "$OCF_RESKEY_ag_name"
		--application-name "monitor-$OCF_RESOURCE_INSTANCE-monitor"
		--connection-timeout "$OCF_RESKEY_connection_timeout"
		--health-threshold "$OCF_RESKEY_monitor_policy"
		--action monitor
		--required-synchronized-secondaries-to-commit "$OCF_RESKEY_required_synchronized_secondaries_to_commit"
		--current-master "$master_node"
		--disable-primary-on-quorum-timeout-after "$OCF_RESKEY_disable_primary_on_quorum_timeout_after"
		--primary-write-lease-duration "$OCF_RESKEY_primary_write_lease_duration"
	)

	# The subshell exits with ag-helper's own status rather than that of the logging loop
	helper_output="$(
		"$AG_HELPER_BIN" "${helper_args[@]}" 2>&1 |
			while read -r output_line; do
				ocf_log info "monitor: $output_line"
				echo "$output_line"
			done
		exit "${PIPESTATUS[0]}"
	)"
	helper_status="$?"

	set_exit_reason "$helper_output"

	if [[ "$helper_status" -lt "$OCF_EXIT_DIFFERENCE" ]]; then
		# ag-helper failed in an unexpected way
		#
		return "$OCF_ERR_GENERIC"
	fi

	set_promotion_score "$helper_output"

	return "$(( helper_status - OCF_EXIT_DIFFERENCE ))"
}

# ----------------------------------------------------------------------------------------------------------
# function: mssql_promote
#
# Description:
#    Implements the OCF "promote" action by running ag-helper with `--action promote`.
#    Every line ag-helper prints (stdout and stderr) is copied to the cluster log with a "promote: " prefix.
#
# Returns:
#    OCF_NOT_RUNNING if the SQL Server process cannot be found.
#    OCF_ERR_GENERIC if ag-helper exits with a code below OCF_EXIT_DIFFERENCE.
#    Otherwise ag-helper's exit code minus OCF_EXIT_DIFFERENCE.
#
mssql_promote() {
	ocf_log info 'mssql_promote'

	pidof "$OCF_RESKEY_process_name" || {
		# SQL Server crashed.
		ocf_exit_reason "SQL Server isn't running."
		return "$OCF_NOT_RUNNING"
	}

	local helper_output
	local helper_status

	# Sequence numbers of all replicas, passed to ag-helper as-is
	#
	local replica_sequence_numbers
	replica_sequence_numbers="$(attrd_updater -n "$OCF_RESOURCE_INSTANCE-sequence-number" -QA)"

	local -a helper_args=(
		--port "$OCF_RESKEY_port"
		--credentials-file "$OCF_RESKEY_monitoring_credentials_file"
		--ag-name "$OCF_RESKEY_ag_name"
		--application-name "monitor-$OCF_RESOURCE_INSTANCE-promote"
		--connection-timeout "$OCF_RESKEY_connection_timeout"
		--health-threshold "$OCF_RESKEY_monitor_policy"
		--action promote
		--sequence-numbers "$replica_sequence_numbers"
		--new-master "$OCF_RESKEY_CRM_meta_notify_promote_uname"
		--required-synchronized-secondaries-to-commit "$OCF_RESKEY_required_synchronized_secondaries_to_commit"
		--disable-primary-on-quorum-timeout-after "$OCF_RESKEY_disable_primary_on_quorum_timeout_after"
		--primary-write-lease-duration "$OCF_RESKEY_primary_write_lease_duration"
	)

	# The subshell exits with ag-helper's own status rather than that of the logging loop
	helper_output="$(
		"$AG_HELPER_BIN" "${helper_args[@]}" 2>&1 |
			while read -r output_line; do
				ocf_log info "promote: $output_line"
				echo "$output_line"
			done
		exit "${PIPESTATUS[0]}"
	)"
	helper_status="$?"

	set_exit_reason "$helper_output"

	if [[ "$helper_status" -lt "$OCF_EXIT_DIFFERENCE" ]]; then
		# ag-helper failed in an unexpected way
		#
		return "$OCF_ERR_GENERIC"
	fi

	return "$(( helper_status - OCF_EXIT_DIFFERENCE ))"
}

# ----------------------------------------------------------------------------------------------------------
# function: mssql_demote
#
# Description:
#    Implements the OCF "demote" action by running ag-helper with `--action demote`.
#    Every line ag-helper prints (stdout and stderr) is copied to the cluster log with a "demote: " prefix.
#
# Returns:
#    OCF_SUCCESS if the SQL Server process cannot be found.
#    OCF_ERR_GENERIC if ag-helper exits with a code below OCF_EXIT_DIFFERENCE.
#    Otherwise ag-helper's exit code minus OCF_EXIT_DIFFERENCE.
#
mssql_demote() {
	ocf_log info 'mssql_demote'

	pidof "$OCF_RESKEY_process_name" || {
		# SQL Server crashed. This is as good as demoted.
		# Even if it was a PRIMARY before it crashed and automatically restarts, it will come back as RESOLVING.
		ocf_exit_reason "SQL Server isn't running."
		return "$OCF_SUCCESS"
	}

	local helper_output
	local helper_status

	local -a helper_args=(
		--port "$OCF_RESKEY_port"
		--credentials-file "$OCF_RESKEY_monitoring_credentials_file"
		--ag-name "$OCF_RESKEY_ag_name"
		--application-name "monitor-$OCF_RESOURCE_INSTANCE-demote"
		--connection-timeout "$OCF_RESKEY_connection_timeout"
		--health-threshold "$OCF_RESKEY_monitor_policy"
		--action demote
	)

	# The subshell exits with ag-helper's own status rather than that of the logging loop
	helper_output="$(
		"$AG_HELPER_BIN" "${helper_args[@]}" 2>&1 |
			while read -r output_line; do
				ocf_log info "demote: $output_line"
				echo "$output_line"
			done
		exit "${PIPESTATUS[0]}"
	)"
	helper_status="$?"

	set_exit_reason "$helper_output"

	if [[ "$helper_status" -lt "$OCF_EXIT_DIFFERENCE" ]]; then
		# ag-helper failed in an unexpected way
		#
		return "$OCF_ERR_GENERIC"
	fi

	return "$(( helper_status - OCF_EXIT_DIFFERENCE ))"
}

# ----------------------------------------------------------------------------------------------------------
# function: _mssql_notify_run_helper
#
# Description:
#    Private helper of mssql_notify. Runs ag-helper for a single notification and relays every line of
#    its combined stdout/stderr to the cluster log (prefixed with "notify: ") and to stdout.
#
# Arguments:
#    $1    Notification name (e.g. pre-start). Used as the --action value and as the application name suffix.
#    $2... Additional ag-helper arguments specific to that notification, appended after --action.
#
# Returns:
#    The exit code of ag-helper itself, not that of the logging loop.
#
_mssql_notify_run_helper() {
	local notification="$1"
	shift

	local line

	"$AG_HELPER_BIN" \
		--port "$OCF_RESKEY_port" \
		--credentials-file "$OCF_RESKEY_monitoring_credentials_file" \
		--ag-name "$OCF_RESKEY_ag_name" \
		--application-name "monitor-$OCF_RESOURCE_INSTANCE-$notification" \
		--connection-timeout "$OCF_RESKEY_connection_timeout" \
		--health-threshold "$OCF_RESKEY_monitor_policy" \
		--action "$notification" \
		"$@" \
		2>&1 |
		while read -r line; do
			ocf_log info "notify: $line"
			echo "$line"
		done
	return "${PIPESTATUS[0]}"
}

# ----------------------------------------------------------------------------------------------------------
# function: mssql_notify
#
# Description:
#    Implements the OCF "notify" action. The notification is identified by
#    "$OCF_RESKEY_CRM_meta_notify_type-$OCF_RESKEY_CRM_meta_notify_operation":
#
#    pre-start, pre-promote:  run ag-helper and, on success, publish the sequence number it prints
#                             as the "<resource>-sequence-number" node attribute.
#    post-promote:            delete the sequence number attribute, then run ag-helper.
#    post-start:              delete the sequence number attribute.
#    post-stop:               run ag-helper.
#    anything else:           nothing to do.
#
# Returns:
#    OCF_SUCCESS for post-start and for notifications that are not handled.
#    OCF_ERR_GENERIC if ag-helper exits with a code below OCF_EXIT_DIFFERENCE, or if a successful
#    pre-start / pre-promote run printed no sequence number.
#    Otherwise ag-helper's exit code minus OCF_EXIT_DIFFERENCE.
#
mssql_notify() {
	local notification="$OCF_RESKEY_CRM_meta_notify_type-$OCF_RESKEY_CRM_meta_notify_operation"

	ocf_log info "mssql_notify $notification"

	local command_output
	local rc

	case "$notification" in
		'pre-start' | 'post-stop')
			command_output="$(
				_mssql_notify_run_helper "$notification" \
					--required-synchronized-secondaries-to-commit "$OCF_RESKEY_required_synchronized_secondaries_to_commit"
			)"
			rc="$?"
			;;

		'post-promote')
			# Reset sequence number attribute so that it doesn't retain old values for subsequent starts or promotes
			attrd_updater -n "$OCF_RESOURCE_INSTANCE-sequence-number" -D
			command_output="$(_mssql_notify_run_helper "$notification")"
			rc="$?"
			;;

		'post-start')
			# Reset sequence number attribute so that it doesn't retain old values for subsequent starts or promotes
			attrd_updater -n "$OCF_RESOURCE_INSTANCE-sequence-number" -D
			return "$OCF_SUCCESS"
			;;

		'pre-promote')
			command_output="$(_mssql_notify_run_helper "$notification")"
			rc="$?"
			;;

		*)
			return "$OCF_SUCCESS"
			;;
	esac

	set_exit_reason "$command_output"

	if (( rc < OCF_EXIT_DIFFERENCE )); then
		# ag-helper failed in an unexpected way
		#
		return "$OCF_ERR_GENERIC"
	fi

	rc="$(( rc - OCF_EXIT_DIFFERENCE ))"

	case "$notification" in
		'pre-start' | 'pre-promote')
			if (( rc == OCF_SUCCESS )); then
				# Find sequence number in ag-helper's output
				#
				local sequence_number="$(echo "$command_output" | grep -Po '^SEQUENCE_NUMBER: \K.*')"

				if [[ -z "$sequence_number" ]]; then
					ocf_exit_reason 'Could not find sequence number in ag-helper output.'
					return "$OCF_ERR_GENERIC"
				fi

				attrd_updater -n "$OCF_RESOURCE_INSTANCE-sequence-number" -U "$sequence_number" -p

				# Work around attrd bug https://bugzilla.redhat.com/show_bug.cgi?id=1463033
				# attrd_updater can receive ack from attrd for the update before attrd has propagated the value
				# to other nodes or even committed it locally.
				# There is no retry limit here: the loop only ends once the query returns a value.
				local attribute_value=''
				while :; do
					attribute_value="$(attrd_updater -n "$OCF_RESOURCE_INSTANCE-sequence-number" -Q)"
					if [[ -n "$attribute_value" ]]; then
						break
					fi

					sleep 5
				done

				# -Q returns the value when it's committed locally but not necessarily propagated to other nodes,
				# so sleep some more to let the update propagate
				sleep 5
			fi
			;;
	esac

	return "$rc"
}

# ----------------------------------------------------------------------------------------------------------
# function: mssql_validate
#
# Description:
#    Implements the OCF "validate-all" action. Fills in defaults for optional parameters (including the
#    old aliases monitor_timeout and required_copies_to_commit), then checks the ag-helper binary,
#    the credentials file and the notify setting.
#
# Returns:
#    OCF_SUCCESS: The credentials file exists and the resource is configured with notify=true.
#    OCF_ERR_ARGS: The credentials file does not exist.
#    OCF_ERR_CONFIGURED: The resource is not configured with notify=true.
#
#    The ag-helper binary is checked through check_binary, whose result is not inspected here;
#    presumably it terminates the agent itself when the binary is missing (confirm against ocf-shellfuncs).
#
mssql_validate() {
	ocf_log info 'mssql_validate'

	# Parameters that keep any value that was supplied, even an empty one
	#
	OCF_RESKEY_disable_primary_on_quorum_timeout_after="${OCF_RESKEY_disable_primary_on_quorum_timeout_after-$DISABLE_PRIMARY_ON_QUORUM_TIMEOUT_DEFAULT}"
	OCF_RESKEY_monitoring_credentials_file="${OCF_RESKEY_monitoring_credentials_file-$MONITORING_CREDENTIALS_FILE_DEFAULT}"
	OCF_RESKEY_monitor_policy="${OCF_RESKEY_monitor_policy-$MONITOR_LEVEL_DEFAULT}"
	OCF_RESKEY_port="${OCF_RESKEY_port-$PORT_DEFAULT}"
	# Default lease: action timeout plus action interval (both given in milliseconds) plus 10 seconds
	OCF_RESKEY_primary_write_lease_duration="${OCF_RESKEY_primary_write_lease_duration-$(( OCF_RESKEY_CRM_meta_timeout / 1000 + OCF_RESKEY_CRM_meta_interval / 1000 + 10 ))}"
	OCF_RESKEY_process_name="${OCF_RESKEY_process_name-$PROCESS_NAME_DEFAULT}"

	# Parameters with an old alias: an unset or empty value falls back to the alias, then to the default.
	# monitor_timeout is an old alias for connection_timeout
	OCF_RESKEY_connection_timeout="${OCF_RESKEY_connection_timeout:-${OCF_RESKEY_monitor_timeout:-$CONNECTION_TIMEOUT_DEFAULT}}"

	# required_copies_to_commit is an old alias for required_synchronized_secondaries_to_commit
	OCF_RESKEY_required_synchronized_secondaries_to_commit="${OCF_RESKEY_required_synchronized_secondaries_to_commit:-${OCF_RESKEY_required_copies_to_commit:-$REQUIRED_SYNCHRONIZED_SECONDARIES_TO_COMMIT_DEFAULT}}"

	# Check binaries necessary for the resource agent to run
	#
	check_binary "$AG_HELPER_BIN"

	# Check credentials file
	#
	[[ -f "$OCF_RESKEY_monitoring_credentials_file" ]] || {
		ocf_exit_reason "Credentials file at $OCF_RESKEY_monitoring_credentials_file does not exist"
		return "$OCF_ERR_ARGS"
	}

	# Check notify=true
	if [[ "$OCF_RESKEY_CRM_meta_notify" == 'true' ]]; then
		return "$OCF_SUCCESS"
	fi

	ocf_exit_reason 'Resource must be configured with notify=true'
	return "$OCF_ERR_CONFIGURED"
}

# ----------------------------------------------------------------------------------------------------------
# function: mssql_export_ocf_exit_codes
#
# Description:
#    Marks the OCF exit code variables for export so that sub-processes see them in their environment.
#
mssql_export_ocf_exit_codes() {
	local code_name

	for code_name in \
		OCF_ERR_ARGS OCF_ERR_CONFIGURED OCF_ERR_GENERIC OCF_ERR_PERM OCF_ERR_UNIMPLEMENTED \
		OCF_FAILED_MASTER OCF_NOT_RUNNING \
		OCF_RUNNING_MASTER OCF_SUCCESS; do
		export "$code_name"
	done
}

# ----------------------------------------------------------------------------------------------------------
# function: set_exit_reason
#
# Description:
#    Looks for the first line of the given command output that begins with "ERROR: " and, if there is
#    one with text after the prefix, reports that text through ocf_exit_reason.
#
# Arguments:
#    $1 Command output to search.
#
set_exit_reason() {
	local first_error
	first_error="$(grep -Po '^ERROR: \K.*' <<<"$1" | head -n1)"

	[[ -z "$first_error" ]] || ocf_exit_reason "$first_error"
}

# ----------------------------------------------------------------------------------------------------------
# function: set_promotion_score
#
# Description:
#    Extracts the text following "PROMOTION_SCORE: " at the start of a line in the given command output
#    and passes it to `crm_master -v ... -l reboot`.
#    If no such text is found, `-INFINITY` is passed instead.
#
# Arguments:
#    $1 Command output to search.
#
set_promotion_score() {
	local score
	score="$(grep -Po '^PROMOTION_SCORE: \K.*' <<<"$1")"

	crm_master -v "${score:--INFINITY}" -l reboot
}

# ----------------------------------------------------------------------------------------------------------
# Entry point: dispatch on the requested OCF action.
#
# Informational actions only print a static file, so they are served before parameter validation.
# This keeps `usage` / `help` available when the agent is run without a complete resource
# configuration, e.g. by hand from a shell.
#
case "$__OCF_ACTION" in
	'meta-data')
		mssql_meta_data
		exit "$OCF_SUCCESS"
		;;
	'usage' | 'help')
		mssql_usage
		exit "$OCF_SUCCESS"
		;;
esac

# Besides checking the configuration, validation also fills in the defaults of optional parameters
# that the action functions below read.
mssql_validate
validate_result="$?"

ocf_log info "Resource agent invoked with: $__OCF_ACTION"

# Everything else must pass validation
if (( validate_result != 0 )); then
	exit "$validate_result"
fi

mssql_export_ocf_exit_codes

case "$__OCF_ACTION" in
	'start')
		mssql_start
		;;
	'stop')
		mssql_stop
		;;
	'monitor')
		mssql_monitor
		;;
	'promote')
		mssql_promote
		;;
	'demote')
		mssql_demote
		;;
	'notify')
		mssql_notify
		;;
	'validate-all')
		exit "$validate_result"
		;;
	*)
		# Unknown action: show the usage text and report it as unimplemented
		mssql_usage
		exit "$OCF_ERR_UNIMPLEMENTED"
		;;
esac
# Exit status of whichever action function ran above
rc="$?"

ocf_log info "$OCF_RESOURCE_INSTANCE $__OCF_ACTION : $rc"
exit "$rc"
