#!/bin/ksh
######################################################################
# Purpose:	md2web - process a Markdown document (potentially
#		with a structured layout) to an HTML/M4 intermediate
#		format.
#               - Process known headers into _ROBOTS, _DESCRIPTION,
#                 _INDEX and other commands.
#		- Format and quote _M4COMMANDS and m4_commands so
#                 Multimarkdown doesn't break them.
#		- Convert headings to TOC macros if TOC is enabled.
# Copyright:	Copyright 2018-2026 Perette Barella.
#		All rights reserved.
######################################################################
VERSION='$Id: md2web 322 2026-03-25 16:43:16Z perette $'

arg0="$(basename "$0")"

# Validate that ksh supports the modern/extended getopts format.
# Author: Perette Barella 
# Copyright 2018 Devious Fish.  All rights reserved.
# $Id: modern_ksh_check 19 2018-07-28 23:40:39Z perette $  


function modern_ksh_check {
	# Parse `--abc` against an extended-format option specification.
	# A getopts that understands the format leaves "12" in flag, so the
	# probe prints "012"; any other output means the shell is too old.
	typeset probe
	probe=$(getopts '[-][12:abc]' flag --abc; print -- 0$flag)
	[[ $probe == "012" ]] && return 0

	# Diagnostic is prefixed with the script name when arg0 is set.
	print -- "$arg0${arg0+: }Outdated Korn shell." 1>&2
	exit 1
}
# Korn shell/zsh require
# Author: Perette Barella
# Copyright 2018 Devious Fish.  All rights reserved.
# $Id: require 61 2021-08-09 18:49:07Z perette $

function require {
	# Verify that every named command can be found on the PATH.
	# Each argument is either "command" or "command:display name"; the
	# display name is what appears in the diagnostic.
	# Exits the script with status 1 if anything is missing, otherwise
	# returns 0.
	typeset spec program label
	integer result=0
	for spec in "$@"
	do
		program="${spec%:*}"
		label="${spec#*:}"
		whence -p "$program" >/dev/null 2>&1 && continue
		print -- "$arg0${arg0+: }$label not found; please install." 1>&2
		result=1
	done
	if (( result != 0 ))
	then
		exit $result
	fi
	return 0
}


# Abort early unless the shell supports the extended getopts format and
# the multimarkdown command can be found on the PATH.
modern_ksh_check
require multimarkdown

# Option specification and manual text in the extended getopts format;
# consumed by the getopts loop that parses the command line.
USAGE=$'
[-1?'$VERSION$']
[+NAME?'$arg0$' - translate a markdown document to an M4/HTML .web document.]
[+DESCRIPTION?\b'$arg0$'\b converts markdown into a .web file which is then
suitable for compiling to a completed web page.  Compared to a direct conversion
to HTML, this permits merging the document with page templates, styles and
scripts.]
[+?If multiple files are specified, they are treated as a single document.  Headings are taken from the first file.]
[o:output?Set target filename]:[target]
[+HEADINGS?The following Markdown header fields are used by the conversion:]
{
  [+Title?For the HTML <title> of the document.  Required.]
  [+Summary?A description of the document.  A warning is printed if it is missing.]
  [+Author?The author of the document.]
  [+Date?The date the document was written.]
  [+Version?A document revision.]
  [+Compile?Directives for the translation and compilation (see below).]
  [+Template?The name of the template with which to merge the document.  The default is \btemplate.m4\b.  To avoid templates, use the magic value \b-none-\b.]
  [+Robots?To set a \brobot\b directive in the resulting document.]
  [+Index?Index entries, semicolon separated.]
  [+SeeAlso?"See also" index entries, semicolon-separated, in the format \vindex-entry\v\b-->\b\valternate-entry\v.]
}
[+?Without a \bCompile\b directive, the document is expected to be plain Markdown and converted as expected.  The following space-separated directives change this:]
{
  [+m4markup?The document invokes macros from the M4/HTML macro package.  Such macros must be at the start of a line.  They may span lines; the end is detected by the closing parenthesis at the end of a line.  Macro calls differently formatted are likely to cause problems.  Underlines in macro calls may be preceded by a backslash (helpful in preventing editors from attempting to italicize text), but this is not necessary.  \bm4\b is accepted as a synonym.]
  [+inline?Indicates the document contains inline m4 markup.  Normally, Markdown converts backticks and apostrophes to open and close double quotation marks; this behavior is disabled by this option.]
  [+contents?Requests a structured document.  Heading 1 becomes the document title.  Text following heading 1 is inserted above the table of contents.  Headings 2-4 become chapters, sections, and subsections with hierarchical numbering.  Additionally, inline index entries will link to the title in which they are contained.  \btoc\b is accepted as a synonym.]
}
[+EXIT STATUS?0 on success, non-0 on error.]
[+SEE ALSO?\bweb2html\b(1)]

files ...

[-author?Perette Barella <perette@deviousfish.com>]
'


# Convenience function to invoke the external MMDtool found under
# $WEBBASE/Include/bin.  All arguments, stdin and stdout are passed
# straight through, and the tool's exit status is returned.
function mmdtool {
	# Quoted so that a WEBBASE containing whitespace still resolves
	# to a single command path.
	"${WEBBASE}/Include/bin/mmdtool" "$@"
}

# Running count of unclosed parentheses in the m4 command currently being
# emitted; shared between successive m4quote calls.
PARENTHESIS_BALANCE=0
# Add quotes for an m4 command.
# Quote `` and '' so multimarkdown passes them through.
# $1 is one line of the command.  A command may span several lines; each
# line is passed in a separate call and PARENTHESIS_BALANCE carries the
# nesting depth between calls.  Unfinished lines are written without a
# trailing newline so the whole command ends up on one output line.
# @return 0 if command end was reached (saw closing paren), non-0 otherwise.
function m4quote {
	typeset closed=false line="$1"

	# Count open & close parentheses; command ends when have enough closes.
	# Deleting every run of other characters leaves only the parentheses,
	# so the string lengths are the counts.
	typeset opens="${*//+([^\(])}" closes="${*//+([^\)])}"
	let PARENTHESIS_BALANCE=PARENTHESIS_BALANCE+${#opens}-${#closes}
	if ((PARENTHESIS_BALANCE < 0))
	then
		print "Unbalanced parenthesis in m4 command." 1>&2
		PARENTHESIS_BALANCE=0
	fi
	((PARENTHESIS_BALANCE == 0)) && closed=true

	# Escape each backtick with a backslash.
	line="${line//\`/\\\`}"
	# NOTE(review): presumably meant to escape apostrophes in the same
	# way; the quoting here is subtle -- confirm it does so under ksh93.
	line="${line//'/\\/'}"
	if $closed
	then
		# "--" keeps a line that begins with "-" from being parsed
		# as options to print.
		print -r -- "$line"
		print
		return 0
	fi
	print -rn -- "$line"
	return 1
}

# Convert the text of the document to M4web.
# Change headings to commands.  Make sure every command is on a single
# line, preceded and followed by a blank, so multimarkdown makes it its
# own paragraph.
#
# $1     - "true" or "false": whether headings become TOC macros
#          (_DOCTITLE, _CHAPTER, _SECTION, _SUBSECTION).
# stdin  - the Markdown document, filtered through "mmdtool body"
#          (presumably the document without its metadata header).
# stdout - the preprocessed text.
# Exits with status 1 on an underline with no heading text above it or
# on a second level-1 heading.
function m4preprocess {
	typeset use_toc="$1"
	# lastline holds one ordinary line back so that a following ===/---
	# underline can still turn it into a heading.  underline_header is
	# the prefix given to ----underlined headings: "#" until an
	# ===-underlined heading has been seen, "##" afterwards.  reset is
	# a macro call to emit just before the next heading.
	typeset line lastline underline_header='#' reset=""
	typeset titled=false
	typeset inm4command=false
	PARENTHESIS_BALANCE=0
	mmdtool body |
	while IFS="" read -r line
	do
		# If in the middle of an M4command, keep quoting it.
		if $inm4command
		then
			m4quote "$line" && inm4command=false
			continue
		fi
		# Convert underline headings to #-based headings
		if [[ $line == {3,}(=) ]]
		then
			if [[ -z "$lastline" ]]
			then
				print "Double underline found with no prior heading." 1>&2
				exit 1
			fi
			line="# $lastline"
			lastline=""
			underline_header='##'
		elif [[ $line == {3,}(-) ]]
		then
			if [[ -z "$lastline" ]]
			then
				print "Underline found with no prior heading." 1>&2
				exit 1
			fi
			line="$underline_header $lastline"
			lastline=""
		fi

		# The held-back line was not a heading after all; release it.
		[[ -n "$lastline" ]] && print -r -- "$lastline"
		lastline=""

		if [[ "$line" == +(#)* ]] && $use_toc
		then
			# Extract heading title and level
			typeset title="${line##+(#)}"
			integer level=$((${#line} - ${#title}))
			# Strip all surrounding blanks and any closing run of
			# "#" characters, not just a single character.
			title="${title##+([[:blank:]])}"
			title="${title%%+([[:blank:]#])}"

			# Extract section ID and language, if present.
			# Format: [language] Title [ID].  Language format: en or en-US.
			typeset lang=""
			if [[ $title == \[+([-A-Za-z])]* ]]
			then
				lang=${title%%]*}
				lang=${lang#\[}
				title=${title#*]}
				title=${title##+([[:blank:]])}
			fi
			typeset id=""
			if [[ $title == *\[[A-Za-z]+([A-Za-z0-9])] ]]
			then
				id=${title##*\[}
				id=${id%]}
				title=${title%\[*}
				# The ID was a suffix, so the blanks left behind
				# are at the end of the title.
				title=${title%%+([[:blank:]])}
			fi

			[ -n "$reset" ] && print -- "$reset" && print
			reset=""
			print
			case $level in
			    1)
				if $titled
				then
					print "Error: two document titles found." 1>&2
					exit 1
				fi
				print -- "\\_DOCTITLE(\\\`\\\`$title'')"
				print
				# Text after the title goes to the intro
				# diversion until the next heading.
				print "_DIVERT(DIV_INTRO)"
				reset="_DIVERT(DIV_CONTENT)"
				titled=true
				;;
			    2)
				print -- "\\_CHAPTER(\\\`\\\`$title'', \\\`\\\`$id'', \\\`\\\`$lang'')"
				;;
			    3)
				print -- "\\_SECTION(\\\`\\\`$title'', \\\`\\\`$id'', \\\`\\\`$lang'')"
				;;
			    4)
				print -- "\\_SUBSECTION(\\\`\\\`$title'', \\\`\\\`$id'', \\\`\\\`$lang'')"
				;;
			    *)	# Deeper headings are passed through verbatim.
				print -r -- "$line"
			esac
			print
		elif [[ "$line" == +(#)* ]]
		then
			# Heading but we're not using M4 TOC
			print
			print -r -- "$line"
			print
		elif [[ "$line" == \\_+([A-Z_])\(* ||
			"$line" == m4\\_+([a-z_])\(* ||
			"$line" == m4_+([a-z_])\(* ]]
		then
			# Handle a \_WEBM4_COMMAND() with backslash & parameters
			# or any m4_macro or m4\_macro
			print
			m4quote "$line" || inm4command=true
		elif [[ "$line" == _+([A-Z_])\(* ]]
		then
			# Handle an _WEBM4_COMMAND() without backslash
			print
			m4quote "\\$line" || inm4command=true
		elif [[ "$line" == _+([A-Z_]) ]]
		then
			# Handle an _WEBM4_COMMAND with no parameters
			print
			print -r "\\$line"
			print
			print
		elif [[ "$line" == \\_+([A-Z_]) || "$line" == m4\\_+([a-z_]) ||
			"$line" == m4_+([a-z_]) ]] 
		then
			# Handle an _WEBM4_COMMAND, m4_macro or m4\_macro
			# with no parameters
			print
			print -r "$line"
			print
			print
		elif [[ "$line" == m4_dnl* || "$line" == m4\\_dnl* ]]
		then
			# Drop it.
			print
		else
			# Ordinary text: hold it back in case the next line
			# is an underline.  Blank lines are emitted at once.
			lastline="$line"
			[[ -z "$lastline" ]] && print
		fi
	done
	# Flush the final held-back line, raw like every other text line.
	[[ -n "$lastline" ]] && print -r -- "$lastline"
}

# Turn a semicolon-separated list of index entries ($1) into one
# _INDEX(...) macro call per entry, each passed through
# html_escape_entities.  Always returns 0.
function render_index_entries {
	typeset entries="$1" entry
	integer piece=1
	while true
	do
		entry=$(print -- "$entries" | cut -d';' -f$piece -s)
		# cut -s prints nothing when there is no delimiter at all, so
		# a list with a single entry is taken whole.
		if (( piece == 1 )) && [[ -z "$entry" ]]
		then
			entry="$entries"
		fi
		# An empty field marks the end of the list.
		[[ -n "$entry" ]] || break
		html_escape_entities "_INDEX($entry)"
		(( piece += 1 ))
	done
	return 0
}


# Convert a semicolon-separated list of see-also index entries to macro calls.
# The format for an entry is: from-entry-->to entry
# $1 - the list of entries.
# $2 - optional name to prefix diagnostics with.  Locals of a calling
#      ksh93 function are not visible here, so when omitted the global
#      $document is tried, then the script name.
# Returns 0 on success, 1 if an entry lacks the "-->" separator.
function render_see_also_entries {
	typeset entries="$1" entry from to
	typeset source_name="${2:-${document:-$arg0}}"
	integer piece=1
	while entry=$(print -- "$entries" | cut -d';' -f$piece -s)
		# cut -s prints nothing when there is no delimiter at all,
		# so a single-entry list is taken whole.
		(( piece == 1 )) && [[ -z "$entry" ]] && entry="$entries"
		[[ -n "$entry" ]]
	do
		from=$(html_escape_entities "${entry%%-->*}")
		to=$(html_escape_entities "${entry##*-->}")
		# Without a "-->" both expansions yield the whole entry.
		if [[ "$to" == "$from" ]]
		then
			print -r -- "$source_name: SeeAlso value incorrectly formatted." 1>&2
			return 1
		fi
		print -r -- "_IXSEE(\`\`$to'', $from)"
		let piece++
	done
	return 0
}

# Convert _titles_ to use <cite> elements.
# Filter: reads text on stdin, writes the converted text to stdout.
# Two forms are rewritten, every time they occur on a line:
#   _Title_      at line start or after whitespace, and followed by
#                whitespace, a comma, a period or the end of the line;
#   )[_Title_]   a bracketed title directly after a closing parenthesis.
function correct_citation_markup {
	# The first substitution consumes the separator after a title, so
	# two titles divided by a single blank cannot both match in one
	# pass; loop until nothing more is replaced.  Each replacement
	# removes two underscores, so the loop terminates.
	sed -E -e ':again' \
	       -e 's&(^|[[:space:]])_([[:alnum:]][^_]*[[:alnum:]])_($|[[:space:]]|[,.])&\1<cite>\2</cite>\3&g' \
	       -e 't again' \
	       -e 's&\)\[_([[:alnum:]][^_]*[[:alnum:]])_\]&)[<cite>\1</cite>]&g'
}

# Convert the document in $1 from .md to .web format.
# The result is written to stdout: a generated-file banner, m4
# definitions and macro calls built from the document's header fields,
# then the converted body.
# Returns 1 if the file does not exist and 2 if it has no title.
# Exits the script with status 1 on an unknown Compile directive and
# with status 4 on a malformed SeeAlso header.
function format_document {
	typeset document="$1"
	if [ ! -f "$document" ]
	then
		print "$arg0: $document: not found." 1>&2
		return 1
	fi

	typeset title="$(mmdtool -h get title < "$document")"
	if [[ -z "$title" ]]
	then
		print -- "$document: Title not found." 1>&2
		return 2
	fi

	typeset summary="$(mmdtool -h get summary < "$document")"
	if [[ -z "$summary" ]]
	then
		print "$document: Warning: does not contain a summary." 1>&2
	fi

	typeset index_entry="$(mmdtool get index < "$document")"
	typeset see_also="$(mmdtool get seealso < "$document")"
	typeset features="$(mmdtool get compile < "$document")"
	typeset robots="$(mmdtool -h get robots < "$document")"
	typeset author="$(mmdtool -h get author < "$document")"
	typeset document_date="$(mmdtool -h get date < "$document")"
	# Declared on its own line so the variable stays local while the
	# exit status of mmdtool still selects the default below.
	typeset template_file
	template_file="$(mmdtool get template < "$document")" ||
		template_file=template.m4
	typeset version="$(mmdtool -h get version < "$document")"

	# Translate the space-separated Compile directives into flags.
	# Directive names are matched case-insensitively (typeset -l).
	typeset table_of_contents=false
	typeset m4_markup=false
	typeset inline_markup=false
	typeset -l feature
	for feature in $features
	do
		case "$feature" in
		    toc|contents)
			table_of_contents=true
			;;
		    inline)
			inline_markup=true
			;;
		    m4|m4markup)
			m4_markup=true
			;;
		    *)
			print "$feature: Unknown document feature." 1>&2
			exit 1
			;;
		esac
	done


	cat << EOF




	m4_dnl ####################################################
	m4_dnl THIS DOCUMENT IS AUTOMATICALLY GENERATED
	m4_dnl FROM THE CORRESPONDING .md FILE!
	m4_dnl Changes must be made to the .md file, not here.
	m4_dnl CHANGES MADE TO THIS DOCUMENT WILL BE OVERWRITTEN.
	m4_dnl ####################################################




	m4_include(DIR_INCLUDE/html.m4)
	m4_define(\`\`TEMPLATE_TITLE'', \`\`$title'')
	m4_define(\`\`DOCUMENT_VERSION'', \`\`$version'')
EOF
	$table_of_contents &&
		print "m4_include(DIR_INCLUDE/toc3.m4)"
	[[ -n "$summary" ]] && print "_DESCRIPTION(\`\`$summary'')"
	[[ -n "$author" ]] && print "_AUTHOR(\`\`$author'')"
	[[ -n "$document_date" ]] && print "_WRITTEN(\`\`$document_date'')"
	[[ -n "$robots" ]] && print "_ROBOTS(\`\`$robots'')"

	# The magic template name "-none-" emits a bare header instead of
	# including a template file.
	if [[ "$template_file" == "-none-" ]]
	then
		print "_HEADER(\`\`TEMPLATE_TITLE'', \`\`DOCUMENT_VERSION'')"
	else
		print "m4_include(_LOCATE_FILE(\`\`$template_file''))"
	fi
	print

	[[ -n "$index_entry" ]] && 
		render_index_entries "$index_entry"
	if [[ -n "$see_also" ]]
	then
		# The document name is supplied as an extra argument because
		# this function's locals are not visible inside the callee.
		render_see_also_entries "$see_also" "$document" || exit 4
	fi

	if $m4_markup || $table_of_contents
	then
		# Extract document, run it through multimarkdown, then
		# strip paragraph tags multimarkdown added around M4web commands.
		m4preprocess $table_of_contents < "$document" |
		(if $inline_markup
		then
			# Quote macro names, m4 open quotes, m4 close quotes.
			# Then undo any Markdown code-block markers that
			# were mistook for m4 open quotes.
			sed -E -e $'s/[[:space:]](_[A-Z][A-Z_]*\\()/ \\1/g' \
			    -e $'s/([^\\])``/\\1\\\\`\\\\`/g' \
			    -e "s/([^\\])''/\\1\\\\'\\\\'/g" \
			    -e $'s/`\\\\`\\\\`/```/'
		else
			cat
		fi) |
		correct_citation_markup |
		multimarkdown --snippet |
		sed -E -e 's&^<p>(_[A-Z_]+|m4_[a-z_]+)(.*)</p>$&\1\2&'
	else
		# Plain Markdown: no preprocessing needed.
		multimarkdown --snippet "$document"
	fi
}

# Target file named by -o/--output; empty means write to stdout.
OUTPUT=""

while getopts -a "$arg0" "$USAGE" option
do
	case "$option" in
		o)	OUTPUT="$OPTARG"
			;;
	esac
done

shift $((OPTIND - 1))

# At least one input file is needed; say so plainly rather than failing
# later on an empty file name.
if (($# < 1))
then
	print -- "$arg0: no input file specified." 1>&2
	exit 1
fi

sourcefile="$1"
if (($# > 1))
then
	# Several inputs are merged into one temporary document: the header
	# of the first file followed by the body of every file.
	sourcefile=$(mktemp "/var/tmp/$arg0.XXXXXX") || exit 1
	# Single-quoted so the path is expanded when the trap runs and
	# cannot break the command through characters in the file name.
	trap 'rm -f -- "$sourcefile"' EXIT
	mmdtool header < "$1" > "$sourcefile" || exit 1
	for file in "$@"
	do
		print
		mmdtool body < "$file" || exit 1
	done >> "$sourcefile"
fi

if [[ -n "$OUTPUT" ]]
then
	format_document "$sourcefile" > "$OUTPUT"
else
	format_document "$sourcefile"
fi

# Propagate the status of format_document.
exit $?
