#!/bin/ksh
######################################################################
# Purpose:	This script generates index.web or delta.web which can then
#		be run through the M4 macros to generate a corresponding
#		HTML file.
# Copyright:	Copyright 1997-2026 Perette Barella
#		All rights reserved.
######################################################################
VERSION='$Id: buildindex 359 2026-04-08 14:18:54Z perette $'

# Validate that ksh supports the modern/extended getopts format.
# Author: Perette Barella 
# Copyright 2018 Devious Fish.  All rights reserved.
# $Id: modern_ksh_check 19 2018-07-28 23:40:39Z perette $  


# Verify the running shell supports ksh93-style extended getopts
# (self-documenting optstrings with long-option support, as used by
# the $USAGE parsing below).  Probes by parsing a known long option
# and checking both the returned flag name and OPTARG-free status.
# Exits the whole script with status 1 on an outdated shell.
function modern_ksh_check {
	if [[ $(getopts '[-][12:abc]' flag --abc; print -- 0$flag) != "012" ]]
	then
		print -- "$arg0${arg0+: }Outdated Korn shell." 1>&2
		exit 1
	fi
}
# Escape HTML entities
# Author: Perette Barella
# Copyright 2020 Devious Fish.  All rights reserved.
# $Id: html_escape_entities 56 2020-01-04 01:21:35Z perette $

# Escape the HTML-special characters &, < and > in text.
# With no arguments, acts as a filter from stdin to stdout.
# With arguments, escapes "$*" and writes the result to stdout.
function html_escape_entities {
	if (($# > 0))
	then
		# Feed the joined arguments back through the filter form.
		print -r -- "$*" | html_escape_entities
		return
	fi
	# Ampersands must be rewritten first, or the entities produced
	# for < and > would themselves be re-escaped.
	sed -e 's/&/\&amp;/g' -e 's/</\&lt;/g' -e 's/>/\&gt;/g'
}

# dir_name - dirname implemented as a shell function.
# Author: Perette Barella
# Copyright 2020 Devious Fish.  All rights reserved.
# Why: In testing, performed >> 10x faster than external program.
# $Id$

# dir_name - dirname implemented as a shell function.
# Why: in testing, performed >> 10x faster than the external program.
# $1 = pathname.  Prints the directory portion: "." when there is no
# slash, "/" for entries directly under the root.
function dir_name {
        typeset dir=${1%/*}
        [[ "$1" = "$dir" ]] && dir="."
        # Bug fix: "/file" strips to the empty string; the directory
        # of such a path is really "/" (dirname(1) agrees).
        [[ -z "$dir" && "$1" == /* ]] && dir="/"
        printf '%s\n' "$dir"
}

# base_name - basename implemented as a shell function.
# Author: Perette Barella
# Copyright 2020 Devious Fish.  All rights reserved.
# Why: In testing, performed >> 10x faster than external program.
# $Id$

# base_name - basename implemented as a shell function.
# Why: in testing, performed >> 10x faster than the external program.
# $1 = pathname, $2 = optional suffix (shell pattern) to strip.
function base_name {
        typeset base=${1##*/}
        # Bug fix: the suffix can be stripped with a plain expansion;
        # the old eval broke (or executed code) when $2 contained
        # quotes or other shell metacharacters.
        (( $# == 2 )) && base=${base%$2}
        printf '%s\n' "$base"
}


# Program name, used in diagnostics and the usage text.
arg0="$(basename "$0")"

# Refuse to continue on shells without ksh93 extended getopts.
modern_ksh_check

# Self-documenting usage string in ksh93 getopts format; rendered as a
# manual page by "--man" etc.  (Typos fixed: hierarchical, URLs.)
USAGE=$'
[-1?'$VERSION$']
[+NAME?'$arg0$' - build index pages, sitemaps, recently changed page lists, and indexes.]
[+DESCRIPTION?\b'$arg0$'\b assembles details of web pages and presents them in various forms.]
[c:contents?Create \bindex.web\b with a list of local files.  This is the default mode.]
[d:deltas?Create \bdelta.web\b with a list of recently-modified pages.]
[i:index?Create \bindexpage.web\b with a content index (like from a book).]
[I:intermediate?With \b-i\b, produce an intermediate site index.  Without \b-i\b, merge a list of intermediates to produce a combined site index.]
[l:list?Create \bsitelist.web\b with a \vnon\v-hierarchical list of site documents.]
[m:map?Create \bsitemap.web\b with a hierarchical list of site documents.]
[s:sitemap?Create \bsitemap.xml\b with XML sitemap to help search engines.]
[+MODES vs OPTIONS?The above flags choose an operation mode.  Only the last option applies (except \b-i\b and \b-I\b).  These additional flags control optional behaviors; not all options apply to all modes:]
[C:count?Result limit for delta mode.]#[count]
[D:descriptions?Include descriptions/summaries.]
[F:fully-qualify?Fully qualify URLs when indexing.]
[h?For directory lists, if no \b.web\b files are found, list all files.]
[o:output?Choose a file to write to, instead of the defaults shown above.]:[output-file]
[O:outlines?Include outlines of structured documents in maps.]
[r:reason?For deltas, include reason for last change, as indicated in source control.]
[S:suppress-header?Write output to a \b.m4\b file, and only include data, omitting page setup.]
[t:timeframe?Choose duration delta mode finds.]#[days]
[+?\b'$arg0$'\b verifies the publication status of documents by reading the \b_ROBOTS\b macro.  Documents marked as \bunlisted\b or \bnoarchive\b are omitted from all lists and indexes.]
[+?The pages produced by \b'$arg0$'\b are rudimentary, if useful.  For simple lists, the \b_LINK_ALL\b macro may be given a list or glob, enabling more control over the style and content.  Other times, it may be preferable to use \b--suppress-header\b to create a \b.m4\b file with the index, and include that in a \b.web\b page styled to your own preferences using \bm4_include\b.]
[+EXIT STATUS?0 on success, non-0 on error.]
[+SEE ALSO?\bhtml2m4\b(1), \bmmdtool\b(1)]

files ...

[-author?Perette Barella <perette@deviousfish.com>]
'

# Forward mmdtool invocations to the copy under the site's
# Include/bin directory, passing all arguments through unchanged.
function mmdtool {
	${WEBBASE}/Include/bin/mmdtool "$@"
}


# Extract data from a file.
# $1 = what to extract (e.g. title, description, index, domain,
# robots, outline), $2... = filenames.
# Delegates to web2html -x; its exit status is returned.
function perform_extraction {
	${WEBBASE}/Include/bin/web2html -x "$@"
}

# Extract data from a file and get rid of whitespace.
# $1 = what to extract, $2... = filenames (same order as
# perform_extraction; the original header comment had them reversed).
function extract_cleaned {
	typeset data
	data=$(perform_extraction "$@")
	typeset status=$?
	# Deliberately unquoted: word-splitting collapses runs of spaces,
	# tabs and newlines down to single spaces.
	print -r -- $data
	return $status
}


# Determine a human-readable title for a document.
# $1 = filename.  Prints the title (HTML-escaped where the source may
# contain markup-significant characters) on stdout.
# Returns 1 if the file is unreadable, its format is unrecognized, or
# no title can be determined.
function get_page_title {
	typeset file="${1#./}" name
	[[ ! -r "$file" ]] && return 1
	typeset basename=$(base_name "$file")
	escape=true	# NOTE(review): set but never read anywhere in this script
	case "$basename" in
	    *.md)
		# Markdown: title comes from the metadata header.
		name=$(mmdtool -h get Title < "$file")
		;;
	    *.web)
		name=$(extract_cleaned title "$file") || return 1
		;;
	    *.txt.gz|*.txt.Z)
		# Compressed text: use the first non-blank, non-PGP line.
		name=$(zegrep -v $'^[ \t]*$|BEGIN PGP' "$file" | head -1)
		# Bug fix: the test was inverted ([[ -z ... ]]), which left
		# real first lines unescaped and produced a dangling
		# "basename: " title for empty files.  Empty names now fall
		# through to the failure return below.
		[[ -n "$name" ]] && name=$(html_escape_entities "$basename: $name")
		;;
	    *.txt|*.do)
		# Try metadata Title, then a de-"Re:"d Subject, then the
		# first non-blank line of the body.
		name=$(mmdtool get Title < "$file")
		[[ -z $name ]] && name=$(mmdtool get Subject < "$file" |
			sed -e 's/[rR]e: *//g')
		[[ -z "$name" ]] &&
			name=$(egrep -v $'^[ \t]*$|BEGIN PGP' "$file" | head -1)
		if [[ -z "$name" || "${name:0:2}" == "#!" ]]
		then
			name=$(head -2 "$file")
		fi
		[ "$name" != "" ] && name=$(html_escape_entities "$name")
		;;
	    *.pdf)
		# Scrape the XMP dc:title metadata out of the PDF stream.
		name=$(strings "$file" | grep 'dc:title' | sed \
			-e 's/.*<dc:title[^>]*>//' \
			-e 's&</dc:title[^>]*>.*&&' \
			-e 's/<[^>]*>//g' \
			-e $'s/^[ \t]*//'g \
			-e $'s/[ \t]*$//g')
		[ "$name" != "" ] && name=$(html_escape_entities "$name (PDF)")
		;;
	    *)
		print "$arg0: Don't know file format for $basename." 1>&2
		return 1
		;;
	esac
	[[ -z "$name" ]] && return 1
	# Unquoted on purpose: collapses any embedded runs of whitespace.
	print -r -- $name
	return 0
}

# Map a source filename to the location it is published at:
# .web sources become .html pages; everything else is served as-is.
function get_page_location {
	typeset source="$1"
	if [[ "$source" == *.web ]]
	then
		print -- "${source%.web}.html"
	else
		print -- "$source"
	fi
}

# Print the current directory's path relative to $WEBBASE (an empty
# string when already at the web root).  Returns 1 if $WEBBASE cannot
# be entered.  Safe to call from command substitution: the cd happens
# in the subshell.
function get_directory_offset {
	# /bin/pwd resolves symlinks/automounter prefixes consistently;
	# the built-in PWD depends on which way the path was accessed.
	typeset here_path=$(/bin/pwd)
	cd "$WEBBASE" || return 1
	typeset base_path=$(/bin/pwd)
	# Bug fix: the old sed substitution interpolated $base_path into a
	# regex, breaking on paths containing sed/regex metacharacters.
	# A quoted prefix-strip expansion is exact and spawns no process.
	typeset offset="${here_path#"$base_path"}"
	printf '%s\n' "${offset#/}"
}


# Filter a list of filenames (one per line on stdin): drop .web files
# that have a corresponding .md source, since those .web files are
# generated.  The .md names themselves pass through untouched; later
# stages (format_list) map *.md back to *.web.
function handle_generated_web_files {
	typeset filename
	# Bug fix: IFS= and -r preserve leading whitespace and backslashes
	# in filenames; the bare 'read' mangled both.
	while IFS= read -r filename
	do
		if [[ $filename == *.web ]]
		then
			typeset mdfile="${filename%.web}.md"
			# -e (exists) replaces the obsolete -a test.
			[[ -e "$mdfile" ]] && continue
		fi
		printf '%s\n' "$filename"
	done
}

# Add last-change date and, if requested (-r) and found, the reason
# for the last change as recorded in source control.
# $1 = filename.  Emits HTML spans on stdout; silently does nothing
# when svn is absent or the tree isn't a working copy.
function add_update_info {
	typeset file="$1" change updated
	# Requires Subversion is installed and the project be in a repository.
	whence -q svn || return 0
	# NOTE(review): bare return propagates the failed test's status (1),
	# unlike the explicit 'return 0' above -- confirm that's intended.
	[[ -d "$WEBBASE/.svn" ]] || return
	updated=$(get_file_date "$file" "%d %b %Y") || return 1
	print "<span class='changedate'>Last updated: $updated.</span>"
	# Stop here unless -r (change reasons) was requested.
	$CHANGEREASON || return 0
	# Generated pages keep their history on the .md source; query it.
	typeset mdname="${file%.web}.md"
	[[ -r "$mdname" ]] && file="$mdname"
	# Last log entry, minus header lines and separator rules.
	change=$(svn log -l1 "$file" | tail +3 | grep -v -- "-------")
	if [ "$change" != "" ]
	then
		print -n "<span class='changedescription'> Changes: \`\`"
		html_escape_entities "$change"
		print "''</span>"
	fi
	return 0
}

# Emit the raw content-index entries for the given .web files,
# stripped of markup and stray whitespace around the | separators.
# Unless -F (fully-qualify) was given, same-site URLs are made
# site-relative by removing the scheme and our own domain.
function extract_content_index {
	# NOTE(review): in some shells 'typeset var=$(cmd)' masks the
	# command's exit status (cf. ShellCheck SC2155) -- verify the
	# '|| return 1' actually fires under the target ksh.
	typeset domain=$(perform_extraction domain) || return 1
	set -o pipefail
	perform_extraction index "$@" |
	grep -v $'^[ \t]*$' |
	sed -e $'s/^[ \t]*//' -e 's/ |/|/g' -e 's/| /|/g' -e 's/<[^>]*>//g' |
	(
		if $FULLYQUALIFY
		then
			cat
		else
			# Collapse /./ and strip our own scheme+domain
			# (case-insensitive host match).
			sed -E -e 's&/./&/&' -e "s&^https?://${domain//./\\.}/*/&&i" 
		fi
	)
	# Capture the pipeline status before turning pipefail back off.
	typeset status=$?
	set +o pipefail
	return $status
}


# KORN SHELL DEPENDENT FUNCTION
# (Fork order of pipe: ksh runs the last pipeline stage in the current
# shell, so variables set inside the while loop survive for the
# closing-tag loop at the bottom.  Bash would lose them.)
# Reads tab-separated "url<TAB>title<TAB>cat|cat|..." entries on stdin
# and renders them as a nested <UL> content index on stdout.
function format_content_index {
	typeset url title index lastgroup
	typeset -i lastlevel=0 currentlevel=0
	print '<UL ID="contentindex">'
	# Pad the | separators so sorting compares hierarchy levels before
	# their contents, sort case-insensitively by index path, title,
	# then url (tab-delimited fields), and remove the padding again.
	sed 's/|/  |/g' |
	sort -u -f -t"	" -k 3,3 -k2,2 -k1,1 |
	sed 's/  |/|/g' |
	# NOTE(review): IFS=$'$\t' splits on a literal '$' as well as tab;
	# plain $'\t' was probably intended -- confirm.
	while IFS=$'$\t' read -r url title indexline
	do
		# Break comma-separated index entry into heirarchy levels
		print -r -- "$indexline" | IFS="|" read -A index
		# Count the populated hierarchy levels of this entry.
		integer currentlevel=0
		while [[ -n ${index[currentlevel]} ]]
		do
			let currentlevel++
		done
		# Find the first heirarchy level that's different
		typeset backto=0
		while ((currentlevel > backto)) &&
			[[ "${lastgroup[$backto]}" == "${index[$backto]}" ]]
		do
			let backto++;
		done
		# Output end-list tags until we get back to the changed level
		while ((lastlevel > backto))
		do
			print "</UL>"
			let lastlevel--
			lastgroup[$lastlevel]=""
		done
		# Output new/different heirarchy levels if necessary
		while ((lastlevel < currentlevel))
		do
			print "<LI>${index[$lastlevel]}"
			print "<UL>"
			lastgroup[$lastlevel]="${index[$lastlevel]}"
			let lastlevel++
		done
		# Output the index entry.  SEE/ALSO are cross-reference
		# pseudo-URLs, not links.
		if [[ "$url" == "SEE" ]]
		then
			print "<LI><I>See:</I> $title"
		elif [[ "$url" == "ALSO" ]]
		then
			print "<LI><I>See also:</I> $title"
		else
			print "<LI><A HREF=\"$url\">$title</A>"
		fi
	done
	# Close any open nesting plus the outer contentindex list
	# (lastlevel survives the loop only because of ksh fork order).
	while ((lastlevel >= 0))
	do
		let lastlevel--;
		print "</UL>"
	done
}


# Test whether a file should appear in generated listings.
# $1 = filename.  Returns 0 if publishable; 1, with a diagnostic on
# stderr, if its robots directive can't be read or marks it hidden.
function file_is_published {
	typeset file="$1"
	typeset -l listed	# -l: lowercase for case-insensitive compare
	if ! listed=$(extract_cleaned robots "$file")
	then
		print "$file: Can't read robots directive." 1>&2
		return 1
	# NOTE(review): the pattern matches *noindex*, but this message and
	# the usage text both say "noarchive" -- one side looks wrong; confirm
	# which robots keyword should suppress listing.
	elif [[ $listed == unlisted || $listed == *noindex* ]]
	then
		print "$file: file is unlisted or noarchive, ignoring." 1>&2
		return 1
	fi
	return 0
}

# A filter that removes unpublished files; reads one filename per line.
function strip_unpublished_from_list {
	typeset candidate
	# Pass through only the names that file_is_published accepts.
	while read -r candidate
	do
		if file_is_published "$candidate"
		then
			print -r -- "$candidate"
		fi
	done
}

# Get file last-change date.  Use .md file instead of .web if it exists.
# Report a file's last-change timestamp, formatted per $2 (defaults to
# the W3C/sitemap UTC form).  For generated .web pages, the matching
# .md source's timestamp is used when it exists.
function get_file_date {
	typeset target="$1"
	typeset fmt="${2:-%Y-%m-%dT%H:%M:%SZ}"
	if [[ $target == *.web ]]
	then
		typeset markdown="${target%.web}.md"
		[[ -f $markdown ]] && target="$markdown"
	fi
	date -u -r"$target" "+$fmt"
}
	


# Render files as HTML <LI> entries with linked titles and, depending
# on mode and options, descriptions, outlines, and change information.
# Arguments: the files to list.  Skips unreadable files, robots.txt,
# unpublished documents, and the output file itself.  Honors the
# RESULTLIMIT cap.  Globals read: mode, destfile, RESULTLIMIT,
# DESCRIPTIONS, OUTLINES.
function format_list
{
	typeset class="" pagetitle location description
	typeset -l listed
	[ "$mode" = "deltas" ] && class=" CLASS=padded"
	integer count=0
	for file in "$@"
	do
		# Generated pages are published under their .web name.
		[[ $file == *.md ]] && file="${file%.md}.web"
		[[ ! -r "$file" ]] && continue
		[[ "$file" == "./robots.txt" ]] && continue
		# Don't list the file we are currently writing.
		[[ "$(dir_name "$file")" == "$(dir_name "$destfile")" &&
		   "$(base_name "$file")" == "$(base_name "$destfile")" ]] && continue
		file_is_published "$file" || continue
		
		let count+=1
		(( count > RESULTLIMIT )) && break
		if ! pagetitle=$(get_page_title "$file")
		then
			print "$file: can't determine page title." 1>&2
			pagetitle=$(basename "$file")
		fi
		if ! location=$(get_page_location "$file")
		then
			print "$file: can't determine location." 1>&2
			continue
		fi
		
		print "<LI$class><A HREF=\"${location}\">\`\`${pagetitle:-${location}}''</A>"

		# Bug fix: this tested $DESCRIPTION, which is never set -- the
		# empty expansion made the condition vacuously true, so
		# descriptions were emitted even without -D.  The -D option
		# sets DESCRIPTIONS.
		if $DESCRIPTIONS
		then
			description=$(perform_extraction description "$file")
			[[ -n "$description" ]] &&
				print "<span class='summary'>$description</span>"
			$OUTLINES && perform_extraction outline "$file" && print
		fi

		[ "$mode" = "deltas" ] && add_update_info "$file"
	done
}


# Recursively render a hierarchical site map as nested <UL> lists.
# $1 = directory to map, $2 = optional id attribute for the top-level
# <UL> (recursive calls pass no id).  Reads global ALLFILES; uses
# format_list for the per-directory document entries.
function recurse_map {
	typeset dirname="$1" listid="$2"
	typeset file filelist title
	[[ -n $listid ]] && listid=" id='$listid'"

	# Candidate documents here, excluding index pages and source
	# control/thumbnail artifacts.  NOTE(review): parses ls output;
	# breaks on filenames containing whitespace or globs.
	filelist=$(ls -1d "$dirname"/*.?(web|txt|do|pdf|txt.gz) 2>/dev/null |
		egrep -v '/index.web$|/index.html$|/RCS$|/SCCS$|/Thumbnails$')
		
	typeset empty=false
	[[ -z $filelist ]] && empty=true

	if ! $empty
	then
		print "<UL$listid>"
		# Unquoted $filelist is deliberate: word-splitting turns the
		# newline-separated names into arguments.
		$ALLFILES && format_list $filelist
	elif $ALLFILES
	then
		# No recognized documents: fall back to listing every plain
		# file by bare name.
		for file in "$dirname"/*
		do
			[ -d "$file" ] && continue
			typeset base=$(basename "$file")
			# Open the list lazily, only once something is listed.
			$empty && print "<UL$listid>"
			print "<LI><A HREF=\"$file\">$base</A>"
			empty=false
		done
	fi

	# Descend into subdirectories, preferring each index page's title
	# for the link text.
	for file in "$dirname"/*
	do
		[[ ! -d "$file" ]] && continue
		typeset base=$(basename "$file")
		[[ $base == "RCS" || $base == "SCCS" ]] && continue
		[[ "$base" == "Thumbnails" ]] && continue
		if [[ -f "$file/index.web" ]] &&
			title="$(get_page_title "$file/index.web")"
		then
			$empty && print "<UL$listid>"
			print "<LI><A HREF=\"$file/index.html\">$title</A>"
		elif [[ -f "$file/index.html" ]]
		then
			$empty && print "<UL$listid>"
			print "<LI><A HREF=\"$file/index.html\">$base</A>"
		else
			# Index-less directories appear only when showing all.
			! $ALLFILES && continue
			$empty && print "<UL$listid>"
			print "<LI>$base"
		fi
		empty=false
		recurse_map "$file"
	done
	# Close the list, or emit a placeholder for an empty directory.
	$empty && $ALLFILES && print "<UL$listid><LI>(empty)</UL>"
	! $empty && print "</UL>"
}


# Create an XML sitemap file with URLs and last-mod dates.
# Emits <url> entries for all publishable documents under the current
# directory; the <urlset> wrapper is written by the caller.
function format_sitemap {
	typeset diroffset=$(get_directory_offset)
	typeset domain=$(perform_extraction domain) || exit 1
	[ "$diroffset" != "" ] && domain="$domain/$diroffset"
	
	# Don't tell search engines about things robots.txt disallows.
	# NOTE(review): the first sed expression '&/&&' is not a valid sed
	# command (no leading 's'); perhaps 's&/$&&' -- strip a trailing
	# slash -- was intended.  As written, sed errors whenever a
	# Disallow line survives the greps.  Also, find -name matches
	# basenames, so the slash-bearing patterns built here are unlikely
	# to prune anything; -path may have been intended.
	typeset prunes=""
	if [ -r "${WEBBASE}/robots.txt" ]
	then
		prunes=$(grep -i "^ *disallow:" "${WEBBASE}/robots.txt" |
			 cut -d: -f2 | grep -vw / |
			sed -e '&/&&' -e 's/^/-name/' -e 's/$/ -prune -o /')
	fi
	typeset filename
	# eval re-splits the $prunes text into individual find arguments.
	eval "TZ='UTC:0' find ." $prunes "\( -name '*.web' -o -name '*.txt' -o \
			     -name '*.ics' -o \
			     -name '*.do'  -o -name '*.pdf' \) -type f -print" |
	strip_unpublished_from_list |
	while read -r filename
	do
		typeset outfile filedate
		case $filename in
		    *.web)
			# .web sources publish as .html pages.
			typeset dir=$(dir_name "$filename")
			typeset base=$(base_name "$filename" .web)
			outfile="http://$domain/$dir/$base.html"
		        ;;
		    *)
			outfile="http://$domain/$filename"
			;;
		esac
		filedate=$(get_file_date "$filename") || continue
		print "<url><loc>$outfile</loc><lastmod>$filedate</lastmod></url>"
	done | sed -e 's&//&/&g' -e 's&/\./&/&g' -e 's&http:/&http://&'
	# The final sed collapses doubled and ./ path components, then
	# restores the double slash that follows "http:".
}



# --- Configuration defaults (environment- and option-overridable) ---
WEBBASE="${WEBBASE:=$HOME/Web}"		# root of the web source tree
INCLUDE="${WEBBASE}/Include"
timeframe=30				# -t: days of history for delta mode

DESCRIPTIONS=false			# -D: include descriptions/summaries
OUTLINES=false				# -O: include document outlines

mode=TOC				# operation mode; last mode flag wins
ALLFILES=true				# cleared by -h: stick to known formats
target=web				# output flavor: web page, m4 data, dat
destfile=""				# -o: explicit output file
intermediate=false			# -I: intermediate index processing
RESULTLIMIT=999999			# -C: cap on listed results
CHANGEREASON=false			# -r: include change reasons from svn
FULLYQUALIFY=false			# -F: fully qualify URLs

# Parse options with ksh93 self-documenting getopts driven by $USAGE.
while getopts -a "$arg0" "$USAGE" option
do
	case "$option" in
	    c)
		mode=TOC
		;;
	    C)
		RESULTLIMIT="$OPTARG"
		;;
	    d)	
		mode=deltas
		;;
	    D)
		DESCRIPTIONS=true
		;;
	    F)
		FULLYQUALIFY=true
		;;
	    h)
		ALLFILES=false
		;;
	    i)
		mode=index
		;;
	    I)
		# -I before -i means "merge intermediates"; after -i it
		# switches the index output to intermediate (.dat) form.
		if [[ "$mode" == "TOC" ]]
		then
			mode=indexmerge
		else
			target=dat
		fi
		intermediate=true
		;;
	    l)
		mode=list
		;;
	    m)	
		mode=map
		;;
	    o)
		destfile="$OPTARG"
		;;
	    O)
		OUTLINES=true
		;;
	    r)
		CHANGEREASON=true
		;;
	    t)	
		timeframe="$OPTARG"
		;;
	    s)
		mode=sitemap
		;;
	    S)
		target=m4
		;;
	esac
done
shift $((OPTIND - 1))

# Per-mode setup: default output file, page title, the file set to
# process, the robots policy of the generated page, and a preamble.
robots="FOLLOW,NOINDEX,NOARCHIVE"	# generated lists default to unindexed
case "$mode" in
    TOC)
	[ "$destfile" = "" ] && destfile=index.$target
	title="${TOCNAME:-Contents}"
	# Local documents plus immediate subdirectories' index pages.
	filelist=$(print *.?(web|txt|do) */index.web)
	robots="FOLLOW,INDEX,NOARCHIVE"	# tables of contents may be indexed
	message=""
	;;
    index|indexmerge)
	[ "$destfile" = "" ] && destfile="indexpage.$target"
	title="${TOCNAME:-Alphabetical Content Index}"
	filelist=$(find . -name '*.web' | strip_unpublished_from_list) ;
	message=""
	;;
    deltas)
	[ "$destfile" = "" ] && destfile=delta.$target
	title="${TOCNAME:-Recent changes}"
	# Recently-modified sources, with generated .web files collapsed
	# onto their .md originals.
	filelist=$(find . \( -name '*.web' -o \
	                     -name '*.md' -o \
			     -name '*.txt' -o \
			     -name '*.do' \) -mtime -$timeframe |
		   handle_generated_web_files)
	# Re-order newest first (relies on shell-word-safe filenames).
	[ "$filelist" != "" ] &&
		filelist=$(ls -1t $filelist)
	message="<P><I>Documents listed here changed within $timeframe days
prior to $(date '+%Y-%m-%d'), which is when this list was compiled.</I>"
	;;
    list)
	[ "$destfile" = "" ] && destfile=sitelist.$target
	title="${TOCNAME:-List of Contents}"
	filelist=$(find . \( -name '*.web' -o \
			     -name '*.txt' -o \
			     -name '*.do' \) )
	message="";
	;;
    map)
	# Map and sitemap modes find their own files while recursing.
	[ "$destfile" = "" ] && destfile=sitemap.$target
	title="${TOCNAME:-Site map}"
	message=""
	;;
    sitemap)
	[ "$destfile" = "" ] && destfile=sitemap.xml
	title="${TOCNAME:-Site map}"
	message=""
	;;
esac



# Write the document header.  <<# is ksh93's indentation-stripping
# here-document (like <<- but removes the common leading whitespace).
if [[ "$mode" == "sitemap" ]]
then
	# XML sitemap wrapper; the matching </urlset> is emitted after
	# generation below.
	cat > "$destfile" <<# EOF
		<?xml version="1.0" encoding="UTF-8"?>
		<urlset xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
		 xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9
		 http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd"
		 xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
	EOF
elif [[ "$target" == "web" ]]
then
	# Full .web page preamble: m4 includes, header macro with build
	# provenance, robots policy, and title.
	cat > "$destfile" <<# EOF
		m4_include(DIR_INCLUDE/html.m4)
		m4_include(DIR_TOP/siteinfo.m4)
		_HEADER($title, \`\`$(basename $destfile) built $(TZ=UTC0 date '+%Y-%m-%d %H:%M:%S') by $arg0" '$Revision: 1.27 $' "'')
		_ROBOTS(\`\`$robots'')
		<H2>$title</H2>
		$message
	EOF

	# Styling used by the entries these modes produce.
	if [[ "$mode" == "map" || "$mode" == "deltas" || "$mode" == "list" ]]
	then
		cat >> "$destfile" <<# EOF
			_STYLE(\`\`
				LI.padded {
					 margin-bottom: 1ex;
					 margin-top: 1ex;
				}
				SPAN.summary {
					font-style: italic;
				}
				SPAN.summary:before {
					content: " - ";
				}
			'')
		EOF
	fi
elif [[ "$target" == "m4" ]]
then
	# Bare data file: just truncate; no page setup at all.
	> "$destfile"
fi


# Generate the body for the requested mode and append it to the
# destination file (whose header was written above).
if [[ "$mode" == "map" ]]
then
	recurse_map . sitemap
elif [[ "$mode" == "sitemap" ]]
then
	format_sitemap
	# Bug fix: this was guarded by [[ $target == "html" ]], but target
	# is never "html" (only web/m4/dat), so the XML was left
	# unterminated.  The opening <urlset> is written unconditionally
	# for sitemap mode, so always close it.
	print "</urlset>"
elif [[ "$mode" == "index"  ]]
then
	if $intermediate
	then
		extract_content_index $filelist
	else
		extract_content_index $filelist | format_content_index | sed 's/,/, /g'
	fi
elif [[ "$mode" == "indexmerge" ]]
then
	cat "$@" | format_content_index | sed 's/,/, /g'
else
	[[ "$mode" == "deltas" ]] && print -- "<UL ID='recentchanges'>"
	[[ "$mode" == "list" ]] && print -- "<UL ID='sitemap'>"
	format_list $filelist | (
		if $FULLYQUALIFY
		then
			# Bug fix: was '|| return 1' -- return is invalid
			# outside a function; exit just leaves this subshell.
			typeset domain=$(perform_extraction domain) || exit 1
			sed -E -e $"s&href=\"\\./&href=\"https://${domain}/&i"
		else
			sed -E 's&href="./&href="&i'
		fi
	)
	print -- "</UL>"
fi >> "$destfile"

exit 0

