#!/bin/ksh
# Convert Tuck chapters into epub format.
# Author: Perette Barella
# Copyright 2020 Devious Fish.  All rights reserved.


# Revision identifier ($Id$ keyword); spliced into the usage text below.
VERSION='$Id: tuck2epub 62 2021-08-09 18:49:47Z perette $'
# Default output file; replaced by the -o/--output option in the main section.
OUTPUT="tuck.epub"
# Directory searched for chapters when no source paths are given.
TUCK="$HOME/Downloads/Smut/tv/Tuck"

# Script name: prefixes diagnostics and is handed to getopts -a.
arg0=$(basename "$0")
# Option specification / manual text passed to getopts in the main section.
# The $'...' string is closed and reopened around $VERSION, $TUCK and $OUTPUT
# so that their values are expanded into the text.
USAGE=$'
[-1?'$VERSION$']
[+NAME?tuck2epub - convert Tuck chapters into a epub format]
[+DESCRIPTION?\b\f?\f\b converts and combines chapters from The Ellen Hayes story
\bSaga of Tuck\b from text files into an epub file.  Formatting is adjusted to
improve readability.]
[o:output?Specify output location]:[file]
[+EXIT STATUS?0 on success, non-0 on error.]
[+FILES?The following are defaults:]
{
  [+'$TUCK$'?Default location for text chapters.]
  [+'$OUTPUT$'?Default output file.]
}

source paths ...

[-author?Perette Barella <perette@barella.org>]
'

# Korn shell/zsh require
# Author: Perette Barella
# Copyright 2018 Devious Fish.  All rights reserved.
# $Id: require 61 2021-08-09 18:49:07Z perette $

function require {
	# Verify that every named command is on the PATH.
	# Arguments: each is "command" or "command:name to report"; the text
	#            before the last colon is looked up, the text after the
	#            first colon is shown in the diagnostic.
	# Exits the shell with status 1 if anything is missing, after reporting
	# all missing items on stderr; otherwise returns 0.
	typeset spec
	typeset -i missing=0
	for spec in "$@"
	do
		whence -p "${spec%:*}" >/dev/null 2>&1 && continue
		print -- "$arg0${arg0+: }${spec#*:} not found; please install." 1>&2
		missing=1
	done
	if (( missing != 0 ))
	then
		exit $missing
	fi
	return 0
}

# Validate that ksh supports the modern/extended getopts format.
# Author: Perette Barella 
# Copyright 2018 Devious Fish.  All rights reserved.
# $Id: modern_ksh_check 19 2018-07-28 23:40:39Z perette $  


function modern_ksh_check {
	# Probe getopts in a subshell with an extended-format option string and
	# a long option.  The check passes only when the probe prints "012",
	# i.e. when getopts set flag to 12 for --abc.
	# Exits the shell with status 1 (message on stderr) when the probe fails.
	typeset probe
	probe=$(getopts '[-][12:abc]' flag --abc; print -- 0$flag)
	[[ "$probe" == "012" ]] && return 0
	print -- "$arg0${arg0+: }Outdated Korn shell." 1>&2
	exit 1
}
# Cat a possibly compressed file
# Author: Perette Barella
# Copyright 2020 Devious Fish.  All rights reserved.
# $Id: cat_possibly_compressed 56 2020-01-04 01:21:35Z perette $


function cat_possibly_compressed {
	# Write the contents of each named file to stdout, decompressing it
	# first when its extension indicates a known compression format.
	# Arguments: one or more file paths.
	# Returns:   0 if every file was read, otherwise the status of the last
	#            command that failed.  A missing decompressor makes
	#            require exit the shell.
	typeset file
	typeset result=0
	for file
	do
		# Classify by the final path component only: a dot in a directory
		# name (./chapter, /tmp/tmp.abc/chapter) is not a file extension
		# and must not trigger the "Unknown type" warning.
		case "${file##*/}" in
		    *.bz2)
			require bunzip2
			bunzip2 -c "$file" || result=$?
			;;
		    *.gz)
			require gunzip
			gunzip -c "$file" || result=$?
			;;
		    *.xz)
			require unxz
			unxz -c "$file" || result=$?
			;;
		    *.Z)
			require uncompress
			uncompress -c "$file" || result=$?
			;;
		    *.txt|*.do|*.md|*.rst|*.adoc)
			cat "$file" || result=$?
			;;
		    *.*)
			print "$file: Warning: Unknown type, treating as text." 1>&2
			cat "$file" || result=$?
			;;
		    *)
			# Plain old text file with no extension
			cat "$file" || result=$?
			;;
		esac
	done
	return $result
}

# Filter out/fix carriage returns in files.
# - CR at end of line: remove.
# - CR mid-line: convert to LF.
# Author: Perette Barella
# Copyright 2020 Devious Fish.  All rights reserved.
# $Id: filter_returns 56 2020-01-04 01:21:35Z perette $

function filter_returns {
	# stdin -> stdout filter.  A carriage return at the end of a line is
	# dropped; any other carriage return is replaced by a line break.
	typeset cr=$'\r' nl=$'\n'
	sed -e "s/${cr}\$//" -e "s/${cr}/\\${nl}/g"
}
######################################################################
# Function:	create_smart_quotes
# Purpose:	Modify plain-ascii quotes and apostrophes into
#		their "proper", opposing characters.
#		Ellipsis translated to the corresponding character, retaining
#		surrounding whitespace.
#		Double-dashes become em-dashes with optional whitespace removed.
#		Single dashes separated from words by spaces become en-dashes.
# Limitations:	Single-quoted things and/or leading apostrophes don't get
#		fully translated because it's impossible to distinguish:
#		'Just 'cuz,' she said.
# Author:	Perette Barella
# Version:	$Id: create_smart_quotes 90 2023-11-07 16:34:49Z perette $
#---------------------------------------------------------------------
function create_smart_quotes
{
	# stdin -> stdout filter; takes no arguments.
	# The rules below run in order on each line, so later rules see the
	# output of earlier ones.  Expressions that must match an apostrophe are
	# written in double quotes (with backslashes doubled where needed);
	# all others are single-quoted.
	# NOTE: the patterns use \s, so sed must accept it under -E.
	sed -E \
		-e '# Use prime marks for minutes and seconds' \
		-e "s/([[:digit:]])'([0-5]?[0-9])\"/\\1′\\2″/"g \
		-e '# Open quote' \
		-e 's/(^|\s)"([[:alpha:]])/\1“\2/'g \
		-e '# Close quote, punctuation precedes' \
		-e 's/([[:alnum:]])([.,?!—…]+|\.\.\.)"(\s|$)/\1\2”\3/'g \
		-e '# Close quote, punctuation follows' \
		-e 's/([[:alnum:]])"([.,?! \t]|\.\.\.|$)/\1”\2/'g \
		-e '# Close single-quote, punctuation leads' \
		-e "s/([[:alnum:]])([.,?!]+|\\.\\.\\.)'(\s|$)/\1\2’\3/"g \
		-e '# Close single-quote, punctuation Follows' \
		-e "s/([[:alnum:]])'([.,?!]+|\\.\\.\\.)(\s|$)/\1’\2\3/"g \
		-e "# Contractions" \
		-e "s/([[:alpha:]])'([a-zA-Z \\t])/\\1’\\2/"g \
		-e "# Use elipsis character" \
		-e 's/([[:alnum:]])\.\.\.(\s|$|[[:alnum:]])/\1…\2/'g \
		-e '# Use endash for ranges like: "3 - 5" and "3--7"' \
		-e 's/([[:xdigit:]])\s-\s([[:xdigit:]])/\1–\2/'g \
		-e 's/([[:xdigit:]])\s?--\s?([[:xdigit:]])/\1–\2/'g \
		-e '# Use figure dash in remaining numbers (phone numbers)' \
		-e 's/([[:digit:]])-([[:digit:]])/\1‒\2/'g \
		-e '# Parenthetical dash---mid sentence between words' \
		-e 's/([[:alnum:]])-{2,3}\s?([[:alnum:]]|$)/\1—\2/'g \
		-e '# Parenthetical dash---trailing off in quoted text.' \
		-e 's/([[:alnum:]])-{2,3}"/\1—”/'g \
		-e "s/([[:alnum:]])-{2,3}'/\\1—’/"g \
		-e '# Quotation dash' \
		-e 's/-{2,3}\s?([[:alpha:]])/―\1/'g \
		-e '# Catch all: parenthetical dash' \
		-e 's/\s-{2,3}\s?/—/'g \
		-e '# Catch all: range/endash' \
		-e 's/ - /–/'g
}
##### End of function create_smart_quotes #####

# Escape HTML entities
# Author: Perette Barella
# Copyright 2020 Devious Fish.  All rights reserved.
# $Id: html_escape_entities 56 2020-01-04 01:21:35Z perette $

function html_escape_entities {
	# Replace &, < and > with their entity references.
	# With arguments: the arguments (joined by "$*") are escaped and printed.
	# Without arguments: acts as a stdin -> stdout filter.
	if (($# > 0))
	then
		# Feed the arguments back through the filter form below.
		print -r -- "$*" | html_escape_entities
		return
	fi
	# The ampersand must be handled first so the entities added by the
	# other two expressions are not escaped again.
	sed -e 's/&/\&amp;/g' -e 's/</\&lt;/g' -e 's/>/\&gt;/g'
}


# Preflight: stop now if this shell lacks the extended getopts format or if
# the zip command cannot be found.
modern_ksh_check
require zip

# Identify markdown and convert it to HTML markup.
# Author: Perette Barella
# Copyright 2020 Devious Fish.  All rights reserved.
# $Id: tuck2epub 62 2021-08-09 18:49:47Z perette $

function encode_tuck_markup {
	# stdin -> stdout filter converting the story's inline markup to XHTML:
	#   **text**  ->  <strong>text</strong>
	#   _text_    ->  <em>text</em>
	#   *text*    ->  <span class="thought">text</span>
	# Each form has two expressions: one for spans of two or more characters
	# and one for a single-character span.
	#
	# Ampersands are escaped, but idempotently: the first expression escapes
	# every "&", the second undoes that for "&amp;", "&lt;" and "&gt;" that
	# were already present.  Text that has been through html_escape_entities
	# therefore is not escaped a second time (previously "&lt;" came out as
	# "&amp;lt;"), while a bare "&" in raw text is still encoded.
	sed -E \
	    -e 's/&/\&amp;/g' \
	    -e 's/&amp;(amp|lt|gt);/\&\1;/g' \
	    -e 's&(^|\s)\*\*([[:alnum:]][^*]*[[:alnum:]][-.!?,:;"—…“”’>]*)\*\*($|\s|[-.!?,:;"—…“”’])&\1<strong>\2</strong>\3&g' \
	    -e 's&(^|\s)\*\*([[:alnum:]][-.!?,:;"—…“”’]*)\*\*($|\s|[-.!?,:;"—…“”’])&\1<strong>\2</strong>\3&g' \
	    -e 's&(^|\s|["“*])_([[:alnum:]][^_]*[[:alnum:]][-.!?,:;"—…“”’>]*)_($|\s|[-.!?,:;"—…“”’])&\1<em>\2</em>\3&g' \
	    -e 's&(^|\s|["“*])_([[:alnum:]][-.!?,:;"—…“”’]*)_($|\s|[-.!?,:;"—…“”’])&\1<em>\2</em>\3&g' \
	    -e 's&(^|\s)\*([A-Za-z0-9<][^*]*[[:alnum:]][-.!?,:;"—…“”’>]*)\*($|\s|[-.!?,:;"—…“”’])&\1<span class="thought">\2</span>\3&g' \
	    -e 's&(^|\s)\*([A-Za-z0-9<][-.!?,:;"—…“”’]*)\*($|\s|[-.!?,:;"—…“”’])&\1<span class="thought">\2</span>\3&g'
}


function write_mimetype {
	# Emit the contents of the archive's "mimetype" entry on stdout.
	# The EPUB container format requires the entry to hold exactly this
	# string with no trailing newline or other padding, so printf is used
	# instead of print (which appends a newline).
	printf '%s' "application/epub+zip"
}

function write_container_xml {
	# Print META-INF/container.xml on stdout.
	# $1 - directory inside the archive that holds content.opf.
	typeset -r rootdir="$1"
	printf '%s\n' \
		'<?xml version="1.0" encoding="UTF-8" ?>' \
		'<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">' \
		'  <rootfiles>' \
		"    <rootfile full-path=\"$rootdir/content.opf\" media-type=\"application/oebps-package+xml\"/>" \
		'  </rootfiles>' \
		'</container>'
}

function write_css_styles {
	# Print the stylesheet on stdout, one argument per output line.
	# Declarations are indented with a tab, spelled \t to keep it visible.
	printf '%s\n' \
		'h2 {' \
		$'\tfont-size: 110%;' \
		$'\tfont-weight: bold;' \
		$'\tfont-family: monospace;' \
		$'\tmargin-bottom: 0;' \
		$'\tmargin-top: 1em;' \
		'}' \
		'p {' \
		$'\tmargin-top: 0;' \
		$'\tmargin-bottom: 0;' \
		$'\ttext-indent: 5ex;' \
		'}' \
		'p.nextentry {' \
		$'\tmargin-top: 1em;' \
		'}' \
		'span.thought {' \
		$'\tfont-style: italic;' \
		'}' \
		'span.thought em {' \
		$'\tfont-weight: bold;' \
		'}'
}

function write_manifest_header {
	# Print the opening of content.opf on stdout: package metadata and the
	# first manifest items (stylesheet and NCX).  The caller appends further
	# <item> lines and then the output of write_manifest_trailer.
	# The stylesheet href carries no trailing space, so it names the
	# css/styles.css file that is actually written into the archive.
	cat << EOF
<?xml version="1.0"?>
<package version="2.0" xmlns="http://www.idpf.org/2007/opf" unique-identifier="PrimaryId">

  <metadata xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:opf="http://www.idpf.org/2007/opf">
    <dc:title>The Saga of Tuck</dc:title>
    <dc:identifier id="PrimaryId">12tuckoo</dc:identifier>
    <dc:language>en</dc:language>
    <dc:creator opf:file-as="Hayes, Ellen" opf:role="aut">Ellen Hayes</dc:creator>
  </metadata>
  <manifest>
    <item id="stylesheet" href="css/styles.css" media-type="text/css"/>
    <item id="ncx" href="toc.ncx" media-type="application/x-dtbncx+xml"/>
EOF
}

function write_manifest_trailer {
	# Print the end of content.opf on stdout: close the manifest and emit
	# the spine with one <itemref> per *.xhtml file found in directory $1,
	# in glob (sorted) order.  The idref is the file name without .xhtml.
	typeset -r files="$1"
	typeset file name
	cat << EOF
  </manifest>
  <spine toc="ncx">
EOF
	for file in "$files"/*.xhtml
	do
		# An unmatched glob is left as the literal pattern; skip it rather
		# than emit a bogus <itemref idref='*'/>.
		[[ -e "$file" ]] || continue
		name="${file##*/}"
		printf "    <itemref idref='%s'/>\n" "${name%.xhtml}"
	done
	cat << EOF
  </spine>
</package>
EOF
}

function write_toc_header {
	# Print the opening of toc.ncx on stdout, up to and including <navMap>.
	# One argument per output line; '' produces a blank line.
	printf '%s\n' \
		'<?xml version="1.0" encoding="UTF-8"?>' \
		'<!DOCTYPE ncx PUBLIC "-//NISO//DTD ncx 2005-1//EN"' \
		'"http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">' \
		'' \
		'<ncx version="2005-1" xml:lang="en" xmlns="http://www.daisy.org/z3986/2005/ncx/">' \
		'' \
		'  <head>' \
		'<!-- The following four metadata items are required for all NCX documents,' \
		'including those that conform to the relaxed constraints of OPS 2.0 -->' \
		'' \
		'    <meta name="dtb:uid" content="12tuckoo"/> <!-- same as in .opf -->' \
		'    <meta name="dtb:depth" content="1"/> <!-- 1 or higher -->' \
		'    <meta name="dtb:totalPageCount" content="0"/> <!-- must be 0 -->' \
		'    <meta name="dtb:maxPageNumber" content="0"/> <!-- must be 0 -->' \
		'  </head>' \
		'' \
		'  <docTitle>' \
		'    <text>The Saga of Tuck</text>' \
		'  </docTitle>' \
		'' \
		'  <docAuthor>' \
		'    <text>Hayes, Ellen</text>' \
		'  </docAuthor>' \
		'' \
		'  <navMap>'
}

function write_toc_trailer {
	# Print the closing tags of toc.ncx on stdout.
	printf '%s\n' '  </navMap>' '</ncx>'
}

function retrieve_title {
	# Read a chapter from stdin and print its title: the first non-empty
	# line after the PGP "signed message" marker, with anything from the
	# first " -*-" onward removed.
	# Returns 0 when a title was printed; 1 if input ends first or more
	# than 50 lines (counted across both phases) are examined.
	# Input is consumed up to and including the title line.
	typeset text
	typeset -i examined=0

	# Phase 1: skip ahead to the marker line.
	while read -r text
	do
		if [[ "$text" == "-----BEGIN PGP SIGNED MESSAGE-----" ]]
		then
			break
		fi
		(( (examined += 1) > 50 )) && return 1
	done

	# Phase 2: the next non-empty line is the title.
	while read -r text
	do
		if [[ -z "$text" ]]
		then
			(( (examined += 1) > 50 )) && return 1
			continue
		fi
		# %% removes the longest matching suffix, i.e. from the first " -*-".
		print -r -- "${text%% -\*-*}"
		return 0
	done
	return 1
}

function extract_copyright {
	# Read a chapter from stdin and print, as XHTML paragraphs, the text
	# that lies between the title line and the next line equal to the title.
	# Lines are entity-escaped by html_escape_entities before formatting.
	#   - blank line: ends the paragraph in progress;
	#   - line shorter than 40 characters: closes the paragraph (it is either
	#     a one-line paragraph or the last line of a flowed one);
	#   - longer line: flowed into the current paragraph.
	# Every <p> that is opened is closed, including at end of input.
	# Returns 1 if no title can be found, otherwise 0.
	typeset title line open=false

	# Assigned separately from typeset so the || return cannot be masked by
	# typeset's own exit status.
	title=$(retrieve_title) || return 1
	# The lines compared below are already escaped; escape the title to match.
	title=$(html_escape_entities "$title")

	html_escape_entities | {
		while read -r line
		do
			[[ "$line" == "$title" ]] && break
			if [[ -z "$line" ]]
			then
				$open && print "</p>"
				open=false
			elif ((${#line} < 40))
			then
				$open || print -n "<p>"
				print -r -- "$line</p>"
				open=false
			else
				$open || print -n "<p>"
				print -r -- "$line"
				open=true
			fi
		done
		# Close a paragraph left open by the title line or by end of input.
		$open && print "</p>"
	}
	return 0
}


function make_copyright {
	# Build the copyright page from a chapter file and register it.
	# $1 - chapter file to take the copyright block from
	# $2 - manifest (content.opf) file to append the <item> line to
	# $3 - table of contents file (accepted but not used here)
	# $4 - directory in which copyright.xhtml is created
	# Sets the global MADE_COPYRIGHT to true.
	typeset -r chapter="$1" opf="$2" ncx="$3" dir="$4"
	typeset -r id="copyright"
	typeset -r page="$dir/$id.xhtml"

	# Header, extracted body and footer are written through one redirection.
	{
		cat << EOF
<?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en">
  <head>
    <meta http-equiv="Content-Type" content="application/xhtml+xml; charset=utf-8" />
    <title>The Saga of Tuck</title>
    <link rel="stylesheet" href="css/styles.css" type="text/css" />
  </head>
  <body>
EOF
		cat_possibly_compressed "$chapter" | filter_returns | extract_copyright
		cat << EOF
  </body>
</html>
EOF
	} > "$page"
	MADE_COPYRIGHT=true

	print "    <item id='$id' href='$id.xhtml' media-type='application/xhtml+xml'/>" >> "$opf"
}


function format_paragraph {
	# Print one paragraph on stdout.
	# $1 - opening tag to emit verbatim (for example <p>)
	# $2 - paragraph text; it is run through create_smart_quotes and
	#      encode_tuck_markup.  Nothing is printed when it is empty.
	# Always returns 0.
	typeset -r opening="$1" body="$2"
	if [[ -n "$body" ]]
	then
		print -n -- "$opening"
		print -n -r -- "$body" | create_smart_quotes | encode_tuck_markup
		print -- "</p>"
	fi
	return 0
}


function extract_chapter {
	# Read a chapter from stdin and print its body as XHTML on stdout:
	# an <h1> with the title, <h2> for time-stamped entry headings and <p>
	# paragraphs for the text.
	# $1, $2 - chapter name and file name (accepted but not used here).
	# Returns 0 when a terminator line (PGP signature, "Distribution:" or a
	# line starting with "Tallyho!" in quotes) is reached; 1 when the title
	# cannot be found or the input ends without a terminator.
	#
	# Plain typeset is used: -f is the function-attribute flag and does not
	# declare ordinary variables.
	typeset name="$1" file="$2"
	typeset title line
	# Assigned separately from typeset so the || return cannot be masked by
	# typeset's own exit status.
	title=$(retrieve_title) || return 1

	# Skip past the copyright block, looking for repeated title.
	integer count=0
	while read -r line
	do
		[[ "$line" == "$title" ]] && break
		((++count > 50)) && return 1
	done
	print -r "<h1>$(html_escape_entities "$title")</h1>"

	# para:     opening tag for the next paragraph
	# content:  text gathered so far for the current paragraph
	# newentry: true until the first heading or paragraph of an entry is seen
	typeset para="<p>" content="" newentry=true
	# Ellen quotes specially for <sign language>.  Change them to guillemet mark instead.
	# Needs to be done up-front, before entity encoding.
	sed -E \
	    -e 's&(^|\s)<([[:alnum:]][^<]*[[:alnum:]][-.!?,:;"—…“”’]*)>($|\s|[-.!?,:;"—…“”’])&\1«\2»\3&g' \
	    -e 's&(^|\s)<([[:alnum:]][-.!?,:;"—…“”’]*)>($|\s|[-.!?,:;"—…“”’])&\1«\2»\3&g' |
	html_escape_entities | {
		while IFS="" read -r line
		do
			if [[ "$line" == "-----BEGIN PGP SIGNATURE-----" || "$line" = "Distribution:" || "$line" == $"\"Tallyho!\""* ]]
			then
				# End of story text: flush what has been gathered and stop.
				format_paragraph "$para" "$content"
				return 0
			elif [[ "$line" == "" || "$line" == "***" ]]
			then
				# Blank line or *** separator ends the paragraph; *** also
				# starts a new entry.
				format_paragraph "$para" "$content"
				content=""
				if [[ "$line" == "***" ]]
				then
					newentry=true
					para="<p class='nextentry'>"
				elif ! $newentry
				then
					para="<p>"
				fi
			elif [[ "$line" == [0-9][0-9]:[0-9][0-9]* ]] && $newentry
			then
				# A line starting with hh:mm at the top of an entry is its heading.
				format_paragraph "$para" "$content"
				content=""
				print -r "<h2>$line</h2>"
				newentry=false
			elif [[ "$line" == "    "* ]]
			then
				# A four-space indent begins a new paragraph.
				format_paragraph "$para" "$content"
				[[ -n "$content" ]] && newentry=false
				! $newentry && para="<p>"
				content="$line"
			else
				content="$content $line"
			fi
		done
		# Input ended without a terminator line: emit the text gathered so
		# far instead of dropping the final paragraph, but report failure.
		format_paragraph "$para" "$content"
		return 1
	}
}


# Set to true by make_copyright once the copyright page has been generated.
MADE_COPYRIGHT=false
# Number of chapters added so far; supplies the NCX playOrder values.
CHAPTER=0
function process_file {
	# Convert one chapter file to XHTML and register it.
	# $1 - chapter file (optionally compressed)
	# $2 - manifest (content.opf) file to append the <item> line to
	# $3 - table of contents (toc.ncx) file to append the <navPoint> to
	# $4 - directory in which the .xhtml file is created
	# The copyright page is generated from the first file processed.
	# Returns 1 if the copyright page or the chapter title cannot be
	# produced; in that case nothing is written for this chapter.
	typeset -r source="$1" manifest="$2" toc="$3" path="$4"
	typeset file=$(basename "$1")
	# Everything from the first dot is dropped: tuck001.txt.gz -> tuck001.
	typeset name="${file%%.*}"
	typeset -r output="$path/$name.xhtml"
	typeset title

	$MADE_COPYRIGHT || make_copyright "$@" || return 1

	print -n "Processing $file...\r"
	# Assigned separately from typeset so the || return cannot be masked by
	# typeset's own exit status.
	title="$(cat_possibly_compressed "$source" | filter_returns | retrieve_title)" || return 1
	((CHAPTER += 1))
	cat >> "$toc" << EOF
    <navPoint class="chapter" id="$name" playOrder="$CHAPTER">
      <navLabel><text>$(html_escape_entities "$title")</text></navLabel>
      <content src="$name.xhtml"/>
    </navPoint>
EOF

cat > "$output" << EOF
<?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en">
  <head>
    <meta http-equiv="Content-Type" content="application/xhtml+xml; charset=utf-8" />
    <title>$(html_escape_entities "$title")</title>
    <link rel="stylesheet" href="css/styles.css" type="text/css" />
  </head>
  <body>
EOF

	cat_possibly_compressed "$source" | filter_returns | extract_chapter "$name" "$name.xhtml" >> "$output"
cat >> "$output" << EOF
  </body>
</html>
EOF
	print "    <item id='$name' href='$name.xhtml' media-type='application/xhtml+xml'/>" >> "$manifest"
}

function iterate_files {
	# Build the EPUB.
	# $1      - absolute path of the output file (zip is run from inside
	#           the build directory)
	# $2 ...  - chapter files, or directories searched for tuck*.txt*
	# Returns 0 only if every source was processed and the archive was
	# written; 1 otherwise.  Nothing is built if any source path is invalid.
	typeset file subfile ok=true destination="$1"
	shift
	for file in "$@"
	do
		if [[ ! -e "$file" ]]
		then
			print "$file: Does not exist." 1>&2
			ok=false
		elif [[ ! -d "$file" && ! -f "$file" ]]
		then
			print "$file: Not a file or directory." 1>&2
			ok=false
		fi
	done
	$ok || return 1

	# Assigned separately from typeset so a mktemp failure is not masked by
	# typeset's own exit status.
	typeset build
	build=$(mktemp -d "/var/tmp/$arg0.XXXXXX") || return 1
	trap "rm -rf '$build'" exit

	typeset -r name="Tuck"
	typeset -r target="$build/$name"
	typeset -r manifest="$target/content.opf"
	typeset -r toc="$target/toc.ncx"
	mkdir "$build/META-INF" "$target" "$target/css" || return 1
	write_mimetype > "$build/mimetype"
	write_container_xml "$name" > "$build/META-INF/container.xml"
	write_css_styles > "$target/css/styles.css"
	write_manifest_header > "$manifest"
	write_toc_header > "$toc"

	for file in "$@"
	do
		if [[ -f "$file" ]]
		then
			process_file "$file" "$manifest" "$toc" "$target" || ok=false
		else
			# Quoted so a directory name containing spaces is not split.
			for subfile in "$file"/tuck*.txt*
			do
				if [[ ! -e "$subfile" ]]
				then
					# Also reached when the glob matched nothing.
					print "$subfile: Does not exist." 1>&2
					ok=false
				elif [[ -f "$subfile" ]]
				then
					process_file "$subfile" "$manifest" "$toc" "$target" || ok=false
				else
					print "$subfile: Not a regular file, skipping." 1>&2
				fi
			done
		fi
	done
	write_manifest_trailer "$target" >> "$manifest"
	write_toc_trailer >> "$toc"
	
	rm -f "$destination"
	# mimetype goes in first, uncompressed and (-X) without extra fields,
	# as the EPUB container format requires; the rest is added afterwards.
	# A zip failure makes the whole run fail.
	(cd "$build" && zip -X -Z store "$destination" mimetype) || ok=false
	(cd "$build" && zip -r "$destination" META-INF "$name") || ok=false
	
	$ok && return 0
	return 1
}


### Main ###

# Parse options; -o/--output is the only one that needs handling here.
while getopts -a "$arg0" "$USAGE" option
do
	if [[ "$option" == o ]]
	then
		OUTPUT="$OPTARG"
	fi
done

shift $((OPTIND - 1))

# Rework the output file name if needed: a directory gets the default file
# name appended, and the result is made absolute.
if [[ -d "$OUTPUT" ]]
then
	OUTPUT="$OUTPUT/tuck.epub"
fi
outpath=$(cd "$(dirname "$OUTPUT")" && pwd) || exit 5
OUTPUT="$outpath/$(basename "$OUTPUT")"

# Explicit source paths take precedence over the default chapter directory.
if (($# > 0))
then
	iterate_files "$OUTPUT" "$@"
	exit $?
fi

if [[ ! -d "$TUCK" ]]
then
	print "$TUCK: No such directory.  Please specify file locations." 1>&2
	exit 1
fi
iterate_files "$OUTPUT" "$TUCK"
exit $?