#!/bin/ksh

######################################################################
# Program:	refold
# Purpose:	A text-file restoration utility.
# Arguments:	Commands and files; see usage.
# Author:	Perette Barella
# @(#) $Id: refold 97 2023-11-08 20:19:01Z perette $
#
#				IMPORTANT
# THIS FILE IS UTF-8 ENCODED!
#---------------------------------------------------------------------

######################################################################
# Function:	create_smart_quotes
# Purpose:	Modify plain-ascii quotes and apostrophes into
#		their "proper", opposing characters.
#		Ellipses are translated to the corresponding character, retaining
#		surrounding whitespace.
#		Double-dashes become em-dashes with optional whitespace removed.
#		Single dashes separated from words by spaces become en-dashes.
# Limitations:	Single-quoted things and/or leading apostrophes don't get
#		fully translated because it's impossible to distinguish:
#		'Just 'cuz,' she said.
# Author:	Perette Barella
# Version:	$Id: create_smart_quotes 100 2024-05-20 14:43:22Z perette $
#---------------------------------------------------------------------
function create_smart_quotes
{
	# Filter: reads text on standard input and writes the converted text
	# to standard output (sed is given no file operands).
	#
	# The rules below are applied to every line in the order written and
	# each one is global within the line (trailing "g"), so ORDER MATTERS:
	# the specific rules (prime marks, quotes next to punctuation, numeric
	# ranges) run before the catch-all rules at the bottom, which would
	# otherwise claim the same text.
	#
	# The -e '# ...' arguments are sed comments; they are passed to sed as
	# part of the script and do nothing.  Shell comments cannot be used
	# here because the whole sed invocation is one backslash-continued
	# command line.
	#
	# NOTE(review): the patterns use \s, which is a GNU sed extension and
	# not part of POSIX extended regular expressions.
	sed -E \
		-e '# Use prime marks for minutes and seconds' \
		-e "s/([[:digit:]])'([0-5]?[0-9])\"/\\1′\\2″/"g \
		-e '# Open quote' \
		-e 's/(^|\s)"([[:alpha:]])/\1“\2/'g \
		-e '# Close quote, punctuation precedes' \
		-e 's/([[:alnum:]])([.,?!—…]+|\.\.\.)"(\s|$)/\1\2”\3/'g \
		-e '# Close quote, punctuation follows' \
		-e 's/([[:alnum:]])"([.,?! \t]|\.\.\.|$)/\1”\2/'g \
		-e '# Close single-quote, punctuation leads' \
		-e "s/([[:alnum:]])([.,?!]+|\\.\\.\\.)'(\s|$)/\1\2’\3/"g \
		-e '# Close single-quote, punctuation Follows' \
		-e "s/([[:alnum:]])'([.,?!]+|\\.\\.\\.)(\s|$)/\1’\2\3/"g \
		-e "# Contractions" \
		-e "s/([[:alpha:]])'([a-zA-Z \\t])/\\1’\\2/"g \
		-e "# Use elipsis character" \
		-e 's/([[:alnum:]])\.\.\.(\s|$|[[:alnum:]])/\1…\2/'g \
		-e '# Use endash for ranges like: "3 - 5" and "3--7"' \
		-e 's/([[:xdigit:]])\s-\s([[:xdigit:]])/\1–\2/'g \
		-e 's/([[:xdigit:]])\s?--\s?([[:xdigit:]])/\1–\2/'g \
		-e '# Use figure dash in remaining numbers (phone numbers)' \
		-e 's/([[:digit:]])-([[:digit:]])/\1‒\2/'g \
		-e '# Parenthetical dash---mid sentence between words' \
		-e 's/([[:alnum:]])-{2,3}\s?([[:alnum:]]|$)/\1—\2/'g \
		-e '# Parenthetical dash---trailing off in quoted text.' \
		-e 's/([[:alnum:]])-{2,3}"/\1—”/'g \
		-e "s/([[:alnum:]])-{2,3}'/\1—’/"g \
		-e '# Quotation dash' \
		-e 's/-{2,3}\s?([[:alpha:]])/―\1/'g \
		-e '# Catch all: parenthetical dash' \
		-e 's/\s-{2,3}\s?/—/'g \
		-e '# Catch all: range/endash' \
		-e 's/ - /–/'g
}
##### End of function create_smart_quotes #####




# Name the script was invoked as; used in the usage text and to label the
# scratch directory.  "$0" is quoted so a path containing blanks or glob
# characters is handled correctly.
arg0=$(basename -- "$0")
status=0

# Scratch files live in a private directory created by mktemp rather than
# under predictable /tmp/<name>.<pid>.* names, which another local user
# could pre-create (e.g. as symbolic links) to redirect the copies made
# while editing.
workdir=$(mktemp -d "${TMPDIR:-/tmp}/$arg0.XXXXXX") || {
	printf '%s\n' "$arg0: cannot create a temporary directory." >&2
	exit 1
}
# Remove the scratch directory on every exit path, including error exits.
trap 'rm -rf -- "$workdir"' EXIT

undo="$workdir/$arg0.last"	# previous version of the text (for "undo")
temp="$workdir/$arg0.tmp"	# working copy being edited
html="$workdir/$arg0.html"	# rendered preview; keeps the .html suffix




######################################################################
# Function:	usage
# Purpose:	Displays the usage of this command.
# Author:	Perette Barella
#---------------------------------------------------------------------
function usage {
	# Writes the help text to standard output.  The here-document is
	# unquoted on purpose so that $arg0 (the invoked script name, set at
	# the top of the file) is expanded in the text.
	cat << EOF
$arg0 is a tool for restoring text files that have been mangled by mailers,
readnews, dead encodings, etc.  The general philosophy is to end up with
a modern format: a UTF-8 encoded text file without line breaks, paragraphs
separated by a blank line.  Extra blank lines are maintained, as these
often have significance.

Usage: $arg0 [edit command] file ...
       $arg0 -?
Issues the specified edit command on the file, sending the result to
standard out.  Command-file pairs may be repeated, but output will
run together.  Multiple edit sequences must be done by successive
invocations.  If no edit command is given, $arg0 enters interactive mode.
If only an edit command is given, $arg0 runs as a filter.
The -? option displays this help.

Commands are as follows:
	-combine-indented-paragraphs
	-combine-paragraphs-separated-by-blanks
		Both "combine" functions assemble broken-line paragraphs,
		most typically 70-ish column format, into unbroken
		paragraphs.  Paragraph cues are taken as described.
	-remove-double-spacing
		Eliminates double spacing.  When there are multiple blank
		lines in succession, only the first is dropped.
	-demangle-weird-characters
		Fixes Quoted Printable and Rich Text Format encodings.
		The most obvious fix, typically, is fixing quotes and
		apostrophes which have been encoded as ugly character sequences.
		For Quoted Printable, also fixes extra line breaks introduced
		by the encoding.
	-fold
		Once you've got a file back into a modern format, you should
		keep it that way.  If, however, you want to export it for
		posting on Usenet, etc., "folding" does all the magic:
		Inserting line breaks for a 70-character terminal,
		converting asymmetrical quotes into their ASCII equivalents,
		downconverting various Unicode diacritical characters
		into their plain ASCII equivalents, etc.
	-add-smart-quotes
		Replaces ASCII quotes and apostrophes with the "smart"
		equivalent.  It also converts ellipses, dashes and
		minute/second prime marks to their typographic characters.
		Performance with double-quotes is pretty good.  Contractions
		and closing single-quotes are converted, but because an
		opening single-quote cannot be told apart from a leading
		apostrophe, those are left untouched.
EOF
##### End of function usage #####
}



######################################################################
# Function:	combine_indented_paragraphs
# Purpose:	Combines lines in a source file into unbroken paragraphs.
#		Paragraph determination is done by leading whitespace on
#		a line; thus, a file must be <<'ed prior to processing
#		if it contains a left margin.
# Author:	Perette Barella
#---------------------------------------------------------------------
function combine_indented_paragraphs {
	# Filter (stdin to stdout) that joins the lines of each paragraph
	# into a single line.
	#   - A line that starts with a space or tab begins a new paragraph;
	#     the previous paragraph is written followed by one empty line.
	#   - A blank (empty or whitespace-only) line ends the paragraph; the
	#     first blank line of a run additionally emits three empty lines,
	#     further blank lines in the same run emit nothing.
	#   - Any other line is appended to the current paragraph.
	# The final line is only seen if it is newline-terminated (callers
	# append a newline to the input for that reason).
	typeset buffer=""		# paragraph assembled so far
	typeset didblanks=true		# true while inside a run of blank lines
	typeset aline kind ws
	ws=$(printf ' \t')		# the two characters treated as blanks

	# -r: keep backslashes in the text instead of treating them as escapes.
	while IFS="" read -r aline
	do
		# Classify with shell patterns rather than expr(1): no process
		# per line, and lines that look like expr operators or options
		# ("(", "-x", "length", ...) cannot confuse the test.
		kind=blank
		case "$aline" in
			["$ws"]*[!"$ws"]*)	kind=indented ;;
			*[!"$ws"]*)		kind=text ;;
		esac

		case "$kind" in
			blank)
				if [ -n "$buffer" ]
				then
					# print -r: do not interpret backslash
					# sequences that occur in the text.
					print -r -- "$buffer"
				fi
				buffer=""
				if [ "$didblanks" = "false" ]
				then
					print
					print
					print
				fi
				didblanks=true
				;;
			indented)
				if [ -n "$buffer" ]
				then
					print -r -- "$buffer"
					print
				fi
				buffer="$aline"
				didblanks=false
				;;
			text)
				buffer="$buffer $aline"
				didblanks=false
				;;
		esac
	done

	# Flush a paragraph that was not followed by a blank line.  An "if"
	# is used so that an empty buffer does not make the function fail.
	if [ -n "$buffer" ]
	then
		print -r -- "$buffer"
	fi
}



######################################################################
# Function:	combine_separated_paragraphs
# Purpose:	Create unbroken paragraphs from a text file, separating
#		paragraphs where blank lines are present.  When there
#		is more than one blank line, extras are retained as they
#		are likely significant.
# Author:	Perette Barella
#---------------------------------------------------------------------
function combine_separated_paragraphs {
	# Filter (stdin to stdout) that joins consecutive non-blank lines into
	# one paragraph line, separated by single spaces.  Every blank line
	# ends the current paragraph and is itself passed through, so extra
	# blank lines in the input survive.
	typeset buffer="" aline ws
	ws=$(printf ' \t')		# space and tab, for the sed bracket below

	{
		# IFS is deliberately left at its default so that read trims
		# leading/trailing blanks; a whitespace-only line therefore
		# counts as blank.  -r keeps backslashes in the text intact.
		while read -r aline
		do
			if [ -z "$aline" ]
			then
				if [ -n "$buffer" ]
				then
					# print -r: no escape interpretation.
					print -r -- "$buffer"
				fi
				print
				buffer=""
			else
				buffer="$buffer$aline "
			fi
		done
		# Flush a last paragraph that was not followed by a blank line
		# instead of silently discarding it.
		if [ -n "$buffer" ]
		then
			print -r -- "$buffer"
		fi
	} |
	# Drop the separator space left at the end of each paragraph.  A real
	# tab is interpolated because "\t" inside a bracket expression is only
	# understood by GNU sed.
	sed -e "s/[$ws]*\$//"
}

######################################################################
# Function:	remove_double_spacing
# Purpose:	Removes double spacing. This step is necessary
#		before combining separated paragraphs-- otherwise,
#		lines will be treated as paragraphs.
# Author:	Perette Barella
#---------------------------------------------------------------------
function remove_double_spacing {
	# Filter (stdin to stdout).  Non-blank lines are copied unchanged.
	# Within each run of blank (empty or whitespace-only) lines the first
	# one is dropped, the second to fourth are emitted as empty lines, and
	# any beyond the fourth are dropped as well.
	typeset aline ws
	typeset emptycount=0		# blank lines seen in the current run
	ws=$(printf ' \t')		# the two characters treated as blanks

	# -r: keep backslashes in the text instead of treating them as escapes.
	while IFS="" read -r aline
	do
		case "$aline" in
			*[!"$ws"]*)
				emptycount=0
				# "--" stops option parsing so lines such as
				# "-- signature" or "-n" are printed as text;
				# -r prevents escape interpretation.
				print -r -- "$aline"
				;;
			*)
				emptycount=$((emptycount + 1))
				if [ "$emptycount" -gt 1 ] && [ "$emptycount" -lt 5 ]
				then
					print
				fi
				;;
		esac
	done
}

######################################################################
# Function:	demangle_qp_and_rtf
# Purpose:	Fixes "junk" introduced by Quoted Printable and
#		RTF encodings, including smart quotes, a few special
#		characters, and QP's spurious line breaks.
# Author:	Perette Barella
# References: "Quoted-printable" (preceded by =)
#		http://en.wikipedia.org/wiki/Quoted-printable
# 		"Rich Text Format - Character Encoding" (preceded by \')
#		http://en.wikipedia.org/wiki/Rich_Text_Format
#---------------------------------------------------------------------
function demangle_qp_and_rtf {
	# Filter (stdin to stdout) that undoes a handful of Quoted-Printable
	# and RTF escapes.
	#
	# Stage 1 (shell loop): a line ending in "=" is a Quoted-Printable
	# soft line break; the "=" is removed and the line is written without
	# a newline so that it joins the following line.
	#
	# Stage 2 (sed): replace the escape sequences listed below.
	typeset aline

	# -r on read and print: backslashes must reach sed untouched, because
	# the RTF escapes handled below begin with a backslash.
	while IFS="" read -r aline
	do
		case "$aline" in
			*=)	print -r -n -- "${aline%=}" ;;
			*)	print -r -- "$aline" ;;
		esac
	done |
	# Quoted-Printable bytes are taken to be Windows-1252:
	#   =91 left single quote, =92 right single quote / apostrophe,
	#   =93 left double quote,  =94 right double quote, =A9 copyright.
	# "=20"/"=09" at end of line are encoded trailing blanks and are
	# dropped.  "=3D" (a literal "=") is decoded LAST so that the "="
	# it produces cannot be mistaken for the start of another escape.
	#
	# RTF hex escapes (\'xx) are taken to be Mac Roman: d4/d5 single
	# quotes, d2/d3 double quotes, c9 ellipsis.  \cb1 and \cb3 are RTF
	# control words and are removed, together with one following space.
	#
	# Every backslash is matched as an explicit "\\" in the sed script.
	# Left as \' or \c, GNU sed reads them as "end of buffer" and
	# "control character", not as the literal text intended.
	# The ellipsis rule accepts both \'c9 and a bare \c9.
	sed \
		-e 's/=91/‘/g' \
		-e 's/=92/’/g' \
		-e 's/=93/“/g' \
		-e 's/=94/”/g' \
		-e 's/=A9/©/g' \
		-e 's/=20$//' \
		-e 's/=09$//' \
		-e "s/\\\\'d5/’/g" \
		-e "s/\\\\'d4/‘/g" \
		-e "s/\\\\'d2/“/g" \
		-e "s/\\\\'d3/”/g" \
		-e "s/\\\\'*c9/.../g" \
		-e 's/\\cb[13] //g' \
		-e 's/\\cb[13]//g' \
		-e 's/=3D/=/g'
}




# Execute an edit function on the specified file.
# The result is put to standard out, which is presumably redirected
# by the caller.
function perform_edit {
	# Runs one edit action on a file and writes the result to stdout.
	#   $1  action name (the command-line option without its leading "-")
	#   $2  file to read; the file itself is never modified here
	# Returns the status of the last command of the chosen pipeline.
	# An unknown action is reported on standard error and terminates the
	# script with status 1.
	typeset action="$1" file="$2"
	typeset ws
	# Space and tab, interpolated into sed bracket expressions: "\t"
	# inside brackets is only understood by GNU sed.
	ws=$(printf ' \t')

	case "$action" in
		combine-indented-paragraphs)
			# The extra newline makes sure the last input line is
			# terminated, so the read loop in the filter sees it.
			(cat "$file"; print) |
			combine_indented_paragraphs |
			sed -e "s/^[$ws]*//" -e "s/[$ws]*\$//"
			;;
		combine-paragraphs-separated-by-blanks)
			(cat "$file"; print) |
			combine_separated_paragraphs |
			sed -e "s/^[$ws]*//" -e "s/[$ws]*\$//"
			;;
		remove-double-spacing)
			(cat "$file"; print) |
			remove_double_spacing
			;;
		demangle-weird-characters)
			# Strip a backslash at end of line before demangling.
			(sed -e "s/\\\\$//" "$file"; print) |
			demangle_qp_and_rtf
			;;
		add-smart-quotes)
			create_smart_quotes < "$file"
			;;
		fold)
			# Convert a file back to pure ASCII and
			# put it in 70-column format for posting
			# on newsgroups, etc.
			#
			# Groups of characters are written as alternations
			# (a|b|c), not bracket expressions [abc]: in a locale
			# that is not UTF-8 a bracket expression matches the
			# individual BYTES of the listed characters and would
			# corrupt the text, while an alternation matches whole
			# byte sequences in any locale.
			#
			# The figure dash, quotation dash and prime marks are
			# included because create_smart_quotes produces them.
			sed -E \
				-e 's/–/ - /g' \
				-e 's/‒/-/g' \
				-e 's/(—|―)/--/g' \
				-e 's/©/(c)/g' \
				-e 's/®/(R)/g' \
				-e 's/…/.../g' \
				-e 's/™/(TM)/g' \
				-e 's/•/*/g' \
				-e "s/(‘|’|′)/'/g" \
				-e 's/(“|”|″)/"/g' \
				-e 's/(á|à|ä|â|ã)/a/g' \
				-e 's/(À|Á|Ä|Â|Ã)/A/g' \
				-e 's/(è|é|ë|ê)/e/g' \
				-e 's/(È|É|Ë|Ê)/E/g' \
				-e 's/(ì|í|ï|î)/i/g' \
				-e 's/(Ì|Í|Ï|Î)/I/g' \
				-e 's/(ò|ó|ö|ô|õ)/o/g' \
				-e 's/(Ò|Ó|Ö|Ô|Õ)/O/g' \
				-e 's/(ù|ú|ü|û)/u/g' \
				-e 's/(Ù|Ú|Ü|Û)/U/g' \
				-e 's/ñ/n/g' \
				-e 's/Ñ/N/g' \
				-e 's/ç/c/g' \
				-e 's/Ç/C/g' \
				-e 's/æ/ae/g' \
				-e 's/Æ/Ae/g' \
				"$file" |
				fold -s -w 70
			;;
		*)
			# Standard output is normally redirected to the result
			# file, so the diagnostic has to go to standard error.
			print -r -- "Unknown action: $action." >&2
			exit 1
			;;
	esac
}



function encoding_lab {
	# Interactive helper for converting a file of unknown encoding to
	# UTF-8.
	#   $1  file to convert (replaced in place on "commit")
	#   $2  path that receives the original file when committing
	# Returns 0 after a commit; 1 on cancel, when no candidate encoding
	# exists, or when the original cannot be moved aside.
	#
	# Phase 1 tries every encoding reported by "iconv --list".  Results
	# are told apart by cksum value and size, remembered in dynamically
	# named variables en_<checksum>_<size>; an encoding whose output
	# equals the original file or an earlier result is reported as a
	# duplicate and not offered.
	# Phase 2 is a menu: choosing an encoding shows a diff against the
	# original and enables "commit" and "delete".
	#
	# NOTE(review): "cmd | read var" relies on ksh running the last
	# pipeline stage in the current shell.
	typeset file="$1" undo="$2" encoding checksum size
	typeset candidate remaining
	# ${0##*/} drops any directory part, so the path stays valid whether
	# $0 holds the function name or the script path.
	typeset temp="/var/tmp/${0##*/}.lab.$$.tmp"
	typeset all_encodings=$(iconv --list | awk '{print $1}')
	typeset possible_encodings=""
	typeset rejected_encodings=""
	cksum < "$file" | read checksum size
	eval "typeset en_${checksum}_${size}='the original file'"
	for encoding in $all_encodings
	do
		if iconv -f "$encoding" -t "utf-8" "$file" > "$temp"  2>/dev/null
		then
			cksum < "$temp" | read checksum size
			if eval " [ \"\${en_${checksum}_${size}}\" != \"\" ]"
			then
				eval "print \"$encoding duplicates \${en_${checksum}_${size}}\""
			else
				possible_encodings="$possible_encodings $encoding"
				eval "typeset en_${checksum}_${size}='$encoding'"
			fi
		else
			rejected_encodings="$rejected_encodings $encoding"
		fi
	done
	# The scan is finished; do not leave its scratch file behind when the
	# user cancels or there is nothing to choose from.
	rm -f "$temp"
	print -n "Rejecting encodings: $rejected_encodings"
	print
	if [ "$possible_encodings" = "" ]
	then
		print "Nothing to do in the encoding lab!"
		return 1
	fi
	typeset selected_encoding=""
	typeset commit=""	# extra menu entries, offered once an encoding is selected
	while true
	do
		print "Current encoding is: ${selected_encoding:-not set}"
		select encoding in $possible_encodings $commit help cancel
		do
			if [ "$encoding" = "" ]
			then
				print "Unknown action."
				continue
			fi
			case "$encoding" in
				cancel)
					return 1
					;;
				commit)
					if mv "$file" "$undo"
					then
						iconv -f "$selected_encoding" -t "utf-8" "$undo" > "$file"
						return 0
					fi
					return 1
					;;
				help)
					cat << EOF
The Encoding Lab allows you to quickly try different character encodings,
select the correct one, and convert the file you're working with into
Unicode (UTF-8 encoded).

Type the number of the encoding you want to try.  The file will be converted
to that encoding, then the differences between the two file shown.  If
the file is correct (look to smart quotes, umlauts, accent graves, and other
special characters) then commit; otherwise, try a different encoding.

What are these encodings?  In the old days, computers had 256 characters
to work with.  Different regions (say, Russia) had different character
sets to accommodate the special characters necessary in their language,
all using the same 256 values (0-255, or 0 .. 2^8-1) for their encodings.

China, Japan, and Korea (which have hundreds of characters, more than the
256 old-days limit) came up with their own systems, not even compatible
with each other.

Nothing was compatible, and it sucked; it was a tower of Babel.

Then Unicode came, and the way was clear.  The Unicode character set
accommodates every language in use.  Documents written in Unicode thus
eliminate the per-country encoding problem, although there are a handful
of ways to transliterate Unicode characters into byte sequences for
machines-- which is the encoding.  UTF-8 is a common way of encoding
Unicode characters using byte sequences that are compatible with older
software, and thus UNIX friendly.  Unicode UTF-8 is also the most common
Unicode encoding on the World Wide Web.
EOF
				;;
				delete)
					# Rebuild the list without the selected
					# encoding.  Compared as plain strings, not
					# through sed: names may contain "/" (which
					# would break an s/// command) and the last
					# entry has no trailing blank to match on.
					commit=""
					remaining=""
					for candidate in $possible_encodings
					do
						if [ "$candidate" != "$selected_encoding" ]
						then
							remaining="$remaining $candidate"
						fi
					done
					possible_encodings="$remaining"
					selected_encoding=""
					;;
				*)
					# Preview: convert with the chosen encoding
					# and show how it differs from the original.
					commit="commit delete"
					selected_encoding="$encoding"
					iconv -f "$encoding" -t "utf-8" "$file" > "$temp"
					diff "$file" "$temp"
					rm -f "$temp"
					;;
			esac
			# Leave select so the outer loop redraws the menu with
			# the updated entries.
			break
		done
	done
}




function show_file_head {
	# Prints the beginning of a file as a preview.
	#   $1  file to show
	# Starts with 20 lines and shrinks the preview one line at a time
	# (never below 5) until it holds at most 150 words.  A shortened
	# preview is wrapped at 78 columns; a full 20-line one is printed
	# as is.
	typeset file="$1"
	typeset length=20 words

	while [ $length -gt 5 ]
	do
		words=$(head -n $length "$file" | wc -w)
		if [ $words -le 150 ]
		then
			break
		fi
		length=$((length - 1))
	done

	if [ $length -eq 20 ]
	then
		head -n $length "$file"
	else
		head -n $length "$file" | fold -s -w 78
	fi
}



function edit_file
{
	# Interactive editing session for one file.
	#   $1  file to edit
	# Works on a copy ($temp); the version before the last edit is kept
	# in $undo.  The original is only written by the "commit" action.
	# $temp, $undo and $html are the script-level scratch paths.
	# Returns 0 on commit or confirmed quit, 1 when the working copies
	# cannot be created.
	typeset quit=false	# true after a first "quit"; a second one confirms
	typeset file="$1"
	typeset action

	# Without the working copies there is nothing to edit; stop here
	# instead of entering the menu loop with missing files.
	if ! cp "$file" "$temp" || ! cp "$file" "$undo"
	then
		print -r -- "$file: cannot create working copies." >&2
		return 1
	fi
	# iconv succeeds only if the file is valid UTF-8.
	if ! iconv -f utf-8 -t utf-8 "$file" > /dev/null
	then
		print "$file: This file is not Unicode UTF-8 encoded."
		encoding_lab "$temp" "$undo"
	fi
	while true
	do
		show_file_head "$temp"
		print
		select action in "open-textedit" "edit-vi" "render-multimarkdown" "combine-indented-paragraphs" "combine-paragraphs-separated-by-blanks" "remove-double-spacing" "demangle-weird-characters" "add-smart-quotes" "encoding-lab" "fold" "diff" "meld" "undo" "commit" "revert" "quit"
		do
			if [ "$action" = "" ]
			then
				print "Unknown action."
				continue
			fi
			if [ "$action" = "quit" ] && [ "$quit" = "true" ]
			then
				return 0
			fi
			# Any other choice cancels a pending quit confirmation.
			quit=false
			# "continue" stays in the menu; falling through to
			# "break" leaves select so the outer loop shows the
			# file head again.
			case "$action" in
				open-textedit)
					# open -e "$temp"
					xed "$temp" &
					;;
				edit-vi)
					vi "$temp"
					;;
				render-multimarkdown)
					multimarkdown "$temp" > "$html" && open "$html"
					;;
				undo)
					cp "$undo" "$temp"
					;;
				commit)
					cp "$temp" "$file"
					return 0
					;;
				diff)
					diff "$file" "$temp"
					continue
					;;
				meld)
					meld "$file" "$temp" &
					continue
					;;
				revert)
					# The working copy becomes the undo
					# version, then the original is reloaded.
					mv "$temp" "$undo" &&
						cp "$file" "$temp"
					;;
				encoding-lab)
					encoding_lab "$temp" "$undo"
					;;
				quit)
					quit=true
					print "ARE YOU SURE? Quit again to confirm."
					continue
					;;
				*)
					# Every remaining menu entry is an edit
					# action: keep the current text as the
					# undo version and write the edited
					# result as the new working copy.
					if mv "$temp" "$undo"
					then
						perform_edit "$action" "$undo" > "$temp"
					fi
					;;
			esac
			break
		done
	done
}



# Command-line driver.
#   no arguments         show usage, exit 1
#   -?                   show usage, exit 0
#   -<action>            select the edit action for the files that follow
#   file (no action)     interactive session on that file
#   file (after action)  apply the action, result on standard output
#   -<action> alone      run as a filter on standard input
if [ $# -eq 0 ]
then
	usage
	exit 1
fi

# Start from a known state: without this an "action" variable inherited
# from the environment would be taken for a command-line option.
action=""
for file in "$@"
do
	case "$file" in
		"-?")
			usage
			exit 0
			;;
		-*)
			action="${file#-}"
			;;
		*)
			if [ -z "$action" ]
			then
				# A failed session or edit is recorded so that
				# the script's exit status reflects it.
				edit_file "$file" || status=1
				action=""
			else
				perform_edit "$action" "$file" || status=1
			fi
			;;
	esac
done

# A lone edit option with no file: act as a filter.
if [ $# -eq 1 ] && [ -n "$action" ]
then
	perform_edit "$action" /dev/stdin || status=1
fi

rm -f "$temp" "$undo" "$html"

exit "${status:-0}"
