#! /bin/sh

# transform HTML character mnemonic reference web page into mnemonic entries
#	<td class="code"><code>&amp;szlig;</code></td>
#	<td class="code"><code>&amp;#223;</code></td>
# ->
#		{"szlig", 0x00DF /* ß */},
#	<td class="code"><code>&amp;sbquo;</code> (ist korrekt, nicht &amp;bsquo;)</td>

# Input file: the first command-line argument when one is given (even an
# empty one), otherwise the file "charref" in the current directory.
if [ "$#" -ge 1 ]; then
	refpage=$1
else
	refpage=charref
fi

# Header comment of the generated table.  The commented-out lines are
# earlier/alternative headers kept for reference.
#htmlver=`sed -e '1 s,.*HTML \([0-9.]*\).*,\1,' -e t -e d $refpage`
#echo "	/* HTML $htmlver mnemos (named entities) */"
printf '%s\n' "	/* HTML mnemos (named characters, former charref) */"
#echo "	/* https://html.spec.whatwg.org/multipage/named-characters.html */"

#<tr title="U+000A3 POUND SIGN"
# data-block="C1 Controls and Latin-1 Supplement"
# data-category="Sc"
# data-set="xhtml1-lat1 9573-2003-isonum">
#<td class="character"> &#x000A3;<td class="named"><code>&amp;pound;</code>
#<td class="hex"><code>&amp;#x000A3;</code>
#<td class="dec"><code>&amp;#163;</code><td class="desc">POUND SIGN

# Each matching table row (see the sample row above) is first reduced to
# "HEX;NAME", e.g. "000A3;pound", and then formatted as
#	{"pound", 0x00A3 /* U+00A3 */},
# The page content is never turned into shell commands, so unusual
# characters in a name (quotes, backslashes) cannot be executed.

# extract_charrefs: filter; reads the reference page on stdin and writes one
# "HEX;NAME" line for every line that starts with "<tr" and contains both a
# "named" cell (&amp;NAME;) and a "hex" cell (&amp;#xHEX;).  All other lines
# are dropped.  NAME is matched by [^#][^;]* and therefore never contains
# ';', which makes ';' a safe field separator.
extract_charrefs() {
	sed -n -e 's:^<tr.*<td class="named"><code>&amp;\([^#][^;]*\).*<td class="hex"><code>&amp;#x\([0-9A-Fa-f]*\).*:\2;\1:p'
}

# format_entries: filter; reads "HEX;NAME" lines on stdin and writes one
# table entry per line.  The code point is printed with at least four
# upper-case hex digits, both as the value and as a "U+XXXX" comment.
format_entries() {
	while IFS=';' read -r hex name; do
		printf '\t{"%s", 0x%04X /* U+%04X */},\n' "$name" "0x$hex" "0x$hex"
	done
}

# ./insutf8 is a separate tool of this project; presumably it adds the UTF-8
# character after each "U+XXXX " (compare the sample output at the top of
# the file) -- TODO confirm.  The following sed then removes the first
# "U+XXXX " of each line, and the last sed puts a literal '&' in front of
# the listed names (the reason for this is not visible in this script).
extract_charrefs < "$refpage" |
format_entries |
./insutf8 | sed -e 's,U+[^ ]* ,,' |
sed	-e 's,"not","\&not",' \
	-e 's,"oslash","\&oslash",' \
	-e 's,"circ","\&circ",' \
	-e 's,"phi","\&phi",' \
	-e 's,"asymp","\&asymp",' \
	-e 's,"Or","\&Or",' \
	-e 's,"Sc","\&Sc",' \
	-e 's,"amalg","\&amalg",' \
	-e 's,"ast","\&ast",' \
	-e 's,"cdot","\&cdot",' \
	-e 's,"jmath","\&jmath",' \
	-e 's,"models","\&models",' \
	-e 's,"num","\&num",' \
	-e 's,"sc","\&sc",' \
	-e 's,"star","\&star",' \
	-e 's,"wcirc","\&wcirc",'

# Reminder for the maintainer; written to stderr so that it does not end up
# in the generated table on stdout.
printf '%s\n' "check uniqueness with ./mkmnemocheck" >&2

# Stop here; the exit status is that of the reminder above.
exit

# The code below is not reached because the script exits above.
# It would transform the recent version of the page instead,
# https://html.spec.whatwg.org/multipage/named-characters.html
# but that version contains some incompatible changes, and not for the better,
# e.g. "ohm" changed from Ω U+2126 OHM SIGN to Ω U+03A9 CAPITAL LETTER OMEGA,
# so we refrain from switching for now

#<tr id=entity-pound><td> <code>pound;</code> <td> U+000A3 <td> <span class=glyph>£</span> 
#	 aelig; 	 U+000E6 	 æ 
#	 amp; 	 U+00026 	 &amp; 

# whatwg_entries: filter; reads the named-characters page on stdin and
# writes one mnemonic entry per table row to stdout, e.g.
#	{"amp", 0x0026 /* & */},
# Stages of the pipeline:
#  1. start a new line in front of every "<tr" and keep only the lines that
#     begin with "<tr" (one table row per line)
#  2. turn each <td> into a tab, drop all remaining tags, decode &amp; and &lt;
#  3. rewrite the leading "<tab>name;<tab>U+XXXXX<tab>" into the entry head
#     '{"name", U+XXXXX /* '; lines without these fields are deleted
#  4. append the closing "*/},"
#  5. turn "U+0XXXX" into "0xXXXX" and any other "U+" into "0x", then
#     collapse double blanks
# note: cannot match the glyph as \3 in stage 3 as sed fails on non-BMP
# characters in cygwin; the rest of the line is left in place instead and
# stage 4 only appends to it
# NOTE(review): "\n" in a sed replacement (stage 1) is not guaranteed by
# POSIX; GNU sed inserts a newline there.
whatwg_entries() {
	sed -e 's,<tr,\n<tr,g' | grep '^<tr' |
	sed -e 's,<td>,	,g' -e 's,<[^>]*>,,g' -e 's,&amp;,\&,g' -e 's,&lt;,<,g' |
	sed -e 's:	 *\([^; 	]*\);* *	 *\(U+[^	]*\)	 *:	{"\1", \2 /* :' -e t -e d |
	sed -e 's:$:*/},:' |
	sed -e 's,U+0,0x,g' -e 's,U+,0x,g' -e 's,  , ,g'
}

whatwg_entries < "$refpage"

# Reminder for the maintainer; written to stderr so that it does not end up
# in the generated table on stdout.
printf '%s\n' "check uniqueness with ./mkmnemocheck" >&2

# Stop here; the exit status is that of the reminder above.
exit

