Re: Unicode character decomposition Unix library?
- From: SM Ryan <wyrmwif@xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx>
- Date: Wed, 28 Nov 2007 21:34:13 -0000
William Ahern <william@xxxxxxxxxxxxxxxxxxxxxxxxx> wrote:
# SM Ryan <wyrmwif@xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx> wrote:
# > Is there a library/whatever available on Unices to do
# > Unicode character decomposition normalisation?
#
# http://icu-project.org/
#
# iconv, a library for characterset conversion likely already installed on
# your system, may be capable of this. I'm unsure.
I found out about the Unicode data file and wrote a Tcl script that generates
a C function NFD that converts a UTF-8 string to an NFD UTF-8 string.
Here it is in case anyone else is interested (this normalises for MacOSX paths).
./nfd.tcl <.../UnicodeData.txt >nfd.c
nfd.tcl:
=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
#!/usr/bin/tclsh
# utf codes --
#   Encode a list of Unicode code points as UTF-8.
#
#   codes   list of integer code points (decimal or 0x-prefixed hex)
#
#   Returns a flat list of unsigned byte values (0..255), the UTF-8
#   encodings of the code points concatenated in order.
#
#   The bytes are computed arithmetically instead of going through
#   [format %c] and [encoding convertto utf-8]: Tcl builds whose internal
#   character type is 16 bits wide cannot carry code points above U+FFFF
#   through a Tcl string, which would corrupt the entries for
#   supplementary-plane characters.  Doing the bit manipulation directly
#   gives the same result on every Tcl version.
proc utf codes {
    set utf {}
    foreach code $codes {
        if {$code < 0x80} {
            # One byte: 0xxxxxxx
            lappend utf [expr {$code + 0}]
        } elseif {$code < 0x800} {
            # Two bytes: 110xxxxx 10xxxxxx
            lappend utf \
                [expr {0xC0 | ($code >> 6)}] \
                [expr {0x80 | ($code & 0x3F)}]
        } elseif {$code < 0x10000} {
            # Three bytes: 1110xxxx 10xxxxxx 10xxxxxx
            lappend utf \
                [expr {0xE0 | ($code >> 12)}] \
                [expr {0x80 | (($code >> 6) & 0x3F)}] \
                [expr {0x80 | ($code & 0x3F)}]
        } else {
            # Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
            lappend utf \
                [expr {0xF0 | ($code >> 18)}] \
                [expr {0x80 | (($code >> 12) & 0x3F)}] \
                [expr {0x80 | (($code >> 6) & 0x3F)}] \
                [expr {0x80 | ($code & 0x3F)}]
        }
    }
    return $utf
}
# Build the global byte trie `choice` from UnicodeData.txt on stdin.
#
# Keys are Tcl lists "0 b1 b2 ..." (a leading 0 followed by the UTF-8 bytes
# consumed so far).  An interior node holds the list of possible next bytes;
# a leaf holds the C statement to emit, which always starts with "/*".
# The table is seeded so that a NUL byte at the start of a character pops
# the current string.
array set choice [list 0 0 {0 0} {/*end of string*/ POP; continue;}]

while {[gets stdin record] >= 0} {
    set fields [split $record ";"]
    if {[llength $fields] < 6} { continue }

    # Field 5 is the decomposition mapping; empty means no decomposition.
    set mapping [lindex $fields 5]
    if {[llength $mapping] == 0} { continue }

    # Field 0 is the code point in hex.
    set codepoint 0x[lindex $fields 0]
    set codepoint [expr {$codepoint}]

    # Accept only mappings made purely of hex code points; anything else
    # in the field (such as a <tag>) causes the record to be skipped.
    set targets {}
    set usable 1
    foreach hex $mapping {
        if {![string is xdigit -strict $hex]} {
            set usable 0
            break
        }
        set hex 0x$hex
        lappend targets [expr {$hex}]
    }
    if {!$usable} { continue }

    # Walk the trie along the source character's UTF-8 bytes, registering
    # each byte with its parent the first time that path is seen.
    set path 0
    foreach octet [utf $codepoint] {
        set child [concat $path $octet]
        if {![info exists choice($child)]} {
            lappend choice($path) $octet
        }
        set path $child
    }

    # Render the replacement bytes as a C string body of \xHH escapes.
    set literal ""
    foreach octet [utf $targets] {
        append literal [format {\x%02X} $octet]
    }

    # Leaf: push the replacement so it is itself scanned for decomposition.
    set choice($path) [format {/*%s*/ PUSH("%s"); continue;} [lindex $fields 1] $literal]
}
# choose space x --
#   Recursively print the C decision code for trie node x of the global
#   array `choice`.
#
#   space   indentation prefix for the emitted lines
#   x       node key: a list "0 b1 b2 ..." whose length minus one is the
#           number of input bytes the generated code has consumed on the
#           way to this node
#
#   Three shapes are emitted:
#     - leaf (value starts with "/"): the stored C statement, verbatim;
#     - a run of single-child nodes: one `if` comparing the following
#       bytes in place, with an `else` that rewinds and emits one byte;
#     - a branching node: a `switch` on the next byte, whose default
#       rewinds and emits one byte.
#
#   Fixes relative to the earlier version:
#     - the `if` form now has an `else` fallback.  Without it a failed
#       comparison fell through into the following `case` label (matching
#       the wrong character) or, at top level, looped without advancing;
#     - the single-child run stops as soon as a node has more than one
#       child, so the siblings of that node are no longer dropped.
proc choose {space x} {
    global choice
    # Captured before x is extended below: bytes consumed so far is n-1.
    set n [llength $x]
    set space1 "$space "
    if {[string index $choice($x) 0] eq "/"} {
        puts ${space}$choice($x)
    } elseif {[llength $choice($x)]==1} {
        set sep "${space}if ("
        set i 0
        # Follow the chain only while it is unambiguous; the leaf test comes
        # first so that llength is never applied to a leaf's C text.
        while {[string index $choice($x) 0] ne "/" && [llength $choice($x)]==1} {
            set c [lindex $choice($x) 0]
            puts -nonewline "${sep}STRING\[$i\]==0x[format %02X $c]"
            lappend x $c
            set sep { && }
            incr i
        }
        puts ") \{"
        puts "${space1}STRING += $i;"
        choose $space1 $x
        puts "${space}\} else \{"
        # The if only peeked, so rewind just the bytes consumed by the
        # enclosing switches, then pass one byte through unchanged.
        puts "${space1}STRING -= [expr {$n-1}]; EMIT(*STRING++); continue;"
        puts "${space}\}"
    } else {
        puts "${space}switch (*STRING++) \{"
        foreach c $choice($x) {
            puts "$space case 0x[format %02X $c]:"
            choose $space1 [concat $x $c]
        }
        # The switch itself consumed one more byte, hence n rather than n-1.
        puts "$space default: STRING -= $n; EMIT(*STRING++); continue;"
        puts "${space}\}"
    }
}
# Emit the C source of NFD() on stdout: a fixed prologue, the decision code
# generated from the `choice` trie by [choose], and a fixed epilogue.
#
# Generated function: char *NFD(char *utf)
#   - keeps a stack of strings being scanned (the input, plus each
#     replacement pushed by a trie leaf, so replacements are scanned too);
#   - collects output bytes on a reverse-linked chain, then copies them
#     back to front into a buffer of n bytes obtained from MALLOC;
#   - MALLOC and FREE default to malloc and free unless already defined.
#
# <stdlib.h> is now included by the generated file so that the default
# malloc/free are properly declared; without a prototype, malloc is
# implicitly declared as returning int.
#
# NOTE(review): PUSH sets `changed`, and that includes the initial
# PUSH(utf), so for a non-NULL argument the function always returns a newly
# allocated string; `return utf` is reached only when utf is NULL.  This is
# left as is because callers may rely on owning the result.
# NOTE(review): the results of MALLOC are not checked for NULL.
puts "
#include <stdlib.h>
char *NFD(char *utf) \{
typedef struct Chain *Chain; struct Chain {char ch; Chain prev;};
typedef struct Stack *Stack; struct Stack {unsigned char *st; Stack under;};
Chain chain = 0; int changed = 0; int n = 1; char *result; Stack stack = 0;
#ifndef MALLOC
#define MALLOC malloc
#endif
#ifndef FREE
#define FREE free
#endif
#define PUSH(s) {Stack t = MALLOC(sizeof(struct Stack)); \\
t->st = (unsigned char*)(s); t->under = stack; stack = t; changed = 1;}
#define POP {Stack t = stack->under; FREE(stack); stack = t;}
#define EMIT(c) {Chain t = MALLOC(sizeof(struct Chain)); \\
t->ch = (c); t->prev = chain; chain = t; n++;}
#define STRING (stack->st)
if (utf) PUSH(utf);
while (stack) \{"
choose " " 0
puts " \}
if (!changed) {
while (chain) {Chain t = chain->prev; FREE(chain); chain = t;}
return utf;
}
result = MALLOC(n); result\[--n\] = 0;
while (chain) {
Chain t = chain->prev; result\[--n\] = chain->ch;
FREE(chain); chain = t;
}
return result;
#undef MALLOC
#undef FREE
#undef PUSH
#undef POP
#undef EMIT
#undef STRING
\}
"
=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
--
SM Ryan http://www.rawbw.com/~wyrmwif/
Who's leading this mob?
.
- References:
- Re: Unicode character decomposition Unix library?
- From: William Ahern
- Re: Unicode character decomposition Unix library?
- Prev by Date: Re: c printf/variables content visible in compiled binary. How to hide that ?
- Next by Date: Re: c printf/variables content visible in compiled binary. How to hide that ?
- Previous by thread: Re: Unicode character decomposition Unix library?
- Next by thread: Design of x-window system
- Index(es):