| /* |
| * Copyright (c) 2014 SGI. |
| * All rights reserved. |
| * |
| * This program is free software; you can redistribute it and/or |
| * modify it under the terms of the GNU General Public License as |
| * published by the Free Software Foundation. |
| * |
| * This program is distributed in the hope that it would be useful, |
| * but WITHOUT ANY WARRANTY; without even the implied warranty of |
| * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| * GNU General Public License for more details. |
| * |
| * You should have received a copy of the GNU General Public License |
| * along with this program; if not, write the Free Software Foundation, |
| * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA |
| */ |
| |
| /* Generator for a compact trie for unicode normalization */ |
| |
| #include <sys/types.h> |
| #include <stddef.h> |
| #include <stdlib.h> |
| #include <stdio.h> |
| #include <assert.h> |
| #include <string.h> |
| #include <unistd.h> |
| #include <errno.h> |
| |
| /* Default names of the in- and output files. */ |
| |
| #define AGE_NAME "DerivedAge.txt" |
| #define CCC_NAME "DerivedCombiningClass.txt" |
| #define PROP_NAME "DerivedCoreProperties.txt" |
| #define DATA_NAME "UnicodeData.txt" |
| #define FOLD_NAME "CaseFolding.txt" |
| #define NORM_NAME "NormalizationCorrections.txt" |
| #define TEST_NAME "NormalizationTest.txt" |
| #define UTF8_NAME "utf8data.h" |
| |
| const char *age_name = AGE_NAME; |
| const char *ccc_name = CCC_NAME; |
| const char *prop_name = PROP_NAME; |
| const char *data_name = DATA_NAME; |
| const char *fold_name = FOLD_NAME; |
| const char *norm_name = NORM_NAME; |
| const char *test_name = TEST_NAME; |
| const char *utf8_name = UTF8_NAME; |
| |
| int verbose = 0; |
| |
| /* An arbitrary line size limit on input lines. */ |
| |
| #define LINESIZE 1024 |
| char line[LINESIZE]; |
| char buf0[LINESIZE]; |
| char buf1[LINESIZE]; |
| char buf2[LINESIZE]; |
| char buf3[LINESIZE]; |
| |
| const char *argv0; |
| |
| #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) |
| |
| /* ------------------------------------------------------------------ */ |
| |
| /* |
| * Unicode version numbers consist of three parts: major, minor, and a |
| * revision. These numbers are packed into an unsigned int to obtain |
| * a single version number. |
| * |
| * To save space in the generated trie, the unicode version is not |
| * stored directly, instead we calculate a generation number from the |
| * unicode versions seen in the DerivedAge file, and use that as an |
| * index into a table of unicode versions. |
| */ |
| #define UNICODE_MAJ_SHIFT (16) |
| #define UNICODE_MIN_SHIFT (8) |
| |
| #define UNICODE_MAJ_MAX ((unsigned short)-1) |
| #define UNICODE_MIN_MAX ((unsigned char)-1) |
| #define UNICODE_REV_MAX ((unsigned char)-1) |
| |
| #define UNICODE_AGE(MAJ,MIN,REV) \ |
| (((unsigned int)(MAJ) << UNICODE_MAJ_SHIFT) | \ |
| ((unsigned int)(MIN) << UNICODE_MIN_SHIFT) | \ |
| ((unsigned int)(REV))) |
| |
| unsigned int *ages; |
| int ages_count; |
| |
| unsigned int unicode_maxage; |
| |
| static int age_valid(unsigned int major, unsigned int minor, |
| unsigned int revision) |
| { |
| if (major > UNICODE_MAJ_MAX) |
| return 0; |
| if (minor > UNICODE_MIN_MAX) |
| return 0; |
| if (revision > UNICODE_REV_MAX) |
| return 0; |
| return 1; |
| } |
| |
| /* ------------------------------------------------------------------ */ |
| |
| /* |
| * utf8trie_t |
| * |
| * A compact binary tree, used to decode UTF-8 characters. |
| * |
| * Internal nodes are one byte for the node itself, and up to three |
| * bytes for an offset into the tree. The first byte contains the |
| * following information: |
| * NEXTBYTE - flag - advance to next byte if set |
| * BITNUM - 3 bit field - the bit number to tested |
| * OFFLEN - 2 bit field - number of bytes in the offset |
| * if offlen == 0 (non-branching node) |
| * RIGHTPATH - 1 bit field - set if the following node is for the |
| * right-hand path (tested bit is set) |
| * TRIENODE - 1 bit field - set if the following node is an internal |
| * node, otherwise it is a leaf node |
| * if offlen != 0 (branching node) |
| * LEFTNODE - 1 bit field - set if the left-hand node is internal |
| * RIGHTNODE - 1 bit field - set if the right-hand node is internal |
| * |
| * Due to the way utf8 works, there cannot be branching nodes with |
| * NEXTBYTE set, and moreover those nodes always have a righthand |
| * descendant. |
| */ |
| typedef unsigned char utf8trie_t; |
| #define BITNUM 0x07 |
| #define NEXTBYTE 0x08 |
| #define OFFLEN 0x30 |
| #define OFFLEN_SHIFT 4 |
| #define RIGHTPATH 0x40 |
| #define TRIENODE 0x80 |
| #define RIGHTNODE 0x40 |
| #define LEFTNODE 0x80 |
| |
| /* |
| * utf8leaf_t |
| * |
| * The leaves of the trie are embedded in the trie, and so the same |
| * underlying datatype, unsigned char. |
| * |
| * leaf[0]: The unicode version, stored as a generation number that is |
| * an index into utf8agetab[]. With this we can filter code |
| * points based on the unicode version in which they were |
| * defined. The CCC of a non-defined code point is 0. |
| * leaf[1]: Canonical Combining Class. During normalization, we need |
| * to do a stable sort into ascending order of all characters |
| * with a non-zero CCC that occur between two characters with |
| * a CCC of 0, or at the begin or end of a string. |
| * The unicode standard guarantees that all CCC values are |
| * between 0 and 254 inclusive, which leaves 255 available as |
| * a special value. |
| * Code points with CCC 0 are known as stoppers. |
| * leaf[2]: Decomposition. If leaf[1] == 255, then leaf[2] is the |
| * start of a NUL-terminated string that is the decomposition |
| * of the character. |
| * The CCC of a decomposable character is the same as the CCC |
| * of the first character of its decomposition. |
| * Some characters decompose as the empty string: these are |
| * characters with the Default_Ignorable_Code_Point property. |
| * These do affect normalization, as they all have CCC 0. |
| * |
| * The decompositions in the trie have been fully expanded. |
| * |
| * Casefolding, if applicable, is also done using decompositions. |
| */ |
| typedef unsigned char utf8leaf_t; |
| |
| #define LEAF_GEN(LEAF) ((LEAF)[0]) |
| #define LEAF_CCC(LEAF) ((LEAF)[1]) |
| #define LEAF_STR(LEAF) ((const char*)((LEAF) + 2)) |
| |
| #define MAXGEN (255) |
| |
| #define MINCCC (0) |
| #define MAXCCC (254) |
| #define STOPPER (0) |
| #define DECOMPOSE (255) |
| #define HANGUL ((char)(255)) |
| |
| #define UTF8HANGULLEAF (12) |
| |
| struct tree; |
| static utf8leaf_t *utf8nlookup(struct tree *, unsigned char *, |
| const char *, size_t); |
| static utf8leaf_t *utf8lookup(struct tree *, unsigned char *, const char *); |
| |
| unsigned char *utf8data; |
| size_t utf8data_size; |
| |
| utf8trie_t *nfdi; |
| utf8trie_t *nfdicf; |
| |
| /* ------------------------------------------------------------------ */ |
| |
| /* |
| * UTF8 valid ranges. |
| * |
| * The UTF-8 encoding spreads the bits of a 32bit word over several |
| * bytes. This table gives the ranges that can be held and how they'd |
| * be represented. |
| * |
| * 0x00000000 0x0000007F: 0xxxxxxx |
| * 0x00000000 0x000007FF: 110xxxxx 10xxxxxx |
| * 0x00000000 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx |
| * 0x00000000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx |
| * 0x00000000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx |
| * 0x00000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx |
| * |
| * There is an additional requirement on UTF-8, in that only the |
| * shortest representation of a 32bit value is to be used. A decoder |
| * must not decode sequences that do not satisfy this requirement. |
| * Thus the allowed ranges have a lower bound. |
| * |
| * 0x00000000 0x0000007F: 0xxxxxxx |
| * 0x00000080 0x000007FF: 110xxxxx 10xxxxxx |
| * 0x00000800 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx |
| * 0x00010000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx |
| * 0x00200000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx |
| * 0x04000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx |
| * |
| * Actual unicode characters are limited to the range 0x0 - 0x10FFFF, |
| * 17 planes of 65536 values. This limits the sequences actually seen |
| * even more, to just the following. |
| * |
| * 0 - 0x7f: 0 0x7f |
| * 0x80 - 0x7ff: 0xc2 0x80 0xdf 0xbf |
| * 0x800 - 0xffff: 0xe0 0xa0 0x80 0xef 0xbf 0xbf |
| * 0x10000 - 0x10ffff: 0xf0 0x90 0x80 0x80 0xf4 0x8f 0xbf 0xbf |
| * |
| * Even within those ranges not all values are allowed: the surrogates |
| * 0xd800 - 0xdfff should never be seen. |
| * |
| * Note that the longest sequence seen with valid usage is 4 bytes, |
| * the same a single UTF-32 character. This makes the UTF-8 |
| * representation of Unicode strictly smaller than UTF-32. |
| * |
| * The shortest sequence requirement was introduced by: |
| * Corrigendum #1: UTF-8 Shortest Form |
| * It can be found here: |
| * http://www.unicode.org/versions/corrigendum1.html |
| * |
| */ |
| |
| #define UTF8_2_BITS 0xC0 |
| #define UTF8_3_BITS 0xE0 |
| #define UTF8_4_BITS 0xF0 |
| #define UTF8_N_BITS 0x80 |
| #define UTF8_2_MASK 0xE0 |
| #define UTF8_3_MASK 0xF0 |
| #define UTF8_4_MASK 0xF8 |
| #define UTF8_N_MASK 0xC0 |
| #define UTF8_V_MASK 0x3F |
| #define UTF8_V_SHIFT 6 |
| |
| static int utf8encode(char *str, unsigned int val) |
| { |
| int len; |
| |
| if (val < 0x80) { |
| str[0] = val; |
| len = 1; |
| } else if (val < 0x800) { |
| str[1] = val & UTF8_V_MASK; |
| str[1] |= UTF8_N_BITS; |
| val >>= UTF8_V_SHIFT; |
| str[0] = val; |
| str[0] |= UTF8_2_BITS; |
| len = 2; |
| } else if (val < 0x10000) { |
| str[2] = val & UTF8_V_MASK; |
| str[2] |= UTF8_N_BITS; |
| val >>= UTF8_V_SHIFT; |
| str[1] = val & UTF8_V_MASK; |
| str[1] |= UTF8_N_BITS; |
| val >>= UTF8_V_SHIFT; |
| str[0] = val; |
| str[0] |= UTF8_3_BITS; |
| len = 3; |
| } else if (val < 0x110000) { |
| str[3] = val & UTF8_V_MASK; |
| str[3] |= UTF8_N_BITS; |
| val >>= UTF8_V_SHIFT; |
| str[2] = val & UTF8_V_MASK; |
| str[2] |= UTF8_N_BITS; |
| val >>= UTF8_V_SHIFT; |
| str[1] = val & UTF8_V_MASK; |
| str[1] |= UTF8_N_BITS; |
| val >>= UTF8_V_SHIFT; |
| str[0] = val; |
| str[0] |= UTF8_4_BITS; |
| len = 4; |
| } else { |
| printf("%#x: illegal val\n", val); |
| len = 0; |
| } |
| return len; |
| } |
| |
| static unsigned int utf8decode(const char *str) |
| { |
| const unsigned char *s = (const unsigned char*)str; |
| unsigned int unichar = 0; |
| |
| if (*s < 0x80) { |
| unichar = *s; |
| } else if (*s < UTF8_3_BITS) { |
| unichar = *s++ & 0x1F; |
| unichar <<= UTF8_V_SHIFT; |
| unichar |= *s & 0x3F; |
| } else if (*s < UTF8_4_BITS) { |
| unichar = *s++ & 0x0F; |
| unichar <<= UTF8_V_SHIFT; |
| unichar |= *s++ & 0x3F; |
| unichar <<= UTF8_V_SHIFT; |
| unichar |= *s & 0x3F; |
| } else { |
| unichar = *s++ & 0x0F; |
| unichar <<= UTF8_V_SHIFT; |
| unichar |= *s++ & 0x3F; |
| unichar <<= UTF8_V_SHIFT; |
| unichar |= *s++ & 0x3F; |
| unichar <<= UTF8_V_SHIFT; |
| unichar |= *s & 0x3F; |
| } |
| return unichar; |
| } |
| |
| static int utf32valid(unsigned int unichar) |
| { |
| return unichar < 0x110000; |
| } |
| |
| #define HANGUL_SYLLABLE(U) ((U) >= 0xAC00 && (U) <= 0xD7A3) |
| |
| #define NODE 1 |
| #define LEAF 0 |
| |
| struct tree { |
| void *root; |
| int childnode; |
| const char *type; |
| unsigned int maxage; |
| struct tree *next; |
| int (*leaf_equal)(void *, void *); |
| void (*leaf_print)(void *, int); |
| int (*leaf_mark)(void *); |
| int (*leaf_size)(void *); |
| int *(*leaf_index)(struct tree *, void *); |
| unsigned char *(*leaf_emit)(void *, unsigned char *); |
| int leafindex[0x110000]; |
| int index; |
| }; |
| |
| struct node { |
| int index; |
| int offset; |
| int mark; |
| int size; |
| struct node *parent; |
| void *left; |
| void *right; |
| unsigned char bitnum; |
| unsigned char nextbyte; |
| unsigned char leftnode; |
| unsigned char rightnode; |
| unsigned int keybits; |
| unsigned int keymask; |
| }; |
| |
| /* |
| * Example lookup function for a tree. |
| */ |
| static void *lookup(struct tree *tree, const char *key) |
| { |
| struct node *node; |
| void *leaf = NULL; |
| |
| node = tree->root; |
| while (!leaf && node) { |
| if (node->nextbyte) |
| key++; |
| if (*key & (1 << (node->bitnum & 7))) { |
| /* Right leg */ |
| if (node->rightnode == NODE) { |
| node = node->right; |
| } else if (node->rightnode == LEAF) { |
| leaf = node->right; |
| } else { |
| node = NULL; |
| } |
| } else { |
| /* Left leg */ |
| if (node->leftnode == NODE) { |
| node = node->left; |
| } else if (node->leftnode == LEAF) { |
| leaf = node->left; |
| } else { |
| node = NULL; |
| } |
| } |
| } |
| |
| return leaf; |
| } |
| |
| /* |
| * A simple non-recursive tree walker: keep track of visits to the |
| * left and right branches in the leftmask and rightmask. |
| */ |
| static void tree_walk(struct tree *tree) |
| { |
| struct node *node; |
| unsigned int leftmask; |
| unsigned int rightmask; |
| unsigned int bitmask; |
| int indent = 1; |
| int nodes, singletons, leaves; |
| |
| nodes = singletons = leaves = 0; |
| |
| printf("%s_%x root %p\n", tree->type, tree->maxage, tree->root); |
| if (tree->childnode == LEAF) { |
| assert(tree->root); |
| tree->leaf_print(tree->root, indent); |
| leaves = 1; |
| } else { |
| assert(tree->childnode == NODE); |
| node = tree->root; |
| leftmask = rightmask = 0; |
| while (node) { |
| printf("%*snode @ %p bitnum %d nextbyte %d" |
| " left %p right %p mask %x bits %x\n", |
| indent, "", node, |
| node->bitnum, node->nextbyte, |
| node->left, node->right, |
| node->keymask, node->keybits); |
| nodes += 1; |
| if (!(node->left && node->right)) |
| singletons += 1; |
| |
| while (node) { |
| bitmask = 1 << node->bitnum; |
| if ((leftmask & bitmask) == 0) { |
| leftmask |= bitmask; |
| if (node->leftnode == LEAF) { |
| assert(node->left); |
| tree->leaf_print(node->left, |
| indent+1); |
| leaves += 1; |
| } else if (node->left) { |
| assert(node->leftnode == NODE); |
| indent += 1; |
| node = node->left; |
| break; |
| } |
| } |
| if ((rightmask & bitmask) == 0) { |
| rightmask |= bitmask; |
| if (node->rightnode == LEAF) { |
| assert(node->right); |
| tree->leaf_print(node->right, |
| indent+1); |
| leaves += 1; |
| } else if (node->right) { |
| assert(node->rightnode == NODE); |
| indent += 1; |
| node = node->right; |
| break; |
| } |
| } |
| leftmask &= ~bitmask; |
| rightmask &= ~bitmask; |
| node = node->parent; |
| indent -= 1; |
| } |
| } |
| } |
| printf("nodes %d leaves %d singletons %d\n", |
| nodes, leaves, singletons); |
| } |
| |
| /* |
| * Allocate an initialize a new internal node. |
| */ |
| static struct node *alloc_node(struct node *parent) |
| { |
| struct node *node; |
| int bitnum; |
| |
| node = malloc(sizeof(*node)); |
| node->left = node->right = NULL; |
| node->parent = parent; |
| node->leftnode = NODE; |
| node->rightnode = NODE; |
| node->keybits = 0; |
| node->keymask = 0; |
| node->mark = 0; |
| node->index = 0; |
| node->offset = -1; |
| node->size = 4; |
| |
| if (node->parent) { |
| bitnum = parent->bitnum; |
| if ((bitnum & 7) == 0) { |
| node->bitnum = bitnum + 7 + 8; |
| node->nextbyte = 1; |
| } else { |
| node->bitnum = bitnum - 1; |
| node->nextbyte = 0; |
| } |
| } else { |
| node->bitnum = 7; |
| node->nextbyte = 0; |
| } |
| |
| return node; |
| } |
| |
| /* |
| * Insert a new leaf into the tree, and collapse any subtrees that are |
| * fully populated and end in identical leaves. A nextbyte tagged |
| * internal node will not be removed to preserve the tree's integrity. |
| * Note that due to the structure of utf8, no nextbyte tagged node |
| * will be a candidate for removal. |
| */ |
| static int insert(struct tree *tree, char *key, int keylen, void *leaf) |
| { |
| struct node *node; |
| struct node *parent; |
| void **cursor; |
| int keybits; |
| |
| assert(keylen >= 1 && keylen <= 4); |
| |
| node = NULL; |
| cursor = &tree->root; |
| keybits = 8 * keylen; |
| |
| /* Insert, creating path along the way. */ |
| while (keybits) { |
| if (!*cursor) |
| *cursor = alloc_node(node); |
| node = *cursor; |
| if (node->nextbyte) |
| key++; |
| if (*key & (1 << (node->bitnum & 7))) |
| cursor = &node->right; |
| else |
| cursor = &node->left; |
| keybits--; |
| } |
| *cursor = leaf; |
| |
| /* Merge subtrees if possible. */ |
| while (node) { |
| if (*key & (1 << (node->bitnum & 7))) |
| node->rightnode = LEAF; |
| else |
| node->leftnode = LEAF; |
| if (node->nextbyte) |
| break; |
| if (node->leftnode == NODE || node->rightnode == NODE) |
| break; |
| assert(node->left); |
| assert(node->right); |
| /* Compare */ |
| if (! tree->leaf_equal(node->left, node->right)) |
| break; |
| /* Keep left, drop right leaf. */ |
| leaf = node->left; |
| /* Check in parent */ |
| parent = node->parent; |
| if (!parent) { |
| /* root of tree! */ |
| tree->root = leaf; |
| tree->childnode = LEAF; |
| } else if (parent->left == node) { |
| parent->left = leaf; |
| parent->leftnode = LEAF; |
| if (parent->right) { |
| parent->keymask = 0; |
| parent->keybits = 0; |
| } else { |
| parent->keymask |= (1 << node->bitnum); |
| } |
| } else if (parent->right == node) { |
| parent->right = leaf; |
| parent->rightnode = LEAF; |
| if (parent->left) { |
| parent->keymask = 0; |
| parent->keybits = 0; |
| } else { |
| parent->keymask |= (1 << node->bitnum); |
| parent->keybits |= (1 << node->bitnum); |
| } |
| } else { |
| /* internal tree error */ |
| assert(0); |
| } |
| free(node); |
| node = parent; |
| } |
| |
| /* Propagate keymasks up along singleton chains. */ |
| while (node) { |
| parent = node->parent; |
| if (!parent) |
| break; |
| /* Nix the mask for parents with two children. */ |
| if (node->keymask == 0) { |
| parent->keymask = 0; |
| parent->keybits = 0; |
| } else if (parent->left && parent->right) { |
| parent->keymask = 0; |
| parent->keybits = 0; |
| } else { |
| assert((parent->keymask & node->keymask) == 0); |
| parent->keymask |= node->keymask; |
| parent->keymask |= (1 << parent->bitnum); |
| parent->keybits |= node->keybits; |
| if (parent->right) |
| parent->keybits |= (1 << parent->bitnum); |
| } |
| node = parent; |
| } |
| |
| return 0; |
| } |
| |
| /* |
| * Prune internal nodes. |
| * |
| * Fully populated subtrees that end at the same leaf have already |
| * been collapsed. There are still internal nodes that have for both |
| * their left and right branches a sequence of singletons that make |
| * identical choices and end in identical leaves. The keymask and |
| * keybits collected in the nodes describe the choices made in these |
| * singleton chains. When they are identical for the left and right |
| * branch of a node, and the two leaves comare identical, the node in |
| * question can be removed. |
| * |
| * Note that nodes with the nextbyte tag set will not be removed by |
| * this to ensure tree integrity. Note as well that the structure of |
| * utf8 ensures that these nodes would not have been candidates for |
| * removal in any case. |
| */ |
| static void prune(struct tree *tree) |
| { |
| struct node *node; |
| struct node *left; |
| struct node *right; |
| struct node *parent; |
| void *leftleaf; |
| void *rightleaf; |
| unsigned int leftmask; |
| unsigned int rightmask; |
| unsigned int bitmask; |
| int count; |
| |
| if (verbose > 0) |
| printf("Pruning %s_%x\n", tree->type, tree->maxage); |
| |
| count = 0; |
| if (tree->childnode == LEAF) |
| return; |
| if (!tree->root) |
| return; |
| |
| leftmask = rightmask = 0; |
| node = tree->root; |
| while (node) { |
| if (node->nextbyte) |
| goto advance; |
| if (node->leftnode == LEAF) |
| goto advance; |
| if (node->rightnode == LEAF) |
| goto advance; |
| if (!node->left) |
| goto advance; |
| if (!node->right) |
| goto advance; |
| left = node->left; |
| right = node->right; |
| if (left->keymask == 0) |
| goto advance; |
| if (right->keymask == 0) |
| goto advance; |
| if (left->keymask != right->keymask) |
| goto advance; |
| if (left->keybits != right->keybits) |
| goto advance; |
| leftleaf = NULL; |
| while (!leftleaf) { |
| assert(left->left || left->right); |
| if (left->leftnode == LEAF) |
| leftleaf = left->left; |
| else if (left->rightnode == LEAF) |
| leftleaf = left->right; |
| else if (left->left) |
| left = left->left; |
| else if (left->right) |
| left = left->right; |
| else |
| assert(0); |
| } |
| rightleaf = NULL; |
| while (!rightleaf) { |
| assert(right->left || right->right); |
| if (right->leftnode == LEAF) |
| rightleaf = right->left; |
| else if (right->rightnode == LEAF) |
| rightleaf = right->right; |
| else if (right->left) |
| right = right->left; |
| else if (right->right) |
| right = right->right; |
| else |
| assert(0); |
| } |
| if (! tree->leaf_equal(leftleaf, rightleaf)) |
| goto advance; |
| /* |
| * This node has identical singleton-only subtrees. |
| * Remove it. |
| */ |
| parent = node->parent; |
| left = node->left; |
| right = node->right; |
| if (parent->left == node) |
| parent->left = left; |
| else if (parent->right == node) |
| parent->right = left; |
| else |
| assert(0); |
| left->parent = parent; |
| left->keymask |= (1 << node->bitnum); |
| node->left = NULL; |
| while (node) { |
| bitmask = 1 << node->bitnum; |
| leftmask &= ~bitmask; |
| rightmask &= ~bitmask; |
| if (node->leftnode == NODE && node->left) { |
| left = node->left; |
| free(node); |
| count++; |
| node = left; |
| } else if (node->rightnode == NODE && node->right) { |
| right = node->right; |
| free(node); |
| count++; |
| node = right; |
| } else { |
| node = NULL; |
| } |
| } |
| /* Propagate keymasks up along singleton chains. */ |
| node = parent; |
| /* Force re-check */ |
| bitmask = 1 << node->bitnum; |
| leftmask &= ~bitmask; |
| rightmask &= ~bitmask; |
| for (;;) { |
| if (node->left && node->right) |
| break; |
| if (node->left) { |
| left = node->left; |
| node->keymask |= left->keymask; |
| node->keybits |= left->keybits; |
| } |
| if (node->right) { |
| right = node->right; |
| node->keymask |= right->keymask; |
| node->keybits |= right->keybits; |
| } |
| node->keymask |= (1 << node->bitnum); |
| node = node->parent; |
| /* Force re-check */ |
| bitmask = 1 << node->bitnum; |
| leftmask &= ~bitmask; |
| rightmask &= ~bitmask; |
| } |
| advance: |
| bitmask = 1 << node->bitnum; |
| if ((leftmask & bitmask) == 0 && |
| node->leftnode == NODE && |
| node->left) { |
| leftmask |= bitmask; |
| node = node->left; |
| } else if ((rightmask & bitmask) == 0 && |
| node->rightnode == NODE && |
| node->right) { |
| rightmask |= bitmask; |
| node = node->right; |
| } else { |
| leftmask &= ~bitmask; |
| rightmask &= ~bitmask; |
| node = node->parent; |
| } |
| } |
| if (verbose > 0) |
| printf("Pruned %d nodes\n", count); |
| } |
| |
| /* |
| * Mark the nodes in the tree that lead to leaves that must be |
| * emitted. |
| */ |
| static void mark_nodes(struct tree *tree) |
| { |
| struct node *node; |
| struct node *n; |
| unsigned int leftmask; |
| unsigned int rightmask; |
| unsigned int bitmask; |
| int marked; |
| |
| marked = 0; |
| if (verbose > 0) |
| printf("Marking %s_%x\n", tree->type, tree->maxage); |
| if (tree->childnode == LEAF) |
| goto done; |
| |
| assert(tree->childnode == NODE); |
| node = tree->root; |
| leftmask = rightmask = 0; |
| while (node) { |
| bitmask = 1 << node->bitnum; |
| if ((leftmask & bitmask) == 0) { |
| leftmask |= bitmask; |
| if (node->leftnode == LEAF) { |
| assert(node->left); |
| if (tree->leaf_mark(node->left)) { |
| n = node; |
| while (n && !n->mark) { |
| marked++; |
| n->mark = 1; |
| n = n->parent; |
| } |
| } |
| } else if (node->left) { |
| assert(node->leftnode == NODE); |
| node = node->left; |
| continue; |
| } |
| } |
| if ((rightmask & bitmask) == 0) { |
| rightmask |= bitmask; |
| if (node->rightnode == LEAF) { |
| assert(node->right); |
| if (tree->leaf_mark(node->right)) { |
| n = node; |
| while (n && !n->mark) { |
| marked++; |
| n->mark = 1; |
| n = n->parent; |
| } |
| } |
| } else if (node->right) { |
| assert(node->rightnode == NODE); |
| node = node->right; |
| continue; |
| } |
| } |
| leftmask &= ~bitmask; |
| rightmask &= ~bitmask; |
| node = node->parent; |
| } |
| |
| /* second pass: left siblings and singletons */ |
| |
| assert(tree->childnode == NODE); |
| node = tree->root; |
| leftmask = rightmask = 0; |
| while (node) { |
| bitmask = 1 << node->bitnum; |
| if ((leftmask & bitmask) == 0) { |
| leftmask |= bitmask; |
| if (node->leftnode == LEAF) { |
| assert(node->left); |
| if (tree->leaf_mark(node->left)) { |
| n = node; |
| while (n && !n->mark) { |
| marked++; |
| n->mark = 1; |
| n = n->parent; |
| } |
| } |
| } else if (node->left) { |
| assert(node->leftnode == NODE); |
| node = node->left; |
| if (!node->mark && node->parent->mark) { |
| marked++; |
| node->mark = 1; |
| } |
| continue; |
| } |
| } |
| if ((rightmask & bitmask) == 0) { |
| rightmask |= bitmask; |
| if (node->rightnode == LEAF) { |
| assert(node->right); |
| if (tree->leaf_mark(node->right)) { |
| n = node; |
| while (n && !n->mark) { |
| marked++; |
| n->mark = 1; |
| n = n->parent; |
| } |
| } |
| } else if (node->right) { |
| assert(node->rightnode == NODE); |
| node = node->right; |
| if (!node->mark && node->parent->mark && |
| !node->parent->left) { |
| marked++; |
| node->mark = 1; |
| } |
| continue; |
| } |
| } |
| leftmask &= ~bitmask; |
| rightmask &= ~bitmask; |
| node = node->parent; |
| } |
| done: |
| if (verbose > 0) |
| printf("Marked %d nodes\n", marked); |
| } |
| |
| /* |
| * Compute the index of each node and leaf, which is the offset in the |
| * emitted trie. These values must be pre-computed because relative |
| * offsets between nodes are used to navigate the tree. |
| */ |
| static int index_nodes(struct tree *tree, int index) |
| { |
| struct node *node; |
| unsigned int leftmask; |
| unsigned int rightmask; |
| unsigned int bitmask; |
| int count; |
| int indent; |
| |
| /* Align to a cache line (or half a cache line?). */ |
| while (index % 64) |
| index++; |
| tree->index = index; |
| indent = 1; |
| count = 0; |
| |
| if (verbose > 0) |
| printf("Indexing %s_%x: %d\n", tree->type, tree->maxage, index); |
| if (tree->childnode == LEAF) { |
| index += tree->leaf_size(tree->root); |
| goto done; |
| } |
| |
| assert(tree->childnode == NODE); |
| node = tree->root; |
| leftmask = rightmask = 0; |
| while (node) { |
| if (!node->mark) |
| goto skip; |
| count++; |
| if (node->index != index) |
| node->index = index; |
| index += node->size; |
| skip: |
| while (node) { |
| bitmask = 1 << node->bitnum; |
| if (node->mark && (leftmask & bitmask) == 0) { |
| leftmask |= bitmask; |
| if (node->leftnode == LEAF) { |
| assert(node->left); |
| *tree->leaf_index(tree, node->left) = |
| index; |
| index += tree->leaf_size(node->left); |
| count++; |
| } else if (node->left) { |
| assert(node->leftnode == NODE); |
| indent += 1; |
| node = node->left; |
| break; |
| } |
| } |
| if (node->mark && (rightmask & bitmask) == 0) { |
| rightmask |= bitmask; |
| if (node->rightnode == LEAF) { |
| assert(node->right); |
| *tree->leaf_index(tree, node->right) = index; |
| index += tree->leaf_size(node->right); |
| count++; |
| } else if (node->right) { |
| assert(node->rightnode == NODE); |
| indent += 1; |
| node = node->right; |
| break; |
| } |
| } |
| leftmask &= ~bitmask; |
| rightmask &= ~bitmask; |
| node = node->parent; |
| indent -= 1; |
| } |
| } |
| done: |
| /* Round up to a multiple of 16 */ |
| while (index % 16) |
| index++; |
| if (verbose > 0) |
| printf("Final index %d\n", index); |
| return index; |
| } |
| |
| /* |
| * Mark the nodes in a subtree, helper for size_nodes(). |
| */ |
| static int mark_subtree(struct node *node) |
| { |
| int changed; |
| |
| if (!node || node->mark) |
| return 0; |
| node->mark = 1; |
| node->index = node->parent->index; |
| changed = 1; |
| if (node->leftnode == NODE) |
| changed += mark_subtree(node->left); |
| if (node->rightnode == NODE) |
| changed += mark_subtree(node->right); |
| return changed; |
| } |
| |
| /* |
| * Compute the size of nodes and leaves. We start by assuming that |
| * each node needs to store a three-byte offset. The indexes of the |
| * nodes are calculated based on that, and then this function is |
| * called to see if the sizes of some nodes can be reduced. This is |
| * repeated until no more changes are seen. |
| */ |
| static int size_nodes(struct tree *tree) |
| { |
| struct tree *next; |
| struct node *node; |
| struct node *right; |
| struct node *n; |
| unsigned int leftmask; |
| unsigned int rightmask; |
| unsigned int bitmask; |
| unsigned int pathbits; |
| unsigned int pathmask; |
| unsigned int nbit; |
| int changed; |
| int offset; |
| int size; |
| int indent; |
| |
| indent = 1; |
| changed = 0; |
| size = 0; |
| |
| if (verbose > 0) |
| printf("Sizing %s_%x\n", tree->type, tree->maxage); |
| if (tree->childnode == LEAF) |
| goto done; |
| |
| assert(tree->childnode == NODE); |
| pathbits = 0; |
| pathmask = 0; |
| node = tree->root; |
| leftmask = rightmask = 0; |
| while (node) { |
| if (!node->mark) |
| goto skip; |
| offset = 0; |
| if (!node->left || !node->right) { |
| size = 1; |
| } else { |
| if (node->rightnode == NODE) { |
| /* |
| * If the right node is not marked, |
| * look for a corresponding node in |
| * the next tree. Such a node need |
| * not exist. |
| */ |
| right = node->right; |
| next = tree->next; |
| while (!right->mark) { |
| assert(next); |
| n = next->root; |
| while (n->bitnum != node->bitnum) { |
| nbit = 1 << n->bitnum; |
| if (!(pathmask & nbit)) |
| break; |
| if (pathbits & nbit) { |
| if (n->rightnode == LEAF) |
| break; |
| n = n->right; |
| } else { |
| if (n->leftnode == LEAF) |
| break; |
| n = n->left; |
| } |
| } |
| if (n->bitnum != node->bitnum) |
| break; |
| n = n->right; |
| right = n; |
| next = next->next; |
| } |
| /* Make sure the right node is marked. */ |
| if (!right->mark) |
| changed += mark_subtree(right); |
| offset = right->index - node->index; |
| } else { |
| offset = *tree->leaf_index(tree, node->right); |
| offset -= node->index; |
| } |
| assert(offset >= 0); |
| assert(offset <= 0xffffff); |
| if (offset <= 0xff) { |
| size = 2; |
| } else if (offset <= 0xffff) { |
| size = 3; |
| } else { /* offset <= 0xffffff */ |
| size = 4; |
| } |
| } |
| if (node->size != size || node->offset != offset) { |
| node->size = size; |
| node->offset = offset; |
| changed++; |
| } |
| skip: |
| while (node) { |
| bitmask = 1 << node->bitnum; |
| pathmask |= bitmask; |
| if (node->mark && (leftmask & bitmask) == 0) { |
| leftmask |= bitmask; |
| if (node->leftnode == LEAF) { |
| assert(node->left); |
| } else if (node->left) { |
| assert(node->leftnode == NODE); |
| indent += 1; |
| node = node->left; |
| break; |
| } |
| } |
| if (node->mark && (rightmask & bitmask) == 0) { |
| rightmask |= bitmask; |
| pathbits |= bitmask; |
| if (node->rightnode == LEAF) { |
| assert(node->right); |
| } else if (node->right) { |
| assert(node->rightnode == NODE); |
| indent += 1; |
| node = node->right; |
| break; |
| } |
| } |
| leftmask &= ~bitmask; |
| rightmask &= ~bitmask; |
| pathmask &= ~bitmask; |
| pathbits &= ~bitmask; |
| node = node->parent; |
| indent -= 1; |
| } |
| } |
| done: |
| if (verbose > 0) |
| printf("Found %d changes\n", changed); |
| return changed; |
| } |
| |
| /* |
| * Emit a trie for the given tree into the data array. |
| */ |
| static void emit(struct tree *tree, unsigned char *data) |
| { |
| struct node *node; |
| unsigned int leftmask; |
| unsigned int rightmask; |
| unsigned int bitmask; |
| int offlen; |
| int offset; |
| int index; |
| int indent; |
| int size; |
| int bytes; |
| int leaves; |
| int nodes[4]; |
| unsigned char byte; |
| |
| nodes[0] = nodes[1] = nodes[2] = nodes[3] = 0; |
| leaves = 0; |
| bytes = 0; |
| index = tree->index; |
| data += index; |
| indent = 1; |
| if (verbose > 0) |
| printf("Emitting %s_%x\n", tree->type, tree->maxage); |
| if (tree->childnode == LEAF) { |
| assert(tree->root); |
| tree->leaf_emit(tree->root, data); |
| size = tree->leaf_size(tree->root); |
| index += size; |
| leaves++; |
| goto done; |
| } |
| |
| assert(tree->childnode == NODE); |
| node = tree->root; |
| leftmask = rightmask = 0; |
| while (node) { |
| if (!node->mark) |
| goto skip; |
| assert(node->offset != -1); |
| assert(node->index == index); |
| |
| byte = 0; |
| if (node->nextbyte) |
| byte |= NEXTBYTE; |
| byte |= (node->bitnum & BITNUM); |
| if (node->left && node->right) { |
| if (node->leftnode == NODE) |
| byte |= LEFTNODE; |
| if (node->rightnode == NODE) |
| byte |= RIGHTNODE; |
| if (node->offset <= 0xff) |
| offlen = 1; |
| else if (node->offset <= 0xffff) |
| offlen = 2; |
| else |
| offlen = 3; |
| nodes[offlen]++; |
| offset = node->offset; |
| byte |= offlen << OFFLEN_SHIFT; |
| *data++ = byte; |
| index++; |
| while (offlen--) { |
| *data++ = offset & 0xff; |
| index++; |
| offset >>= 8; |
| } |
| } else if (node->left) { |
| if (node->leftnode == NODE) |
| byte |= TRIENODE; |
| nodes[0]++; |
| *data++ = byte; |
| index++; |
| } else if (node->right) { |
| byte |= RIGHTNODE; |
| if (node->rightnode == NODE) |
| byte |= TRIENODE; |
| nodes[0]++; |
| *data++ = byte; |
| index++; |
| } else { |
| assert(0); |
| } |
| skip: |
| while (node) { |
| bitmask = 1 << node->bitnum; |
| if (node->mark && (leftmask & bitmask) == 0) { |
| leftmask |= bitmask; |
| if (node->leftnode == LEAF) { |
| assert(node->left); |
| data = tree->leaf_emit(node->left, |
| data); |
| size = tree->leaf_size(node->left); |
| index += size; |
| bytes += size; |
| leaves++; |
| } else if (node->left) { |
| assert(node->leftnode == NODE); |
| indent += 1; |
| node = node->left; |
| break; |
| } |
| } |
| if (node->mark && (rightmask & bitmask) == 0) { |
| rightmask |= bitmask; |
| if (node->rightnode == LEAF) { |
| assert(node->right); |
| data = tree->leaf_emit(node->right, |
| data); |
| size = tree->leaf_size(node->right); |
| index += size; |
| bytes += size; |
| leaves++; |
| } else if (node->right) { |
| assert(node->rightnode == NODE); |
| indent += 1; |
| node = node->right; |
| break; |
| } |
| } |
| leftmask &= ~bitmask; |
| rightmask &= ~bitmask; |
| node = node->parent; |
| indent -= 1; |
| } |
| } |
| done: |
| if (verbose > 0) { |
| printf("Emitted %d (%d) leaves", |
| leaves, bytes); |
| printf(" %d (%d+%d+%d+%d) nodes", |
| nodes[0] + nodes[1] + nodes[2] + nodes[3], |
| nodes[0], nodes[1], nodes[2], nodes[3]); |
| printf(" %d total\n", index - tree->index); |
| } |
| } |
| |
| /* ------------------------------------------------------------------ */ |
| |
| /* |
| * Unicode data. |
| * |
| * We need to keep track of the Canonical Combining Class, the Age, |
| * and decompositions for a code point. |
| * |
| * For the Age, we store the index into the ages table. Effectively |
| * this is a generation number that the table maps to a unicode |
| * version. |
| * |
| * The correction field is used to indicate that this entry is in the |
| * corrections array, which contains decompositions that were |
| * corrected in later revisions. The value of the correction field is |
| * the Unicode version in which the mapping was corrected. |
| */ |
| struct unicode_data { |
| unsigned int code; |
| int ccc; |
| int gen; |
| int correction; |
| unsigned int *utf32nfdi; |
| unsigned int *utf32nfdicf; |
| char *utf8nfdi; |
| char *utf8nfdicf; |
| }; |
| |
| struct unicode_data unicode_data[0x110000]; |
| struct unicode_data *corrections; |
| int corrections_count; |
| |
| struct tree *nfdi_tree; |
| struct tree *nfdicf_tree; |
| |
| struct tree *trees; |
| int trees_count; |
| |
| /* |
| * Check the corrections array to see if this entry was corrected at |
| * some point. |
| */ |
| static struct unicode_data *corrections_lookup(struct unicode_data *u) |
| { |
| int i; |
| |
| for (i = 0; i != corrections_count; i++) |
| if (u->code == corrections[i].code) |
| return &corrections[i]; |
| return u; |
| } |
| |
| static int nfdi_equal(void *l, void *r) |
| { |
| struct unicode_data *left = l; |
| struct unicode_data *right = r; |
| |
| if (left->gen != right->gen) |
| return 0; |
| if (left->ccc != right->ccc) |
| return 0; |
| if (left->utf8nfdi && right->utf8nfdi && |
| strcmp(left->utf8nfdi, right->utf8nfdi) == 0) |
| return 1; |
| if (left->utf8nfdi || right->utf8nfdi) |
| return 0; |
| return 1; |
| } |
| |
| static int nfdicf_equal(void *l, void *r) |
| { |
| struct unicode_data *left = l; |
| struct unicode_data *right = r; |
| |
| if (left->gen != right->gen) |
| return 0; |
| if (left->ccc != right->ccc) |
| return 0; |
| if (left->utf8nfdicf && right->utf8nfdicf && |
| strcmp(left->utf8nfdicf, right->utf8nfdicf) == 0) |
| return 1; |
| if (left->utf8nfdicf && right->utf8nfdicf) |
| return 0; |
| if (left->utf8nfdicf || right->utf8nfdicf) |
| return 0; |
| if (left->utf8nfdi && right->utf8nfdi && |
| strcmp(left->utf8nfdi, right->utf8nfdi) == 0) |
| return 1; |
| if (left->utf8nfdi || right->utf8nfdi) |
| return 0; |
| return 1; |
| } |
| |
| static void nfdi_print(void *l, int indent) |
| { |
| struct unicode_data *leaf = l; |
| |
| printf("%*sleaf @ %p code %X ccc %d gen %d", indent, "", leaf, |
| leaf->code, leaf->ccc, leaf->gen); |
| |
| if (leaf->utf8nfdi && leaf->utf8nfdi[0] == HANGUL) |
| printf(" nfdi \"%s\"", "HANGUL SYLLABLE"); |
| else if (leaf->utf8nfdi) |
| printf(" nfdi \"%s\"", (const char*)leaf->utf8nfdi); |
| |
| printf("\n"); |
| } |
| |
| static void nfdicf_print(void *l, int indent) |
| { |
| struct unicode_data *leaf = l; |
| |
| printf("%*sleaf @ %p code %X ccc %d gen %d", indent, "", leaf, |
| leaf->code, leaf->ccc, leaf->gen); |
| |
| if (leaf->utf8nfdicf) |
| printf(" nfdicf \"%s\"", (const char*)leaf->utf8nfdicf); |
| else if (leaf->utf8nfdi && leaf->utf8nfdi[0] == HANGUL) |
| printf(" nfdi \"%s\"", "HANGUL SYLLABLE"); |
| else if (leaf->utf8nfdi) |
| printf(" nfdi \"%s\"", (const char*)leaf->utf8nfdi); |
| printf("\n"); |
| } |
| |
| static int nfdi_mark(void *l) |
| { |
| return 1; |
| } |
| |
| static int nfdicf_mark(void *l) |
| { |
| struct unicode_data *leaf = l; |
| |
| if (leaf->utf8nfdicf) |
| return 1; |
| return 0; |
| } |
| |
| static int correction_mark(void *l) |
| { |
| struct unicode_data *leaf = l; |
| |
| return leaf->correction; |
| } |
| |
| static int nfdi_size(void *l) |
| { |
| struct unicode_data *leaf = l; |
| int size = 2; |
| |
| if (HANGUL_SYLLABLE(leaf->code)) |
| size += 1; |
| else if (leaf->utf8nfdi) |
| size += strlen(leaf->utf8nfdi) + 1; |
| return size; |
| } |
| |
| static int nfdicf_size(void *l) |
| { |
| struct unicode_data *leaf = l; |
| int size = 2; |
| |
| if (HANGUL_SYLLABLE(leaf->code)) |
| size += 1; |
| else if (leaf->utf8nfdicf) |
| size += strlen(leaf->utf8nfdicf) + 1; |
| else if (leaf->utf8nfdi) |
| size += strlen(leaf->utf8nfdi) + 1; |
| return size; |
| } |
| |
| static int *nfdi_index(struct tree *tree, void *l) |
| { |
| struct unicode_data *leaf = l; |
| |
| return &tree->leafindex[leaf->code]; |
| } |
| |
| static int *nfdicf_index(struct tree *tree, void *l) |
| { |
| struct unicode_data *leaf = l; |
| |
| return &tree->leafindex[leaf->code]; |
| } |
| |
| static unsigned char *nfdi_emit(void *l, unsigned char *data) |
| { |
| struct unicode_data *leaf = l; |
| unsigned char *s; |
| |
| *data++ = leaf->gen; |
| |
| if (HANGUL_SYLLABLE(leaf->code)) { |
| *data++ = DECOMPOSE; |
| *data++ = HANGUL; |
| } else if (leaf->utf8nfdi) { |
| *data++ = DECOMPOSE; |
| s = (unsigned char*)leaf->utf8nfdi; |
| while ((*data++ = *s++) != 0) |
| ; |
| } else { |
| *data++ = leaf->ccc; |
| } |
| return data; |
| } |
| |
| static unsigned char *nfdicf_emit(void *l, unsigned char *data) |
| { |
| struct unicode_data *leaf = l; |
| unsigned char *s; |
| |
| *data++ = leaf->gen; |
| |
| if (HANGUL_SYLLABLE(leaf->code)) { |
| *data++ = DECOMPOSE; |
| *data++ = HANGUL; |
| } else if (leaf->utf8nfdicf) { |
| *data++ = DECOMPOSE; |
| s = (unsigned char*)leaf->utf8nfdicf; |
| while ((*data++ = *s++) != 0) |
| ; |
| } else if (leaf->utf8nfdi) { |
| *data++ = DECOMPOSE; |
| s = (unsigned char*)leaf->utf8nfdi; |
| while ((*data++ = *s++) != 0) |
| ; |
| } else { |
| *data++ = leaf->ccc; |
| } |
| return data; |
| } |
| |
| static void utf8_create(struct unicode_data *data) |
| { |
| char utf[18*4+1]; |
| char *u; |
| unsigned int *um; |
| int i; |
| |
| if (data->utf8nfdi) { |
| assert(data->utf8nfdi[0] == HANGUL); |
| return; |
| } |
| |
| u = utf; |
| um = data->utf32nfdi; |
| if (um) { |
| for (i = 0; um[i]; i++) |
| u += utf8encode(u, um[i]); |
| *u = '\0'; |
| data->utf8nfdi = strdup(utf); |
| } |
| u = utf; |
| um = data->utf32nfdicf; |
| if (um) { |
| for (i = 0; um[i]; i++) |
| u += utf8encode(u, um[i]); |
| *u = '\0'; |
| if (!data->utf8nfdi || strcmp(data->utf8nfdi, utf)) |
| data->utf8nfdicf = strdup(utf); |
| } |
| } |
| |
| static void utf8_init(void) |
| { |
| unsigned int unichar; |
| int i; |
| |
| for (unichar = 0; unichar != 0x110000; unichar++) |
| utf8_create(&unicode_data[unichar]); |
| |
| for (i = 0; i != corrections_count; i++) |
| utf8_create(&corrections[i]); |
| } |
| |
| static void trees_init(void) |
| { |
| struct unicode_data *data; |
| unsigned int maxage; |
| unsigned int nextage; |
| int count; |
| int i; |
| int j; |
| |
| /* Count the number of different ages. */ |
| count = 0; |
| nextage = (unsigned int)-1; |
| do { |
| maxage = nextage; |
| nextage = 0; |
| for (i = 0; i <= corrections_count; i++) { |
| data = &corrections[i]; |
| if (nextage < data->correction && |
| data->correction < maxage) |
| nextage = data->correction; |
| } |
| count++; |
| } while (nextage); |
| |
| /* Two trees per age: nfdi and nfdicf */ |
| trees_count = count * 2; |
| trees = calloc(trees_count, sizeof(struct tree)); |
| |
| /* Assign ages to the trees. */ |
| count = trees_count; |
| nextage = (unsigned int)-1; |
| do { |
| maxage = nextage; |
| trees[--count].maxage = maxage; |
| trees[--count].maxage = maxage; |
| nextage = 0; |
| for (i = 0; i <= corrections_count; i++) { |
| data = &corrections[i]; |
| if (nextage < data->correction && |
| data->correction < maxage) |
| nextage = data->correction; |
| } |
| } while (nextage); |
| |
| /* The ages assigned above are off by one. */ |
| for (i = 0; i != trees_count; i++) { |
| j = 0; |
| while (ages[j] < trees[i].maxage) |
| j++; |
| trees[i].maxage = ages[j-1]; |
| } |
| |
| /* Set up the forwarding between trees. */ |
| trees[trees_count-2].next = &trees[trees_count-1]; |
| trees[trees_count-1].leaf_mark = nfdi_mark; |
| trees[trees_count-2].leaf_mark = nfdicf_mark; |
| for (i = 0; i != trees_count-2; i += 2) { |
| trees[i].next = &trees[trees_count-2]; |
| trees[i].leaf_mark = correction_mark; |
| trees[i+1].next = &trees[trees_count-1]; |
| trees[i+1].leaf_mark = correction_mark; |
| } |
| |
| /* Assign the callouts. */ |
| for (i = 0; i != trees_count; i += 2) { |
| trees[i].type = "nfdicf"; |
| trees[i].leaf_equal = nfdicf_equal; |
| trees[i].leaf_print = nfdicf_print; |
| trees[i].leaf_size = nfdicf_size; |
| trees[i].leaf_index = nfdicf_index; |
| trees[i].leaf_emit = nfdicf_emit; |
| |
| trees[i+1].type = "nfdi"; |
| trees[i+1].leaf_equal = nfdi_equal; |
| trees[i+1].leaf_print = nfdi_print; |
| trees[i+1].leaf_size = nfdi_size; |
| trees[i+1].leaf_index = nfdi_index; |
| trees[i+1].leaf_emit = nfdi_emit; |
| } |
| |
| /* Finish init. */ |
| for (i = 0; i != trees_count; i++) |
| trees[i].childnode = NODE; |
| } |
| |
| static void trees_populate(void) |
| { |
| struct unicode_data *data; |
| unsigned int unichar; |
| char keyval[4]; |
| int keylen; |
| int i; |
| |
| for (i = 0; i != trees_count; i++) { |
| if (verbose > 0) { |
| printf("Populating %s_%x\n", |
| trees[i].type, trees[i].maxage); |
| } |
| for (unichar = 0; unichar != 0x110000; unichar++) { |
| if (unicode_data[unichar].gen < 0) |
| continue; |
| keylen = utf8encode(keyval, unichar); |
| data = corrections_lookup(&unicode_data[unichar]); |
| if (data->correction <= trees[i].maxage) |
| data = &unicode_data[unichar]; |
| insert(&trees[i], keyval, keylen, data); |
| } |
| } |
| } |
| |
| static void trees_reduce(void) |
| { |
| int i; |
| int size; |
| int changed; |
| |
| for (i = 0; i != trees_count; i++) |
| prune(&trees[i]); |
| for (i = 0; i != trees_count; i++) |
| mark_nodes(&trees[i]); |
| do { |
| size = 0; |
| for (i = 0; i != trees_count; i++) |
| size = index_nodes(&trees[i], size); |
| changed = 0; |
| for (i = 0; i != trees_count; i++) |
| changed += size_nodes(&trees[i]); |
| } while (changed); |
| |
| utf8data = calloc(size, 1); |
| utf8data_size = size; |
| for (i = 0; i != trees_count; i++) |
| emit(&trees[i], utf8data); |
| |
| if (verbose > 0) { |
| for (i = 0; i != trees_count; i++) { |
| printf("%s_%x idx %d\n", |
| trees[i].type, trees[i].maxage, trees[i].index); |
| } |
| } |
| |
| nfdi = utf8data + trees[trees_count-1].index; |
| nfdicf = utf8data + trees[trees_count-2].index; |
| |
| nfdi_tree = &trees[trees_count-1]; |
| nfdicf_tree = &trees[trees_count-2]; |
| } |
| |
| static void verify(struct tree *tree) |
| { |
| struct unicode_data *data; |
| utf8leaf_t *leaf; |
| unsigned int unichar; |
| char key[4]; |
| unsigned char hangul[UTF8HANGULLEAF]; |
| int report; |
| int nocf; |
| |
| if (verbose > 0) |
| printf("Verifying %s_%x\n", tree->type, tree->maxage); |
| nocf = strcmp(tree->type, "nfdicf"); |
| |
| for (unichar = 0; unichar != 0x110000; unichar++) { |
| report = 0; |
| data = corrections_lookup(&unicode_data[unichar]); |
| if (data->correction <= tree->maxage) |
| data = &unicode_data[unichar]; |
| utf8encode(key,unichar); |
| leaf = utf8lookup(tree, hangul, key); |
| |
| if (!leaf) { |
| if (data->gen != -1) |
| report++; |
| if (unichar < 0xd800 || unichar > 0xdfff) |
| report++; |
| } else { |
| if (unichar >= 0xd800 && unichar <= 0xdfff) |
| report++; |
| if (data->gen == -1) |
| report++; |
| if (data->gen != LEAF_GEN(leaf)) |
| report++; |
| if (LEAF_CCC(leaf) == DECOMPOSE) { |
| if (HANGUL_SYLLABLE(data->code)) { |
| if (data->utf8nfdi[0] != HANGUL) |
| report++; |
| } else if (nocf) { |
| if (!data->utf8nfdi) { |
| report++; |
| } else if (strcmp(data->utf8nfdi, |
| LEAF_STR(leaf))) { |
| report++; |
| } |
| } else { |
| if (!data->utf8nfdicf && |
| !data->utf8nfdi) { |
| report++; |
| } else if (data->utf8nfdicf) { |
| if (strcmp(data->utf8nfdicf, |
| LEAF_STR(leaf))) |
| report++; |
| } else if (strcmp(data->utf8nfdi, |
| LEAF_STR(leaf))) { |
| report++; |
| } |
| } |
| } else if (data->ccc != LEAF_CCC(leaf)) { |
| report++; |
| } |
| } |
| if (report) { |
| printf("%X code %X gen %d ccc %d" |
| " nfdi -> \"%s\"", |
| unichar, data->code, data->gen, |
| data->ccc, |
| data->utf8nfdi); |
| if (leaf) { |
| printf(" gen %d ccc %d" |
| " nfdi -> \"%s\"", |
| LEAF_GEN(leaf), |
| LEAF_CCC(leaf), |
| LEAF_CCC(leaf) == DECOMPOSE ? |
| LEAF_STR(leaf) : ""); |
| } |
| printf("\n"); |
| } |
| } |
| } |
| |
| static void trees_verify(void) |
| { |
| int i; |
| |
| for (i = 0; i != trees_count; i++) |
| verify(&trees[i]); |
| } |
| |
| /* ------------------------------------------------------------------ */ |
| |
| static void help(void) |
| { |
| printf("Usage: %s [options]\n", argv0); |
| printf("\n"); |
| printf("This program creates an a data trie used for parsing and\n"); |
| printf("normalization of UTF-8 strings. The trie is derived from\n"); |
| printf("a set of input files from the Unicode character database\n"); |
| printf("found at: http://www.unicode.org/Public/UCD/latest/ucd/\n"); |
| printf("\n"); |
| printf("The generated tree supports two normalization forms:\n"); |
| printf("\n"); |
| printf("\tnfdi:\n"); |
| printf("\t- Apply unicode normalization form NFD.\n"); |
| printf("\t- Remove any Default_Ignorable_Code_Point.\n"); |
| printf("\n"); |
| printf("\tnfdicf:\n"); |
| printf("\t- Apply unicode normalization form NFD.\n"); |
| printf("\t- Remove any Default_Ignorable_Code_Point.\n"); |
| printf("\t- Apply a full casefold (C + F).\n"); |
| printf("\n"); |
| printf("These forms were chosen as being most useful when dealing\n"); |
| printf("with file names: NFD catches most cases where characters\n"); |
| printf("should be considered equivalent. The ignorables are mostly\n"); |
| printf("invisible, making names hard to type.\n"); |
| printf("\n"); |
| printf("The options to specify the files to be used are listed\n"); |
| printf("below with their default values, which are the names used\n"); |
| printf("by version 11.0.0 of the Unicode Character Database.\n"); |
| printf("\n"); |
| printf("The input files:\n"); |
| printf("\t-a %s\n", AGE_NAME); |
| printf("\t-c %s\n", CCC_NAME); |
| printf("\t-p %s\n", PROP_NAME); |
| printf("\t-d %s\n", DATA_NAME); |
| printf("\t-f %s\n", FOLD_NAME); |
| printf("\t-n %s\n", NORM_NAME); |
| printf("\n"); |
| printf("Additionally, the generated tables are tested using:\n"); |
| printf("\t-t %s\n", TEST_NAME); |
| printf("\n"); |
| printf("Finally, the output file:\n"); |
| printf("\t-o %s\n", UTF8_NAME); |
| printf("\n"); |
| } |
| |
| static void usage(void) |
| { |
| help(); |
| exit(1); |
| } |
| |
| static void open_fail(const char *name, int error) |
| { |
| printf("Error %d opening %s: %s\n", error, name, strerror(error)); |
| exit(1); |
| } |
| |
| static void file_fail(const char *filename) |
| { |
| printf("Error parsing %s\n", filename); |
| exit(1); |
| } |
| |
| static void line_fail(const char *filename, const char *line) |
| { |
| printf("Error parsing %s:%s\n", filename, line); |
| exit(1); |
| } |
| |
| /* ------------------------------------------------------------------ */ |
| |
| static void print_utf32(unsigned int *utf32str) |
| { |
| int i; |
| |
| for (i = 0; utf32str[i]; i++) |
| printf(" %X", utf32str[i]); |
| } |
| |
| static void print_utf32nfdi(unsigned int unichar) |
| { |
| printf(" %X ->", unichar); |
| print_utf32(unicode_data[unichar].utf32nfdi); |
| printf("\n"); |
| } |
| |
| static void print_utf32nfdicf(unsigned int unichar) |
| { |
| printf(" %X ->", unichar); |
| print_utf32(unicode_data[unichar].utf32nfdicf); |
| printf("\n"); |
| } |
| |
| /* ------------------------------------------------------------------ */ |
| |
| static void age_init(void) |
| { |
| FILE *file; |
| unsigned int first; |
| unsigned int last; |
| unsigned int unichar; |
| unsigned int major; |
| unsigned int minor; |
| unsigned int revision; |
| int gen; |
| int count; |
| int ret; |
| |
| if (verbose > 0) |
| printf("Parsing %s\n", age_name); |
| |
| file = fopen(age_name, "r"); |
| if (!file) |
| open_fail(age_name, errno); |
| count = 0; |
| |
| gen = 0; |
| while (fgets(line, LINESIZE, file)) { |
| ret = sscanf(line, "# Age=V%d_%d_%d", |
| &major, &minor, &revision); |
| if (ret == 3) { |
| ages_count++; |
| if (verbose > 1) |
| printf(" Age V%d_%d_%d\n", |
| major, minor, revision); |
| if (!age_valid(major, minor, revision)) |
| line_fail(age_name, line); |
| continue; |
| } |
| ret = sscanf(line, "# Age=V%d_%d", &major, &minor); |
| if (ret == 2) { |
| ages_count++; |
| if (verbose > 1) |
| printf(" Age V%d_%d\n", major, minor); |
| if (!age_valid(major, minor, 0)) |
| line_fail(age_name, line); |
| continue; |
| } |
| } |
| |
| /* We must have found something above. */ |
| if (verbose > 1) |
| printf("%d age entries\n", ages_count); |
| if (ages_count == 0 || ages_count > MAXGEN) |
| file_fail(age_name); |
| |
| /* There is a 0 entry. */ |
| ages_count++; |
| ages = calloc(ages_count + 1, sizeof(*ages)); |
| /* And a guard entry. */ |
| ages[ages_count] = (unsigned int)-1; |
| |
| rewind(file); |
| count = 0; |
| gen = 0; |
| while (fgets(line, LINESIZE, file)) { |
| ret = sscanf(line, "# Age=V%d_%d_%d", |
| &major, &minor, &revision); |
| if (ret == 3) { |
| ages[++gen] = |
| UNICODE_AGE(major, minor, revision); |
| if (verbose > 1) |
| printf(" Age V%d_%d_%d = gen %d\n", |
| major, minor, revision, gen); |
| if (!age_valid(major, minor, revision)) |
| line_fail(age_name, line); |
| continue; |
| } |
| ret = sscanf(line, "# Age=V%d_%d", &major, &minor); |
| if (ret == 2) { |
| ages[++gen] = UNICODE_AGE(major, minor, 0); |
| if (verbose > 1) |
| printf(" Age V%d_%d = %d\n", |
| major, minor, gen); |
| if (!age_valid(major, minor, 0)) |
| line_fail(age_name, line); |
| continue; |
| } |
| ret = sscanf(line, "%X..%X ; %d.%d #", |
| &first, &last, &major, &minor); |
| if (ret == 4) { |
| for (unichar = first; unichar <= last; unichar++) |
| unicode_data[unichar].gen = gen; |
| count += 1 + last - first; |
| if (verbose > 1) |
| printf(" %X..%X gen %d\n", first, last, gen); |
| if (!utf32valid(first) || !utf32valid(last)) |
| line_fail(age_name, line); |
| continue; |
| } |
| ret = sscanf(line, "%X ; %d.%d #", &unichar, &major, &minor); |
| if (ret == 3) { |
| unicode_data[unichar].gen = gen; |
| count++; |
| if (verbose > 1) |
| printf(" %X gen %d\n", unichar, gen); |
| if (!utf32valid(unichar)) |
| line_fail(age_name, line); |
| continue; |
| } |
| } |
| unicode_maxage = ages[gen]; |
| fclose(file); |
| |
| /* Nix surrogate block */ |
| if (verbose > 1) |
| printf(" Removing surrogate block D800..DFFF\n"); |
| for (unichar = 0xd800; unichar <= 0xdfff; unichar++) |
| unicode_data[unichar].gen = -1; |
| |
| if (verbose > 0) |
| printf("Found %d entries\n", count); |
| if (count == 0) |
| file_fail(age_name); |
| } |
| |
| static void ccc_init(void) |
| { |
| FILE *file; |
| unsigned int first; |
| unsigned int last; |
| unsigned int unichar; |
| unsigned int value; |
| int count; |
| int ret; |
| |
| if (verbose > 0) |
| printf("Parsing %s\n", ccc_name); |
| |
| file = fopen(ccc_name, "r"); |
| if (!file) |
| open_fail(ccc_name, errno); |
| |
| count = 0; |
| while (fgets(line, LINESIZE, file)) { |
| ret = sscanf(line, "%X..%X ; %d #", &first, &last, &value); |
| if (ret == 3) { |
| for (unichar = first; unichar <= last; unichar++) { |
| unicode_data[unichar].ccc = value; |
| count++; |
| } |
| if (verbose > 1) |
| printf(" %X..%X ccc %d\n", first, last, value); |
| if (!utf32valid(first) || !utf32valid(last)) |
| line_fail(ccc_name, line); |
| continue; |
| } |
| ret = sscanf(line, "%X ; %d #", &unichar, &value); |
| if (ret == 2) { |
| unicode_data[unichar].ccc = value; |
| count++; |
| if (verbose > 1) |
| printf(" %X ccc %d\n", unichar, value); |
| if (!utf32valid(unichar)) |
| line_fail(ccc_name, line); |
| continue; |
| } |
| } |
| fclose(file); |
| |
| if (verbose > 0) |
| printf("Found %d entries\n", count); |
| if (count == 0) |
| file_fail(ccc_name); |
| } |
| |
| static int ignore_compatibility_form(char *type) |
| { |
| int i; |
| char *ignored_types[] = {"font", "noBreak", "initial", "medial", |
| "final", "isolated", "circle", "super", |
| "sub", "vertical", "wide", "narrow", |
| "small", "square", "fraction", "compat"}; |
| |
| for (i = 0 ; i < ARRAY_SIZE(ignored_types); i++) |
| if (strcmp(type, ignored_types[i]) == 0) |
| return 1; |
| return 0; |
| } |
| |
| static void nfdi_init(void) |
| { |
| FILE *file; |
| unsigned int unichar; |
| unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */ |
| char *s; |
| char *type; |
| unsigned int *um; |
| int count; |
| int i; |
| int ret; |
| |
| if (verbose > 0) |
| printf("Parsing %s\n", data_name); |
| file = fopen(data_name, "r"); |
| if (!file) |
| open_fail(data_name, errno); |
| |
| count = 0; |
| while (fgets(line, LINESIZE, file)) { |
| ret = sscanf(line, "%X;%*[^;];%*[^;];%*[^;];%*[^;];%[^;];", |
| &unichar, buf0); |
| if (ret != 2) |
| continue; |
| if (!utf32valid(unichar)) |
| line_fail(data_name, line); |
| |
| s = buf0; |
| /* skip over <tag> */ |
| if (*s == '<') { |
| type = ++s; |
| while (*++s != '>'); |
| *s++ = '\0'; |
| if(ignore_compatibility_form(type)) |
| continue; |
| } |
| /* decode the decomposition into UTF-32 */ |
| i = 0; |
| while (*s) { |
| mapping[i] = strtoul(s, &s, 16); |
| if (!utf32valid(mapping[i])) |
| line_fail(data_name, line); |
| i++; |
| } |
| mapping[i++] = 0; |
| |
| um = malloc(i * sizeof(unsigned int)); |
| memcpy(um, mapping, i * sizeof(unsigned int)); |
| unicode_data[unichar].utf32nfdi = um; |
| |
| if (verbose > 1) |
| print_utf32nfdi(unichar); |
| count++; |
| } |
| fclose(file); |
| if (verbose > 0) |
| printf("Found %d entries\n", count); |
| if (count == 0) |
| file_fail(data_name); |
| } |
| |
| static void nfdicf_init(void) |
| { |
| FILE *file; |
| unsigned int unichar; |
| unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */ |
| char status; |
| char *s; |
| unsigned int *um; |
| int i; |
| int count; |
| int ret; |
| |
| if (verbose > 0) |
| printf("Parsing %s\n", fold_name); |
| file = fopen(fold_name, "r"); |
| if (!file) |
| open_fail(fold_name, errno); |
| |
| count = 0; |
| while (fgets(line, LINESIZE, file)) { |
| ret = sscanf(line, "%X; %c; %[^;];", &unichar, &status, buf0); |
| if (ret != 3) |
| continue; |
| if (!utf32valid(unichar)) |
| line_fail(fold_name, line); |
| /* Use the C+F casefold. */ |
| if (status != 'C' && status != 'F') |
| continue; |
| s = buf0; |
| if (*s == '<') |
| while (*s++ != ' ') |
| ; |
| i = 0; |
| while (*s) { |
| mapping[i] = strtoul(s, &s, 16); |
| if (!utf32valid(mapping[i])) |
| line_fail(fold_name, line); |
| i++; |
| } |
| mapping[i++] = 0; |
| |
| um = malloc(i * sizeof(unsigned int)); |
| memcpy(um, mapping, i * sizeof(unsigned int)); |
| unicode_data[unichar].utf32nfdicf = um; |
| |
| if (verbose > 1) |
| print_utf32nfdicf(unichar); |
| count++; |
| } |
| fclose(file); |
| if (verbose > 0) |
| printf("Found %d entries\n", count); |
| if (count == 0) |
| file_fail(fold_name); |
| } |
| |
| static void ignore_init(void) |
| { |
| FILE *file; |
| unsigned int unichar; |
| unsigned int first; |
| unsigned int last; |
| unsigned int *um; |
| int count; |
| int ret; |
| |
| if (verbose > 0) |
| printf("Parsing %s\n", prop_name); |
| file = fopen(prop_name, "r"); |
| if (!file) |
| open_fail(prop_name, errno); |
| assert(file); |
| count = 0; |
| while (fgets(line, LINESIZE, file)) { |
| ret = sscanf(line, "%X..%X ; %s # ", &first, &last, buf0); |
| if (ret == 3) { |
| if (strcmp(buf0, "Default_Ignorable_Code_Point")) |
| continue; |
| if (!utf32valid(first) || !utf32valid(last)) |
| line_fail(prop_name, line); |
| for (unichar = first; unichar <= last; unichar++) { |
| free(unicode_data[unichar].utf32nfdi); |
| um = malloc(sizeof(unsigned int)); |
| *um = 0; |
| unicode_data[unichar].utf32nfdi = um; |
| free(unicode_data[unichar].utf32nfdicf); |
| um = malloc(sizeof(unsigned int)); |
| *um = 0; |
| unicode_data[unichar].utf32nfdicf = um; |
| count++; |
| } |
| if (verbose > 1) |
| printf(" %X..%X Default_Ignorable_Code_Point\n", |
| first, last); |
| continue; |
| } |
| ret = sscanf(line, "%X ; %s # ", &unichar, buf0); |
| if (ret == 2) { |
| if (strcmp(buf0, "Default_Ignorable_Code_Point")) |
| continue; |
| if (!utf32valid(unichar)) |
| line_fail(prop_name, line); |
| free(unicode_data[unichar].utf32nfdi); |
| um = malloc(sizeof(unsigned int)); |
| *um = 0; |
| unicode_data[unichar].utf32nfdi = um; |
| free(unicode_data[unichar].utf32nfdicf); |
| um = malloc(sizeof(unsigned int)); |
| *um = 0; |
| unicode_data[unichar].utf32nfdicf = um; |
| if (verbose > 1) |
| printf(" %X Default_Ignorable_Code_Point\n", |
| unichar); |
| count++; |
| continue; |
| } |
| } |
| fclose(file); |
| |
| if (verbose > 0) |
| printf("Found %d entries\n", count); |
| if (count == 0) |
| file_fail(prop_name); |
| } |
| |
| static void corrections_init(void) |
| { |
| FILE *file; |
| unsigned int unichar; |
| unsigned int major; |
| unsigned int minor; |
| unsigned int revision; |
| unsigned int age; |
| unsigned int *um; |
| unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */ |
| char *s; |
| int i; |
| int count; |
| int ret; |
| |
| if (verbose > 0) |
| printf("Parsing %s\n", norm_name); |
| file = fopen(norm_name, "r"); |
| if (!file) |
| open_fail(norm_name, errno); |
| |
| count = 0; |
| while (fgets(line, LINESIZE, file)) { |
| ret = sscanf(line, "%X;%[^;];%[^;];%d.%d.%d #", |
| &unichar, buf0, buf1, |
| &major, &minor, &revision); |
| if (ret != 6) |
| continue; |
| if (!utf32valid(unichar) || !age_valid(major, minor, revision)) |
| line_fail(norm_name, line); |
| count++; |
| } |
| corrections = calloc(count, sizeof(struct unicode_data)); |
| corrections_count = count; |
| rewind(file); |
| |
| count = 0; |
| while (fgets(line, LINESIZE, file)) { |
| ret = sscanf(line, "%X;%[^;];%[^;];%d.%d.%d #", |
| &unichar, buf0, buf1, |
| &major, &minor, &revision); |
| if (ret != 6) |
| continue; |
| if (!utf32valid(unichar) || !age_valid(major, minor, revision)) |
| line_fail(norm_name, line); |
| corrections[count] = unicode_data[unichar]; |
| assert(corrections[count].code == unichar); |
| age = UNICODE_AGE(major, minor, revision); |
| corrections[count].correction = age; |
| |
| i = 0; |
| s = buf0; |
| while (*s) { |
| mapping[i] = strtoul(s, &s, 16); |
| if (!utf32valid(mapping[i])) |
| line_fail(norm_name, line); |
| i++; |
| } |
| mapping[i++] = 0; |
| |
| um = malloc(i * sizeof(unsigned int)); |
| memcpy(um, mapping, i * sizeof(unsigned int)); |
| corrections[count].utf32nfdi = um; |
| |
| if (verbose > 1) |
| printf(" %X -> %s -> %s V%d_%d_%d\n", |
| unichar, buf0, buf1, major, minor, revision); |
| count++; |
| } |
| fclose(file); |
| |
| if (verbose > 0) |
| printf("Found %d entries\n", count); |
| if (count == 0) |
| file_fail(norm_name); |
| } |
| |
| /* ------------------------------------------------------------------ */ |
| |
| /* |
| * Hangul decomposition (algorithm from Section 3.12 of Unicode 6.3.0) |
| * |
| * AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;; |
| * D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;; |
| * |
| * SBase = 0xAC00 |
| * LBase = 0x1100 |
| * VBase = 0x1161 |
| * TBase = 0x11A7 |
| * LCount = 19 |
| * VCount = 21 |
| * TCount = 28 |
| * NCount = 588 (VCount * TCount) |
| * SCount = 11172 (LCount * NCount) |
| * |
| * Decomposition: |
| * SIndex = s - SBase |
| * |
| * LV (Canonical/Full) |
| * LIndex = SIndex / NCount |
| * VIndex = (Sindex % NCount) / TCount |
| * LPart = LBase + LIndex |
| * VPart = VBase + VIndex |
| * |
| * LVT (Canonical) |
| * LVIndex = (SIndex / TCount) * TCount |
| * TIndex = (Sindex % TCount) |
| * LVPart = SBase + LVIndex |
| * TPart = TBase + TIndex |
| * |
| * LVT (Full) |
| * LIndex = SIndex / NCount |
| * VIndex = (Sindex % NCount) / TCount |
| * TIndex = (Sindex % TCount) |
| * LPart = LBase + LIndex |
| * VPart = VBase + VIndex |
| * if (TIndex == 0) { |
| * d = <LPart, VPart> |
| * } else { |
| * TPart = TBase + TIndex |
| * d = <LPart, VPart, TPart> |
| * } |
| * |
| */ |
| |
| static void hangul_decompose(void) |
| { |
| unsigned int sb = 0xAC00; |
| unsigned int lb = 0x1100; |
| unsigned int vb = 0x1161; |
| unsigned int tb = 0x11a7; |
| /* unsigned int lc = 19; */ |
| unsigned int vc = 21; |
| unsigned int tc = 28; |
| unsigned int nc = (vc * tc); |
| /* unsigned int sc = (lc * nc); */ |
| unsigned int unichar; |
| unsigned int mapping[4]; |
| unsigned int *um; |
| int count; |
| int i; |
| |
| if (verbose > 0) |
| printf("Decomposing hangul\n"); |
| /* Hangul */ |
| count = 0; |
| for (unichar = 0xAC00; unichar <= 0xD7A3; unichar++) { |
| unsigned int si = unichar - sb; |
| unsigned int li = si / nc; |
| unsigned int vi = (si % nc) / tc; |
| unsigned int ti = si % tc; |
| |
| i = 0; |
| mapping[i++] = lb + li; |
| mapping[i++] = vb + vi; |
| if (ti) |
| mapping[i++] = tb + ti; |
| mapping[i++] = 0; |
| |
| assert(!unicode_data[unichar].utf32nfdi); |
| um = malloc(i * sizeof(unsigned int)); |
| memcpy(um, mapping, i * sizeof(unsigned int)); |
| unicode_data[unichar].utf32nfdi = um; |
| |
| assert(!unicode_data[unichar].utf32nfdicf); |
| um = malloc(i * sizeof(unsigned int)); |
| memcpy(um, mapping, i * sizeof(unsigned int)); |
| unicode_data[unichar].utf32nfdicf = um; |
| |
| /* |
| * Add a cookie as a reminder that the hangul syllable |
| * decompositions must not be stored in the generated |
| * trie. |
| */ |
| unicode_data[unichar].utf8nfdi = malloc(2); |
| unicode_data[unichar].utf8nfdi[0] = HANGUL; |
| unicode_data[unichar].utf8nfdi[1] = '\0'; |
| |
| if (verbose > 1) |
| print_utf32nfdi(unichar); |
| |
| count++; |
| } |
| if (verbose > 0) |
| printf("Created %d entries\n", count); |
| } |
| |
| static void nfdi_decompose(void) |
| { |
| unsigned int unichar; |
| unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */ |
| unsigned int *um; |
| unsigned int *dc; |
| int count; |
| int i; |
| int j; |
| int ret; |
| |
| if (verbose > 0) |
| printf("Decomposing nfdi\n"); |
| |
| count = 0; |
| for (unichar = 0; unichar != 0x110000; unichar++) { |
| if (!unicode_data[unichar].utf32nfdi) |
| continue; |
| for (;;) { |
| ret = 1; |
| i = 0; |
| um = unicode_data[unichar].utf32nfdi; |
| while (*um) { |
| dc = unicode_data[*um].utf32nfdi; |
| if (dc) { |
| for (j = 0; dc[j]; j++) |
| mapping[i++] = dc[j]; |
| ret = 0; |
| } else { |
| mapping[i++] = *um; |
| } |
| um++; |
| } |
| mapping[i++] = 0; |
| if (ret) |
| break; |
| free(unicode_data[unichar].utf32nfdi); |
| um = malloc(i * sizeof(unsigned int)); |
| memcpy(um, mapping, i * sizeof(unsigned int)); |
| unicode_data[unichar].utf32nfdi = um; |
| } |
| /* Add this decomposition to nfdicf if there is no entry. */ |
| if (!unicode_data[unichar].utf32nfdicf) { |
| um = malloc(i * sizeof(unsigned int)); |
| memcpy(um, mapping, i * sizeof(unsigned int)); |
| unicode_data[unichar].utf32nfdicf = um; |
| } |
| if (verbose > 1) |
| print_utf32nfdi(unichar); |
| count++; |
| } |
| if (verbose > 0) |
| printf("Processed %d entries\n", count); |
| } |
| |
| static void nfdicf_decompose(void) |
| { |
| unsigned int unichar; |
| unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */ |
| unsigned int *um; |
| unsigned int *dc; |
| int count; |
| int i; |
| int j; |
| int ret; |
| |
| if (verbose > 0) |
| printf("Decomposing nfdicf\n"); |
| count = 0; |
| for (unichar = 0; unichar != 0x110000; unichar++) { |
| if (!unicode_data[unichar].utf32nfdicf) |
| continue; |
| for (;;) { |
| ret = 1; |
| i = 0; |
| um = unicode_data[unichar].utf32nfdicf; |
| while (*um) { |
| dc = unicode_data[*um].utf32nfdicf; |
| if (dc) { |
| for (j = 0; dc[j]; j++) |
| mapping[i++] = dc[j]; |
| ret = 0; |
| } else { |
| mapping[i++] = *um; |
| } |
| um++; |
| } |
| mapping[i++] = 0; |
| if (ret) |
| break; |
| free(unicode_data[unichar].utf32nfdicf); |
| um = malloc(i * sizeof(unsigned int)); |
| memcpy(um, mapping, i * sizeof(unsigned int)); |
| unicode_data[unichar].utf32nfdicf = um; |
| } |
| if (verbose > 1) |
| print_utf32nfdicf(unichar); |
| count++; |
| } |
| if (verbose > 0) |
| printf("Processed %d entries\n", count); |
| } |
| |
| /* ------------------------------------------------------------------ */ |
| |
| int utf8agemax(struct tree *, const char *); |
| int utf8nagemax(struct tree *, const char *, size_t); |
| int utf8agemin(struct tree *, const char *); |
| int utf8nagemin(struct tree *, const char *, size_t); |
| ssize_t utf8len(struct tree *, const char *); |
| ssize_t utf8nlen(struct tree *, const char *, size_t); |
| struct utf8cursor; |
| int utf8cursor(struct utf8cursor *, struct tree *, const char *); |
| int utf8ncursor(struct utf8cursor *, struct tree *, const char *, size_t); |
| int utf8byte(struct utf8cursor *); |
| |
| /* |
| * Hangul decomposition (algorithm from Section 3.12 of Unicode 6.3.0) |
| * |
| * AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;; |
| * D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;; |
| * |
| * SBase = 0xAC00 |
| * LBase = 0x1100 |
| * VBase = 0x1161 |
| * TBase = 0x11A7 |
| * LCount = 19 |
| * VCount = 21 |
| * TCount = 28 |
| * NCount = 588 (VCount * TCount) |
| * SCount = 11172 (LCount * NCount) |
| * |
| * Decomposition: |
| * SIndex = s - SBase |
| * |
| * LV (Canonical/Full) |
| * LIndex = SIndex / NCount |
| * VIndex = (Sindex % NCount) / TCount |
| * LPart = LBase + LIndex |
| * VPart = VBase + VIndex |
| * |
| * LVT (Canonical) |
| * LVIndex = (SIndex / TCount) * TCount |
| * TIndex = (Sindex % TCount) |
| * LVPart = SBase + LVIndex |
| * TPart = TBase + TIndex |
| * |
| * LVT (Full) |
| * LIndex = SIndex / NCount |
| * VIndex = (Sindex % NCount) / TCount |
| * TIndex = (Sindex % TCount) |
| * LPart = LBase + LIndex |
| * VPart = VBase + VIndex |
| * if (TIndex == 0) { |
| * d = <LPart, VPart> |
| * } else { |
| * TPart = TBase + TIndex |
| * d = <LPart, VPart, TPart> |
| * } |
| */ |
| |
| /* Constants */ |
| #define SB (0xAC00) |
| #define LB (0x1100) |
| #define VB (0x1161) |
| #define TB (0x11A7) |
| #define LC (19) |
| #define VC (21) |
| #define TC (28) |
| #define NC (VC * TC) |
| #define SC (LC * NC) |
| |
| /* Algorithmic decomposition of hangul syllable. */ |
| static utf8leaf_t *utf8hangul(const char *str, unsigned char *hangul) |
| { |
| unsigned int si; |
| unsigned int li; |
| unsigned int vi; |
| unsigned int ti; |
| unsigned char *h; |
| |
| /* Calculate the SI, LI, VI, and TI values. */ |
| si = utf8decode(str) - SB; |
| li = si / NC; |
| vi = (si % NC) / TC; |
| ti = si % TC; |
| |
| /* Fill in base of leaf. */ |
| h = hangul; |
| LEAF_GEN(h) = 2; |
| LEAF_CCC(h) = DECOMPOSE; |
| h += 2; |
| |
| /* Add LPart, a 3-byte UTF-8 sequence. */ |
| h += utf8encode((char *)h, li + LB); |
| |
| /* Add VPart, a 3-byte UTF-8 sequence. */ |
| h += utf8encode((char *)h, vi + VB); |
| |
| /* Add TPart if required, also a 3-byte UTF-8 sequence. */ |
| if (ti) |
| h += utf8encode((char *)h, ti + TB); |
| |
| /* Terminate string. */ |
| h[0] = '\0'; |
| |
| return hangul; |
| } |
| |
| /* |
| * Use trie to scan s, touching at most len bytes. |
| * Returns the leaf if one exists, NULL otherwise. |
| * |
| * A non-NULL return guarantees that the UTF-8 sequence starting at s |
| * is well-formed and corresponds to a known unicode code point. The |
| * shorthand for this will be "is valid UTF-8 unicode". |
| */ |
| static utf8leaf_t *utf8nlookup(struct tree *tree, unsigned char *hangul, |
| const char *s, size_t len) |
| { |
| utf8trie_t *trie; |
| int offlen; |
| int offset; |
| int mask; |
| int node; |
| |
| if (!tree) |
| return NULL; |
| if (len == 0) |
| return NULL; |
| node = 1; |
| trie = utf8data + tree->index; |
| while (node) { |
| offlen = (*trie & OFFLEN) >> OFFLEN_SHIFT; |
| if (*trie & NEXTBYTE) { |
| if (--len == 0) |
| return NULL; |
| s++; |
| } |
| mask = 1 << (*trie & BITNUM); |
| if (*s & mask) { |
| /* Right leg */ |
| if (offlen) { |
| /* Right node at offset of trie */ |
| node = (*trie & RIGHTNODE); |
| offset = trie[offlen]; |
| while (--offlen) { |
| offset <<= 8; |
| offset |= trie[offlen]; |
| } |
| trie += offset; |
| } else if (*trie & RIGHTPATH) { |
| /* Right node after this node */ |
| node = (*trie & TRIENODE); |
| trie++; |
| } else { |
| /* No right node. */ |
| return NULL; |
| } |
| } else { |
| /* Left leg */ |
| if (offlen) { |
| /* Left node after this node. */ |
| node = (*trie & LEFTNODE); |
| trie += offlen + 1; |
| } else if (*trie & RIGHTPATH) { |
| /* No left node. */ |
| return NULL; |
| } else { |
| /* Left node after this node */ |
| node = (*trie & TRIENODE); |
| trie++; |
| } |
| } |
| } |
| /* |
| * Hangul decomposition is done algorithmically. These are the |
| * codepoints >= 0xAC00 and <= 0xD7A3. Their UTF-8 encoding is |
| * always 3 bytes long, so s has been advanced twice, and the |
| * start of the sequence is at s-2. |
| */ |
| if (LEAF_CCC(trie) == DECOMPOSE && LEAF_STR(trie)[0] == HANGUL) |
| trie = utf8hangul(s - 2, hangul); |
| return trie; |
| } |
| |
| /* |
| * Use trie to scan s. |
| * Returns the leaf if one exists, NULL otherwise. |
| * |
| * Forwards to trie_nlookup(). |
| */ |
| static utf8leaf_t *utf8lookup(struct tree *tree, unsigned char *hangul, |
| const char *s) |
| { |
| return utf8nlookup(tree, hangul, s, (size_t)-1); |
| } |
| |
| /* |
| * Return the number of bytes used by the current UTF-8 sequence. |
| * Assumes the input points to the first byte of a valid UTF-8 |
| * sequence. |
| */ |
| static inline int utf8clen(const char *s) |
| { |
| unsigned char c = *s; |
| return 1 + (c >= 0xC0) + (c >= 0xE0) + (c >= 0xF0); |
| } |
| |
| /* |
| * Maximum age of any character in s. |
| * Return -1 if s is not valid UTF-8 unicode. |
| * Return 0 if only non-assigned code points are used. |
| */ |
| int utf8agemax(struct tree *tree, const char *s) |
| { |
| utf8leaf_t *leaf; |
| int age = 0; |
| int leaf_age; |
| unsigned char hangul[UTF8HANGULLEAF]; |
| |
| if (!tree) |
| return -1; |
| |
| while (*s) { |
| leaf = utf8lookup(tree, hangul, s); |
| if (!leaf) |
| return -1; |
| leaf_age = ages[LEAF_GEN(leaf)]; |
| if (leaf_age <= tree->maxage && leaf_age > age) |
| age = leaf_age; |
| s += utf8clen(s); |
| } |
| return age; |
| } |
| |
| /* |
| * Minimum age of any character in s. |
| * Return -1 if s is not valid UTF-8 unicode. |
| * Return 0 if non-assigned code points are used. |
| */ |
| int utf8agemin(struct tree *tree, const char *s) |
| { |
| utf8leaf_t *leaf; |
| int age; |
| int leaf_age; |
| unsigned char hangul[UTF8HANGULLEAF]; |
| |
| if (!tree) |
| return -1; |
| age = tree->maxage; |
| while (*s) { |
| leaf = utf8lookup(tree, hangul, s); |
| if (!leaf) |
| return -1; |
| leaf_age = ages[LEAF_GEN(leaf)]; |
| if (leaf_age <= tree->maxage && leaf_age < age) |
| age = leaf_age; |
| s += utf8clen(s); |
| } |
| return age; |
| } |
| |
| /* |
| * Maximum age of any character in s, touch at most len bytes. |
| * Return -1 if s is not valid UTF-8 unicode. |
| */ |
| int utf8nagemax(struct tree *tree, const char *s, size_t len) |
| { |
| utf8leaf_t *leaf; |
| int age = 0; |
| int leaf_age; |
| unsigned char hangul[UTF8HANGULLEAF]; |
| |
| if (!tree) |
| return -1; |
| |
| while (len && *s) { |
| leaf = utf8nlookup(tree, hangul, s, len); |
| if (!leaf) |
| return -1; |
| leaf_age = ages[LEAF_GEN(leaf)]; |
| if (leaf_age <= tree->maxage && leaf_age > age) |
| age = leaf_age; |
| len -= utf8clen(s); |
| s += utf8clen(s); |
| } |
| return age; |
| } |
| |
| /* |
| * Maximum age of any character in s, touch at most len bytes. |
| * Return -1 if s is not valid UTF-8 unicode. |
| */ |
| int utf8nagemin(struct tree *tree, const char *s, size_t len) |
| { |
| utf8leaf_t *leaf; |
| int leaf_age; |
| int age; |
| unsigned char hangul[UTF8HANGULLEAF]; |
| |
| if (!tree) |
| return -1; |
| age = tree->maxage; |
| while (len && *s) { |
| leaf = utf8nlookup(tree, hangul, s, len); |
| if (!leaf) |
| return -1; |
| leaf_age = ages[LEAF_GEN(leaf)]; |
| if (leaf_age <= tree->maxage && leaf_age < age) |
| age = leaf_age; |
| len -= utf8clen(s); |
| s += utf8clen(s); |
| } |
| return age; |
| } |
| |
| /* |
| * Length of the normalization of s. |
| * Return -1 if s is not valid UTF-8 unicode. |
| * |
| * A string of Default_Ignorable_Code_Point has length 0. |
| */ |
| ssize_t utf8len(struct tree *tree, const char *s) |
| { |
| utf8leaf_t *leaf; |
| size_t ret = 0; |
| unsigned char hangul[UTF8HANGULLEAF]; |
| |
| if (!tree) |
| return -1; |
| while (*s) { |
| leaf = utf8lookup(tree, hangul, s); |
| if (!leaf) |
| return -1; |
| if (ages[LEAF_GEN(leaf)] > tree->maxage) |
| ret += utf8clen(s); |
| else if (LEAF_CCC(leaf) == DECOMPOSE) |
| ret += strlen(LEAF_STR(leaf)); |
| else |
| ret += utf8clen(s); |
| s += utf8clen(s); |
| } |
| return ret; |
| } |
| |
| /* |
| * Length of the normalization of s, touch at most len bytes. |
| * Return -1 if s is not valid UTF-8 unicode. |
| */ |
| ssize_t utf8nlen(struct tree *tree, const char *s, size_t len) |
| { |
| utf8leaf_t *leaf; |
| size_t ret = 0; |
| unsigned char hangul[UTF8HANGULLEAF]; |
| |
| if (!tree) |
| return -1; |
| while (len && *s) { |
| leaf = utf8nlookup(tree, hangul, s, len); |
| if (!leaf) |
| return -1; |
| if (ages[LEAF_GEN(leaf)] > tree->maxage) |
| ret += utf8clen(s); |
| else if (LEAF_CCC(leaf) == DECOMPOSE) |
| ret += strlen(LEAF_STR(leaf)); |
| else |
| ret += utf8clen(s); |
| len -= utf8clen(s); |
| s += utf8clen(s); |
| } |
| return ret; |
| } |
| |
| /* |
| * Cursor structure used by the normalizer. |
| */ |
| struct utf8cursor { |
| struct tree *tree; |
| const char *s; |
| const char *p; |
| const char *ss; |
| const char *sp; |
| unsigned int len; |
| unsigned int slen; |
|